Diffstat (limited to 'src/cpu/operators')
-rw-r--r--  src/cpu/operators/CpuActivation.cpp  88
-rw-r--r--  src/cpu/operators/CpuActivation.h  59
-rw-r--r--  src/cpu/operators/CpuAdd.cpp  65
-rw-r--r--  src/cpu/operators/CpuAdd.h  81
-rw-r--r--  src/cpu/operators/CpuAddMulAdd.cpp  150
-rw-r--r--  src/cpu/operators/CpuAddMulAdd.h  92
-rw-r--r--  src/cpu/operators/CpuCast.cpp  46
-rw-r--r--  src/cpu/operators/CpuCast.h  74
-rw-r--r--  src/cpu/operators/CpuConcatenate.cpp  170
-rw-r--r--  src/cpu/operators/CpuConcatenate.h  76
-rw-r--r--  src/cpu/operators/CpuConv2d.cpp  309
-rw-r--r--  src/cpu/operators/CpuConv2d.h  169
-rw-r--r--  src/cpu/operators/CpuConvertFullyConnectedWeights.cpp  59
-rw-r--r--  src/cpu/operators/CpuConvertFullyConnectedWeights.h  61
-rw-r--r--  src/cpu/operators/CpuCopy.cpp  46
-rw-r--r--  src/cpu/operators/CpuCopy.h  53
-rw-r--r--  src/cpu/operators/CpuDepthwiseConv2d.cpp  568
-rw-r--r--  src/cpu/operators/CpuDepthwiseConv2d.h  238
-rw-r--r--  src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp  149
-rw-r--r--  src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.h  92
-rw-r--r--  src/cpu/operators/CpuDequantize.cpp  57
-rw-r--r--  src/cpu/operators/CpuDequantize.h  56
-rw-r--r--  src/cpu/operators/CpuDirectConv2d.cpp  170
-rw-r--r--  src/cpu/operators/CpuDirectConv2d.h  115
-rw-r--r--  src/cpu/operators/CpuDirectConv3d.cpp  112
-rw-r--r--  src/cpu/operators/CpuDirectConv3d.h  107
-rw-r--r--  src/cpu/operators/CpuElementwise.cpp  138
-rw-r--r--  src/cpu/operators/CpuElementwise.h  186
-rw-r--r--  src/cpu/operators/CpuElementwiseUnary.cpp  61
-rw-r--r--  src/cpu/operators/CpuElementwiseUnary.h  60
-rw-r--r--  src/cpu/operators/CpuFill.cpp  41
-rw-r--r--  src/cpu/operators/CpuFill.h  48
-rw-r--r--  src/cpu/operators/CpuFlatten.cpp  56
-rw-r--r--  src/cpu/operators/CpuFlatten.h  75
-rw-r--r--  src/cpu/operators/CpuFloor.cpp  46
-rw-r--r--  src/cpu/operators/CpuFloor.h  53
-rw-r--r--  src/cpu/operators/CpuFullyConnected.cpp  590
-rw-r--r--  src/cpu/operators/CpuFullyConnected.h  194
-rw-r--r--  src/cpu/operators/CpuGemm.cpp  567
-rw-r--r--  src/cpu/operators/CpuGemm.h  185
-rw-r--r--  src/cpu/operators/CpuGemmConv2d.cpp  992
-rw-r--r--  src/cpu/operators/CpuGemmConv2d.h  300
-rw-r--r--  src/cpu/operators/CpuGemmDirectConv2d.cpp  256
-rw-r--r--  src/cpu/operators/CpuGemmDirectConv2d.h  117
-rw-r--r--  src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp  779
-rw-r--r--  src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h  184
-rw-r--r--  src/cpu/operators/CpuGemmLowpOutputStage.cpp  165
-rw-r--r--  src/cpu/operators/CpuGemmLowpOutputStage.h  90
-rw-r--r--  src/cpu/operators/CpuMatMul.cpp  331
-rw-r--r--  src/cpu/operators/CpuMatMul.h  126
-rw-r--r--  src/cpu/operators/CpuMaxUnpooling.cpp  52
-rw-r--r--  src/cpu/operators/CpuMaxUnpooling.h  62
-rw-r--r--  src/cpu/operators/CpuMul.cpp  100
-rw-r--r--  src/cpu/operators/CpuMul.h  123
-rw-r--r--  src/cpu/operators/CpuPRelu.h  38
-rw-r--r--  src/cpu/operators/CpuPermute.cpp  46
-rw-r--r--  src/cpu/operators/CpuPermute.h  56
-rw-r--r--  src/cpu/operators/CpuPool2d.cpp  145
-rw-r--r--  src/cpu/operators/CpuPool2d.h  90
-rw-r--r--  src/cpu/operators/CpuPool3d.cpp  73
-rw-r--r--  src/cpu/operators/CpuPool3d.h  73
-rw-r--r--  src/cpu/operators/CpuQuantize.cpp  62
-rw-r--r--  src/cpu/operators/CpuQuantize.h  56
-rw-r--r--  src/cpu/operators/CpuReshape.cpp  60
-rw-r--r--  src/cpu/operators/CpuReshape.h  61
-rw-r--r--  src/cpu/operators/CpuScale.cpp  289
-rw-r--r--  src/cpu/operators/CpuScale.h  72
-rw-r--r--  src/cpu/operators/CpuSoftmax.cpp  143
-rw-r--r--  src/cpu/operators/CpuSoftmax.h  103
-rw-r--r--  src/cpu/operators/CpuSub.cpp  65
-rw-r--r--  src/cpu/operators/CpuSub.h  79
-rw-r--r--  src/cpu/operators/CpuTranspose.cpp  46
-rw-r--r--  src/cpu/operators/CpuTranspose.h  53
-rw-r--r--  src/cpu/operators/CpuWinogradConv2d.cpp  478
-rw-r--r--  src/cpu/operators/CpuWinogradConv2d.h  142
-rw-r--r--  src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp  1140
-rw-r--r--  src/cpu/operators/internal/CpuGemmAssemblyDispatch.h  198
77 files changed, 12807 insertions, 0 deletions
diff --git a/src/cpu/operators/CpuActivation.cpp b/src/cpu/operators/CpuActivation.cpp
new file mode 100644
index 0000000000..44d70cf503
--- /dev/null
+++ b/src/cpu/operators/CpuActivation.cpp
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2021-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/operators/CpuActivation.h"
+
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+#include "src/common/IOperator.h"
+#include "src/common/utils/LegacySupport.h"
+#include "src/common/utils/Log.h"
+#include "src/cpu/CpuContext.h"
+#include "src/cpu/kernels/CpuActivationKernel.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void CpuActivation::configure(const ITensorInfo *input, ITensorInfo *output, const ActivationLayerInfo &activation_info)
+{
+ ARM_COMPUTE_LOG_PARAMS(input, output, activation_info);
+ auto k = std::make_unique<kernels::CpuActivationKernel>();
+ k->configure(input, output, activation_info);
+ _kernel = std::move(k);
+}
+
+Status
+CpuActivation::validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &activation_info)
+{
+ return kernels::CpuActivationKernel::validate(input, output, activation_info);
+}
+
+void CpuActivation::run(ITensorPack &tensors)
+{
+ ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided");
+ auto split_dimension = static_cast<kernels::CpuActivationKernel *>(_kernel.get())->get_split_dimension_hint();
+ NEScheduler::get().schedule_op(_kernel.get(), split_dimension, _kernel->window(), tensors);
+}
+
+std::tuple<IOperator *, StatusCode> CpuContext::create_activation(const AclTensorDescriptor &src,
+ const AclTensorDescriptor &dst,
+ const AclActivationDescriptor &act,
+ bool is_validate)
+{
+ TensorInfo src_info = detail::convert_to_legacy_tensor_info(src);
+ TensorInfo dst_info = detail::convert_to_legacy_tensor_info(dst);
+ auto info = detail::convert_to_activation_info(act);
+
+ if (is_validate &&
+ !bool(CpuActivation::validate(&src_info.set_is_resizable(false), &dst_info.set_is_resizable(false), info)))
+ {
+ return std::make_tuple(nullptr, StatusCode::UnsupportedConfig);
+ }
+
+ auto act_op = std::make_unique<cpu::CpuActivation>();
+ act_op->configure(&src_info, &dst_info, info);
+
+ auto op = new arm_compute::IOperator(static_cast<IContext *>(this));
+ if (op == nullptr)
+ {
+ ARM_COMPUTE_LOG_ERROR_ACL("Couldn't allocate internal resources");
+ return std::make_tuple(nullptr, StatusCode::OutOfMemory);
+ }
+ op->set_internal_operator(std::move(act_op));
+
+ return std::make_tuple(op, StatusCode::Success);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/operators/CpuActivation.h b/src/cpu/operators/CpuActivation.h
new file mode 100644
index 0000000000..ec442f92c8
--- /dev/null
+++ b/src/cpu/operators/CpuActivation.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_ACTIVATION_H
+#define ARM_COMPUTE_CPU_ACTIVATION_H
+
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+
+#include "src/cpu/ICpuOperator.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+/** Basic function to run @ref kernels::CpuActivationKernel */
+class CpuActivation : public ICpuOperator
+{
+public:
+ /** Configure operator for a given list of arguments
+ *
+ * @param[in] input Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM16/F16/F32.
+     * @param[out] output          Destination tensor info. Data type supported: same as @p input
+ * @param[in] activation_info Activation layer parameters.
+ */
+ void configure(const ITensorInfo *input, ITensorInfo *output, const ActivationLayerInfo &activation_info);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref CpuActivation::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info);
+
+ // Inherited methods overridden:
+ void run(ITensorPack &tensors) override;
+};
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_ACTIVATION_H */
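
For orientation, here is a minimal usage sketch of the operator pair above, written in the in-tree style of the library's own tests. The internal src/cpu include path, the tensor shape and the RELU choice are illustrative assumptions, not part of this patch:

#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/function_info/ActivationLayerInfo.h"
#include "arm_compute/runtime/Tensor.h"

#include "src/cpu/operators/CpuActivation.h"

using namespace arm_compute;

void relu_sketch()
{
    // Operators are configured on tensor metadata only; data is bound at run time.
    TensorInfo src_info(TensorShape(16U), 1, DataType::F32);
    TensorInfo dst_info(TensorShape(16U), 1, DataType::F32);
    const ActivationLayerInfo relu(ActivationLayerInfo::ActivationFunction::RELU);

    if (!bool(cpu::CpuActivation::validate(&src_info, &dst_info, relu)))
    {
        return; // unsupported configuration on this target
    }

    cpu::CpuActivation act;
    act.configure(&src_info, &dst_info, relu);

    // The caller owns the backing memory.
    Tensor src, dst;
    src.allocator()->init(src_info);
    dst.allocator()->init(dst_info);
    src.allocator()->allocate();
    dst.allocator()->allocate();

    ITensorPack pack;
    pack.add_const_tensor(TensorType::ACL_SRC, &src);
    pack.add_tensor(TensorType::ACL_DST, &dst);
    act.run(pack);
}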
diff --git a/src/cpu/operators/CpuAdd.cpp b/src/cpu/operators/CpuAdd.cpp
new file mode 100644
index 0000000000..53cd7fa1b7
--- /dev/null
+++ b/src/cpu/operators/CpuAdd.cpp
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2021-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/operators/CpuAdd.h"
+
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+#include "src/common/utils/Log.h"
+#include "src/cpu/kernels/CpuAddKernel.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void CpuAdd::configure(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ ITensorInfo *dst,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info)
+{
+ ARM_COMPUTE_UNUSED(act_info);
+ ARM_COMPUTE_LOG_PARAMS(src0, src1, dst, policy, act_info);
+ auto k = std::make_unique<kernels::CpuAddKernel>();
+ k->configure(src0, src1, dst, policy);
+ _kernel = std::move(k);
+}
+
+Status CpuAdd::validate(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *dst,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled());
+ return kernels::CpuAddKernel::validate(src0, src1, dst, policy);
+}
+
+void CpuAdd::run(ITensorPack &tensors)
+{
+ const auto split_dimension = static_cast<kernels::CpuAddKernel *>(_kernel.get())->get_split_dimension();
+
+ NEScheduler::get().schedule_op(_kernel.get(), split_dimension, _kernel->window(), tensors);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/operators/CpuAdd.h b/src/cpu/operators/CpuAdd.h
new file mode 100644
index 0000000000..5f60102de2
--- /dev/null
+++ b/src/cpu/operators/CpuAdd.h
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_ADD_H
+#define ARM_COMPUTE_CPU_ADD_H
+
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+
+#include "src/cpu/ICpuOperator.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+/** Basic function to run @ref kernels::CpuAddKernel */
+class CpuAdd : public ICpuOperator
+{
+public:
+    /** Initialise the kernel's inputs and dst.
+ *
+ * Valid configurations (src0,src1) -> dst :
+ *
+ * - (U8,U8) -> U8
+ * - (S16,S16) -> S16
+ * - (S32,S32) -> S32
+ * - (F16,F16) -> F16
+ * - (F32,F32) -> F32
+ * - (QASYMM8,QASYMM8) -> QASYMM8
+ * - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED
+ * - (QSYMM16,QSYMM16) -> QSYMM16
+ *
+ * @param[in] src0 First input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32
+ * @param[in] src1 Second input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32
+ * @param[out] dst The dst tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32.
+ * @param[in] policy Overflow policy.
+ * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported.
+ *
+ */
+ void configure(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ ITensorInfo *dst,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref CpuAdd::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *dst,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+ // Inherited methods overridden:
+ void run(ITensorPack &tensors) override;
+};
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_ADD_H */
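
As with CpuActivation above, a short sketch of how the two sources and the destination of CpuAdd are bound through slot ids at run time; the helper name, data type and the SATURATE policy are illustrative assumptions:

#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/Tensor.h"

#include "src/cpu/operators/CpuAdd.h"

using namespace arm_compute;

void add_sketch(Tensor &a, Tensor &b, Tensor &out)
{
    // a, b and out are assumed to be already initialised/allocated with matching F32 shapes.
    cpu::CpuAdd add;
    add.configure(a.info(), b.info(), out.info(), ConvertPolicy::SATURATE);

    ITensorPack pack;
    pack.add_const_tensor(TensorType::ACL_SRC_0, &a);
    pack.add_const_tensor(TensorType::ACL_SRC_1, &b);
    pack.add_tensor(TensorType::ACL_DST, &out);
    add.run(pack); // work is split across threads along the kernel's preferred dimension
}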
diff --git a/src/cpu/operators/CpuAddMulAdd.cpp b/src/cpu/operators/CpuAddMulAdd.cpp
new file mode 100644
index 0000000000..2f19f2f842
--- /dev/null
+++ b/src/cpu/operators/CpuAddMulAdd.cpp
@@ -0,0 +1,150 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/operators/CpuAddMulAdd.h"
+
+#include "arm_compute/core/experimental/Types.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+#include "src/common/utils/Log.h"
+#include "src/core/helpers/MemoryHelpers.h"
+#include "src/cpu/kernels/CpuAddMulAddKernel.h"
+#include "src/cpu/utils/CpuAuxTensorHandler.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void CpuAddMulAdd::configure(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *bn_mul,
+ const ITensorInfo *bn_add,
+ ITensorInfo *add_output,
+ ITensorInfo *final_output,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info)
+{
+ ARM_COMPUTE_LOG_PARAMS(input1, input2, bn_mul, bn_add, add_output, final_output, policy, act_info);
+
+ auto k = std::make_unique<kernels::CpuAddMulAddKernel>();
+
+ const DataType data_type = input1->data_type();
+ if (is_data_type_quantized(data_type))
+ {
+ _dequantize_bn_mul.configure(bn_mul, &_dequantized_bn_mul);
+ _dequantize_bn_add.configure(bn_add, &_dequantized_bn_add);
+
+ k->configure(input1, input2, &_dequantized_bn_mul, &_dequantized_bn_add, add_output, final_output, policy,
+ act_info);
+
+        // Save auxiliary memory requirements after configuration
+ _aux_mem[DequantizedBnMul] =
+ experimental::MemoryInfo(offset_int_vec(DequantizedBnMul), experimental::MemoryLifetime::Temporary,
+ _dequantized_bn_mul.total_size());
+ _aux_mem[DequantizedBnAdd] =
+ experimental::MemoryInfo(offset_int_vec(DequantizedBnAdd), experimental::MemoryLifetime::Temporary,
+ _dequantized_bn_add.total_size());
+ }
+ else
+ {
+ k->configure(input1, input2, bn_mul, bn_add, add_output, final_output, policy, act_info);
+ }
+
+ _kernel = std::move(k);
+}
+
+Status CpuAddMulAdd::validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *bn_mul,
+ const ITensorInfo *bn_add,
+ const ITensorInfo *add_output,
+ const ITensorInfo *final_output,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info)
+{
+ const DataType data_type = input1->data_type();
+ if (is_data_type_quantized(data_type))
+ {
+ TensorInfo dequantized_bn_mul = bn_mul->clone()->set_data_type(DataType::F32);
+ TensorInfo dequantized_bn_add = bn_add->clone()->set_data_type(DataType::F32);
+
+ ARM_COMPUTE_RETURN_ON_ERROR(CpuDequantize::validate(bn_mul, &dequantized_bn_mul));
+ ARM_COMPUTE_RETURN_ON_ERROR(CpuDequantize::validate(bn_add, &dequantized_bn_add));
+
+ return kernels::CpuAddMulAddKernel::validate(input1, input2, &dequantized_bn_mul, &dequantized_bn_add,
+ add_output, final_output, policy, act_info);
+ }
+ else
+ {
+ return kernels::CpuAddMulAddKernel::validate(input1, input2, bn_mul, bn_add, add_output, final_output, policy,
+ act_info);
+ }
+}
+
+void CpuAddMulAdd::run(ITensorPack &tensors)
+{
+ const DataType data_type = tensors.get_const_tensor(TensorType::ACL_SRC_0)->info()->data_type();
+
+ if (is_data_type_quantized(data_type))
+ {
+ const ITensor *bn_mul = tensors.get_const_tensor(TensorType::ACL_SRC_2);
+ const ITensor *bn_add = tensors.get_const_tensor(TensorType::ACL_SRC_3);
+
+ CpuAuxTensorHandler dequantized_bn_mul_handler(offset_int_vec(DequantizedBnMul), _dequantized_bn_mul, tensors,
+ true);
+ CpuAuxTensorHandler dequantized_bn_add_handler(offset_int_vec(DequantizedBnAdd), _dequantized_bn_add, tensors,
+ true);
+
+ ITensorPack dequantize_mul_pack = {{TensorType::ACL_SRC_0, bn_mul},
+ {TensorType::ACL_DST_0, dequantized_bn_mul_handler.get()}};
+
+ ITensorPack dequantize_add_pack = {{TensorType::ACL_SRC_0, bn_add},
+ {TensorType::ACL_DST_0, dequantized_bn_add_handler.get()}};
+
+ _dequantize_bn_mul.run(dequantize_mul_pack);
+ _dequantize_bn_add.run(dequantize_add_pack);
+
+ ITensorPack add_mul_add_pack = {
+ {TensorType::ACL_SRC_0, tensors.get_const_tensor(TensorType::ACL_SRC_0)},
+ {TensorType::ACL_SRC_1, tensors.get_const_tensor(TensorType::ACL_SRC_1)},
+ {TensorType::ACL_SRC_2, dequantized_bn_mul_handler.get()},
+ {TensorType::ACL_SRC_3, dequantized_bn_add_handler.get()},
+ {TensorType::ACL_DST_0, tensors.get_tensor(TensorType::ACL_DST_0)},
+ {TensorType::ACL_DST_1, tensors.get_tensor(TensorType::ACL_DST_1)},
+ };
+
+ NEScheduler::get().schedule_op(_kernel.get(), Window::DimY, _kernel->window(), add_mul_add_pack);
+ }
+ else
+ {
+ NEScheduler::get().schedule_op(_kernel.get(), Window::DimY, _kernel->window(), tensors);
+ }
+}
+
+experimental::MemoryRequirements CpuAddMulAdd::workspace() const
+{
+ return _aux_mem;
+}
+
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/operators/CpuAddMulAdd.h b/src/cpu/operators/CpuAddMulAdd.h
new file mode 100644
index 0000000000..47db75c37e
--- /dev/null
+++ b/src/cpu/operators/CpuAddMulAdd.h
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_CPU_OPERATORS_CPUADDMULADD
+#define SRC_CPU_OPERATORS_CPUADDMULADD
+
+#include "arm_compute/core/TensorInfo.h"
+
+#include "src/cpu/ICpuOperator.h"
+#include "src/cpu/operators/CpuDequantize.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+/** Basic function to run @ref kernels::CpuAddMulAddKernel */
+class CpuAddMulAdd : public ICpuOperator
+{
+public:
+ /** Initialize the operator's inputs and outputs.
+ *
+ * Similar to @ref NEAddMulAdd::configure()
+ *
+ */
+ void configure(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *bn_mul,
+ const ITensorInfo *bn_add,
+ ITensorInfo *add_output,
+ ITensorInfo *final_output,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref CpuAddMulAdd::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *bn_mul,
+ const ITensorInfo *bn_add,
+ const ITensorInfo *add_output,
+ const ITensorInfo *final_output,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info);
+
+ // Inherited methods overridden:
+ void run(ITensorPack &tensors) override;
+
+    // We need auxiliary memory to dequantize batchnorm coefficients
+ experimental::MemoryRequirements workspace() const override;
+
+private:
+ enum AuxTensorIdx
+ {
+ DequantizedBnMul = 0,
+ DequantizedBnAdd,
+ Count
+ };
+
+ CpuDequantize _dequantize_bn_mul{};
+ CpuDequantize _dequantize_bn_add{};
+
+ TensorInfo _dequantized_bn_mul{};
+ TensorInfo _dequantized_bn_add{};
+
+ experimental::MemoryRequirements _aux_mem{Count};
+};
+} // namespace cpu
+} // namespace arm_compute
+#endif /* SRC_CPU_OPERATORS_CPUADDMULADD */
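
Since the quantized path above dequantizes the two batch-norm coefficient tensors into temporary F32 buffers, a caller is expected to consult workspace() after configuration and bind scratch memory for the reported slots. A rough sketch of inspecting those requirements; the field names follow arm_compute/core/experimental/Types.h as I understand them and should be treated as an assumption rather than part of this patch:

#include "arm_compute/core/experimental/Types.h"

#include "src/cpu/operators/CpuAddMulAdd.h"

using namespace arm_compute;

void inspect_workspace(const cpu::CpuAddMulAdd &op)
{
    // One entry per auxiliary tensor (DequantizedBnMul, DequantizedBnAdd).
    // For non-quantized inputs nothing was registered, so the entries carry no sizes.
    for (const experimental::MemoryInfo &req : op.workspace())
    {
        if (req.size == 0)
        {
            continue; // slot not used by this configuration
        }
        // req.lifetime is Temporary here: the scratch buffer bound to req.slot only
        // has to stay valid for the duration of a single run() call.
    }
}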
diff --git a/src/cpu/operators/CpuCast.cpp b/src/cpu/operators/CpuCast.cpp
new file mode 100644
index 0000000000..55b9204d71
--- /dev/null
+++ b/src/cpu/operators/CpuCast.cpp
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/operators/CpuCast.h"
+
+#include "src/common/utils/Log.h"
+#include "src/cpu/kernels/CpuCastKernel.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void CpuCast::configure(const ITensorInfo *src, ITensorInfo *dst, ConvertPolicy policy)
+{
+ ARM_COMPUTE_LOG_PARAMS(src, dst, policy);
+ auto k = std::make_unique<kernels::CpuCastKernel>();
+ k->configure(src, dst, policy);
+ _kernel = std::move(k);
+}
+
+Status CpuCast::validate(const ITensorInfo *src, const ITensorInfo *dst, ConvertPolicy policy)
+{
+ return kernels::CpuCastKernel::validate(src, dst, policy);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/operators/CpuCast.h b/src/cpu/operators/CpuCast.h
new file mode 100644
index 0000000000..1f4da6e2a0
--- /dev/null
+++ b/src/cpu/operators/CpuCast.h
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_CPU_OPERATORS_CPUCAST_H
+#define ACL_SRC_CPU_OPERATORS_CPUCAST_H
+
+#include "src/cpu/ICpuOperator.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+/** Basic function to run @ref kernels::CpuCastKernel */
+class CpuCast : public ICpuOperator
+{
+public:
+ /** Configure operator for a given list of arguments
+ *
+     * The input data type must be different from the output data type.
+ *
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------------|:-----------------------------------------------|
+ * |QASYMM8_SIGNED | S16, S32, F32, F16 |
+ * |QASYMM8 | U16, S16, S32, F32, F16 |
+ * |U8 | U16, S16, S32, F32, F16 |
+ * |U16 | U8, U32 |
+ * |S16 | QASYMM8_SIGNED, U8, S32 |
+ * |F16 | QASYMM8_SIGNED, QASYMM8, F32, S32, U8 |
+ * |S32 | QASYMM8_SIGNED, QASYMM8, F16, F32, U8 |
+     * |F32            | QASYMM8_SIGNED, QASYMM8, F16, S32, U8           |
+ * |S64 | F32 |
+ *
+ * @param[in] src The source tensor to convert. Data types supported: U8/S8/U16/S16/U32/S32/S64/F16/F32.
+ * @param[out] dst The destination tensor. Data types supported: U8/S8/U16/S16/U32/S32/F16/F32.
+ * @param[in] policy Conversion policy.
+ *
+ *
+ */
+ void configure(const ITensorInfo *src, ITensorInfo *dst, ConvertPolicy policy);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref CpuCast::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *dst, ConvertPolicy policy);
+};
+} // namespace cpu
+} // namespace arm_compute
+#endif // ACL_SRC_CPU_OPERATORS_CPUCAST_H
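
A small validate-then-configure sketch for the cast operator above; the F32 to F16 conversion and the shape are illustrative assumptions:

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"

#include "src/cpu/operators/CpuCast.h"

using namespace arm_compute;

bool configure_f32_to_f16(cpu::CpuCast &cast)
{
    TensorInfo src_info(TensorShape(32U, 32U), 1, DataType::F32);
    TensorInfo dst_info(TensorShape(32U, 32U), 1, DataType::F16);

    // Check the configuration first; validate() mirrors configure()'s arguments.
    if (!bool(cpu::CpuCast::validate(&src_info, &dst_info, ConvertPolicy::SATURATE)))
    {
        return false;
    }
    cast.configure(&src_info, &dst_info, ConvertPolicy::SATURATE);
    return true;
}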
diff --git a/src/cpu/operators/CpuConcatenate.cpp b/src/cpu/operators/CpuConcatenate.cpp
new file mode 100644
index 0000000000..5f517a8fcb
--- /dev/null
+++ b/src/cpu/operators/CpuConcatenate.cpp
@@ -0,0 +1,170 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/operators/CpuConcatenate.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+#include "src/common/utils/Log.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/cpu/kernels/CpuConcatenateBatchKernel.h"
+#include "src/cpu/kernels/CpuConcatenateDepthKernel.h"
+#include "src/cpu/kernels/CpuConcatenateHeightKernel.h"
+#include "src/cpu/kernels/CpuConcatenateWidthKernel.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void CpuConcatenate::configure(const std::vector<const ITensorInfo *> &srcs_vector, ITensorInfo *dst, size_t axis)
+{
+ ARM_COMPUTE_ERROR_ON(dst == nullptr);
+ ARM_COMPUTE_LOG_PARAMS(srcs_vector, dst, axis);
+
+ _axis = axis;
+ _num_srcs = srcs_vector.size();
+
+ TensorShape dst_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(srcs_vector, axis);
+
+    // Auto-initialize the output if it is not yet initialized
+ auto_init_if_empty(*dst, dst_shape, 1, srcs_vector[0]->data_type());
+ ARM_COMPUTE_ERROR_THROW_ON(CpuConcatenate::validate(srcs_vector, dst, axis));
+
+ unsigned int offset = 0;
+
+ for (unsigned int i = 0; i < _num_srcs; ++i)
+ {
+ switch (axis)
+ {
+ case Window::DimX:
+ {
+ auto kernel = std::make_unique<kernels::CpuConcatenateWidthKernel>();
+ kernel->configure(srcs_vector.at(i), offset, dst);
+ _concat_kernels.emplace_back(std::move(kernel));
+ break;
+ }
+ case Window::DimY:
+ {
+ auto kernel = std::make_unique<kernels::CpuConcatenateHeightKernel>();
+ kernel->configure(srcs_vector.at(i), offset, dst);
+ _concat_kernels.emplace_back(std::move(kernel));
+ break;
+ }
+ case Window::DimZ:
+ {
+ auto kernel = std::make_unique<kernels::CpuConcatenateDepthKernel>();
+ kernel->configure(srcs_vector.at(i), offset, dst);
+ _concat_kernels.emplace_back(std::move(kernel));
+ break;
+ }
+ case 3:
+ {
+ auto kernel = std::make_unique<kernels::CpuConcatenateBatchKernel>();
+ kernel->configure(srcs_vector.at(i), offset, dst);
+ _concat_kernels.emplace_back(std::move(kernel));
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Axis not supported");
+ }
+ offset += srcs_vector.at(i)->dimension(axis);
+ }
+}
+
+Status
+CpuConcatenate::validate(const std::vector<const ITensorInfo *> &srcs_vector, const ITensorInfo *dst, size_t axis)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(dst);
+ ARM_COMPUTE_RETURN_ERROR_ON(srcs_vector.size() < 2);
+
+ unsigned int offset = 0;
+ for (const auto &src : srcs_vector)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src);
+ switch (axis)
+ {
+ case Window::DimX:
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuConcatenateWidthKernel::validate(src, offset, dst));
+ break;
+ }
+ case Window::DimY:
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuConcatenateHeightKernel::validate(src, offset, dst));
+ break;
+ }
+ case Window::DimZ:
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuConcatenateDepthKernel::validate(src, offset, dst));
+ break;
+ }
+ case 3:
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuConcatenateBatchKernel::validate(src, offset, dst));
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Axis not supported");
+ }
+ offset += src->dimension(axis);
+ }
+
+ if (dst->total_size() != 0)
+ {
+ TensorShape dst_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(srcs_vector, axis);
+ ARM_COMPUTE_RETURN_ERROR_ON(dst_shape.total_size() != dst->tensor_shape().total_size());
+ }
+
+ return Status{};
+}
+
+void CpuConcatenate::run(ITensorPack &tensors)
+{
+ if (tensors.empty())
+ {
+ ARM_COMPUTE_ERROR("No inputs provided");
+ }
+
+ if (static_cast<int>(tensors.size() - 1) != static_cast<int>(_num_srcs))
+ {
+ ARM_COMPUTE_ERROR("Configured with different number of inputs");
+ }
+
+ int i = 0;
+ for (auto &k : _concat_kernels)
+ {
+ ITensorPack pack;
+ pack.add_tensor(TensorType::ACL_SRC, tensors.get_const_tensor(ACL_SRC_VEC + i));
+ pack.add_tensor(TensorType::ACL_DST, tensors.get_tensor(ACL_DST));
+ NEScheduler::get().schedule_op(k.get(), Window::DimY, k->window(), pack);
+ ++i;
+ }
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/operators/CpuConcatenate.h b/src/cpu/operators/CpuConcatenate.h
new file mode 100644
index 0000000000..c36977c70f
--- /dev/null
+++ b/src/cpu/operators/CpuConcatenate.h
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2021-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_CONCATENATE_H
+#define ARM_COMPUTE_CPU_CONCATENATE_H
+
+#include "src/cpu/ICpuKernel.h"
+#include "src/cpu/ICpuOperator.h"
+
+#include <vector>
+
+namespace arm_compute
+{
+namespace cpu
+{
+/** Basic function to concatenate tensors along a given axis. This function calls the following kernels:
+ *
+ * -# @ref kernels::CpuConcatenateWidthKernel (if underlying concatenation axis is 0).
+ * -# @ref kernels::CpuConcatenateHeightKernel (if underlying concatenation axis is 1).
+ * -# @ref kernels::CpuConcatenateDepthKernel (if underlying concatenation axis is 2).
+ * -# @ref kernels::CpuConcatenateBatchKernel (if underlying concatenation axis is 3).
+ */
+class CpuConcatenate : public ICpuOperator
+{
+public:
+ CpuConcatenate() = default;
+ /** Configure operator for a given list of arguments
+ *
+     * @note Input and output tensor dimension preconditions differ depending on the concatenation axis.
+ * @note Preconditions can be found respectively at @ref kernels::CpuConcatenateWidthKernel, @ref kernels::CpuConcatenateHeightKernel,
+ * @ref kernels::CpuConcatenateDepthKernel and @ref kernels::CpuConcatenateBatchKernel.
+ *
+     * @param[in]     srcs_vector The vector containing all the tensors to concatenate. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+ * @param[out] dst Output tensor. Data types supported: Same as @p srcs_vector.
+ * @param[in] axis Concatenation axis. Supported underlying concatenation axis are 0, 1, 2 and 3.
+ */
+ void configure(const std::vector<const ITensorInfo *> &srcs_vector, ITensorInfo *dst, size_t axis);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref CpuConcatenate::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const std::vector<const ITensorInfo *> &srcs_vector, const ITensorInfo *dst, size_t axis);
+
+ // Inherited methods overridden:
+ void run(ITensorPack &tensors) override;
+
+private:
+ std::vector<std::unique_ptr<ICPPKernel>> _concat_kernels{};
+ unsigned int _num_srcs{0};
+ unsigned int _axis{0};
+};
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_CONCATENATE_H */
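
Tying the header above to the run() implementation earlier in this patch: the i-th source tensor is looked up at slot ACL_SRC_VEC + i, so a caller binds its inputs accordingly. A sketch under illustrative assumptions (two width-concatenated sources, caller-allocated tensors):

#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/Tensor.h"

#include "src/cpu/operators/CpuConcatenate.h"

#include <vector>

using namespace arm_compute;

void concat_width_sketch(Tensor &a, Tensor &b, Tensor &dst)
{
    // a, b and dst are assumed initialised/allocated; only the width (axis 0) differs.
    const std::vector<const ITensorInfo *> srcs = {a.info(), b.info()};

    cpu::CpuConcatenate concat;
    concat.configure(srcs, dst.info(), 0 /* axis: width */);

    ITensorPack pack;
    pack.add_const_tensor(TensorType::ACL_SRC_VEC, &a);     // ACL_SRC_VEC + 0
    pack.add_const_tensor(TensorType::ACL_SRC_VEC + 1, &b); // ACL_SRC_VEC + 1
    pack.add_tensor(TensorType::ACL_DST, &dst);
    concat.run(pack);
}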
diff --git a/src/cpu/operators/CpuConv2d.cpp b/src/cpu/operators/CpuConv2d.cpp
new file mode 100644
index 0000000000..26ca2ee783
--- /dev/null
+++ b/src/cpu/operators/CpuConv2d.cpp
@@ -0,0 +1,309 @@
+/*
+ * Copyright (c) 2017-2021, 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/operators/CpuConv2d.h"
+
+#include "arm_compute/runtime/NEON/functions/NEFFTConvolutionLayer.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+#include "src/common/utils/Log.h"
+#include "src/cpu/operators/CpuDirectConv2d.h"
+#include "src/cpu/operators/CpuGemm.h"
+#include "src/cpu/operators/CpuGemmConv2d.h"
+#include "src/cpu/operators/CpuGemmDirectConv2d.h"
+#include "src/cpu/operators/CpuWinogradConv2d.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+CpuConv2d::CpuConv2d() : _function()
+{
+}
+
+CpuConv2d::~CpuConv2d() = default;
+
+void CpuConv2d::configure(ITensorInfo *input,
+ ITensorInfo *weights,
+ const ITensorInfo *biases,
+ ITensorInfo *output,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info,
+ const Size2D &dilation,
+ const ActivationLayerInfo &act_info,
+ bool enable_fast_math,
+ unsigned int num_groups)
+{
+ // Perform validate step
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+ ARM_COMPUTE_UNUSED(num_groups);
+ ARM_COMPUTE_ERROR_THROW_ON(CpuConv2d::validate(input, weights, biases, output, conv_info, weights_info, dilation,
+ act_info, enable_fast_math, num_groups));
+
+ ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, weights_info, dilation, act_info,
+ enable_fast_math, num_groups);
+
+ const Conv2dInfo info(conv_info, dilation, act_info, enable_fast_math, num_groups);
+ switch (CpuConv2d::get_convolution_method(input, weights, output, conv_info, weights_info, dilation, act_info,
+ enable_fast_math))
+ {
+ case ConvolutionMethod::WINOGRAD:
+ {
+ auto f = std::make_unique<CpuWinogradConv2d>();
+ f->configure(input, weights, biases, output, conv_info, act_info, enable_fast_math);
+ _function = std::move(f);
+ break;
+ }
+ case ConvolutionMethod::GEMM:
+ {
+ auto f = std::make_unique<CpuGemmConv2d>();
+ f->configure(input, weights, biases, output, conv_info, weights_info, dilation, act_info, enable_fast_math);
+ _function = std::move(f);
+ break;
+ }
+ case ConvolutionMethod::GEMM_CONV2D:
+ {
+ auto f = std::make_unique<CpuGemmDirectConv2d>();
+ f->configure(input, weights, biases, output, info);
+ _function = std::move(f);
+ break;
+ }
+ case ConvolutionMethod::DIRECT:
+ {
+ auto f = std::make_unique<CpuDirectConv2d>();
+ f->configure(input, weights, biases, output, conv_info, act_info);
+ _function = std::move(f);
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Not supported.");
+ break;
+ }
+
+ _aux_mem = _function->workspace();
+}
+
+Status CpuConv2d::validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info,
+ const Size2D &dilation,
+ const ActivationLayerInfo &act_info,
+ bool enable_fast_math,
+ unsigned int num_groups)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((num_groups != 1), "Grouping (num_groups != 1) is not supported on Neon");
+
+ const Conv2dInfo info(conv_info, dilation, act_info, enable_fast_math, num_groups);
+ switch (CpuConv2d::get_convolution_method(input, weights, output, conv_info, weights_info, dilation, act_info,
+ enable_fast_math))
+ {
+ case ConvolutionMethod::WINOGRAD:
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CpuWinogradConv2d::validate(input, weights, biases, output, conv_info, act_info, enable_fast_math));
+ break;
+ case ConvolutionMethod::GEMM:
+ ARM_COMPUTE_RETURN_ON_ERROR(CpuGemmConv2d::validate(input, weights, biases, output, conv_info, weights_info,
+ dilation, act_info, enable_fast_math));
+ break;
+ case ConvolutionMethod::GEMM_CONV2D:
+ ARM_COMPUTE_RETURN_ON_ERROR(CpuGemmDirectConv2d::validate(input, weights, biases, output, info));
+ break;
+ case ConvolutionMethod::DIRECT:
+ ARM_COMPUTE_RETURN_ON_ERROR(CpuDirectConv2d::validate(input, weights, biases, output, conv_info, act_info));
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported.");
+ break;
+ }
+
+ return Status{};
+}
+
+ConvolutionMethod CpuConv2d::get_convolution_method(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info,
+ const Size2D &dilation,
+ const ActivationLayerInfo &act_info,
+ bool enable_fast_math)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, weights);
+ ARM_COMPUTE_UNUSED(weights_info);
+
+ const size_t idx_w = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
+ const size_t idx_h = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
+ const size_t idx_c = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL);
+
+ const Conv2dInfo info(conv_info, dilation, act_info, enable_fast_math, 1);
+
+ /* Input spatial dims, kernel size, IFM/OFM, conv info*/
+ using ConvolutionConfiguration = std::tuple<Size2D, Size2D, Size2D, PadStrideInfo>;
+ using ConfigurationMethod = std::pair<ConvolutionConfiguration, ConvolutionMethod>;
+
+ const std::vector<ConfigurationMethod> known_configs = {
+ // Alexnet
+ ConfigurationMethod(ConvolutionConfiguration(Size2D(27U, 27U), Size2D(5U, 5U), Size2D(48U, 128U),
+ PadStrideInfo(1U, 1U, 2U, 2U)),
+ ConvolutionMethod::GEMM),
+ // VGG16 / VGG19
+ ConfigurationMethod(ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 64U),
+ PadStrideInfo(1U, 1U, 1U, 1U)),
+ ConvolutionMethod::GEMM),
+ // Mobilenet 224
+ ConfigurationMethod(
+ ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 32U),
+ PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR)),
+ ConvolutionMethod::GEMM),
+ // Mobilenet 160
+ ConfigurationMethod(
+ ConvolutionConfiguration(Size2D(160U, 160U), Size2D(3U, 3U), Size2D(3U, 24U),
+ PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR)),
+ ConvolutionMethod::GEMM)};
+
+ const auto find_config = [&](ConfigurationMethod c)
+ {
+ const ConvolutionConfiguration config = c.first;
+ const PadStrideInfo info = std::get<3>(config);
+
+ return std::get<0>(config) == Size2D(input->dimension(idx_w), input->dimension(idx_h)) &&
+ std::get<1>(config) == Size2D(weights->dimension(idx_w), weights->dimension(idx_h)) &&
+ std::get<2>(config) == Size2D(weights->dimension(idx_c), weights->dimension(3)) &&
+ info.pad_top() == conv_info.pad_top() && info.pad_right() == conv_info.pad_right() &&
+ info.pad_bottom() == conv_info.pad_bottom() && info.pad_left() == conv_info.pad_left() &&
+ info.stride() == conv_info.stride();
+ };
+
+ std::vector<ConfigurationMethod>::const_iterator found;
+ if ((found = std::find_if(known_configs.begin(), known_configs.end(), find_config)) != known_configs.end())
+ {
+ return (*found).second;
+ }
+
+ if (dilation != Size2D(1U, 1U))
+ {
+ return ConvolutionMethod::GEMM;
+ }
+ else
+ {
+ const bool gemmDirectConv2d_validates =
+ bool(CpuGemmDirectConv2d::validate(input, weights, nullptr, output, info));
+
+ // SRGAN
+ // Output might not be initialized when it is an internal tensor of the layer using the convolution
+ if (input->total_size() > 1e7 && weights->dimension(idx_h) > 7)
+ {
+ // This configuration is memory demanding for GEMM method. GEMM_CONV2D which uses indirect convolution
+ // kernels underneath is the best option.
+ if (gemmDirectConv2d_validates)
+ {
+ return ConvolutionMethod::GEMM_CONV2D;
+ }
+ else if (bool(CpuDirectConv2d::validate(input, weights, nullptr, output, conv_info, act_info)))
+ {
+ // NCHW data layout is not supported by GEMM_CONV2D
+ return ConvolutionMethod::DIRECT;
+ }
+ }
+ if (input->dimension(idx_c) < 16)
+ {
+ return ConvolutionMethod::GEMM;
+ }
+
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+    // This heuristic only applies to the F16 data type on A55r1
+ if (NEScheduler::get().cpu_info().get_cpu_model() == CPUModel::A55r1 && enable_fast_math &&
+ input->data_type() == DataType::F16)
+ {
+        // Exclude known bad Winograd configs (and default to GEMM)
+ const std::vector<ConvolutionConfiguration> known_bad_winograd_f16_with_fastmath_configs = {
+ // Squeezenet_V1_1 fire2 and fire3
+ ConvolutionConfiguration(Size2D(56U, 56U), Size2D(3U, 3U), Size2D(16U, 64U),
+ PadStrideInfo(1U, 1U, 1U, 1U)),
+ // Squeezenet_V1_1 fire6 and fire7
+ ConvolutionConfiguration(Size2D(14U, 14U), Size2D(3U, 3U), Size2D(48U, 192U),
+ PadStrideInfo(1U, 1U, 1U, 1U)),
+ // Squeezenet_V1_1 fire8 and fire9
+ ConvolutionConfiguration(Size2D(14U, 14U), Size2D(3U, 3U), Size2D(64U, 256U),
+ PadStrideInfo(1U, 1U, 1U, 1U)),
+ };
+ const auto find_conv_config = [&](ConvolutionConfiguration c)
+ {
+ const PadStrideInfo info = std::get<3>(c);
+
+ return std::get<0>(c) == Size2D(input->dimension(idx_w), input->dimension(idx_h)) &&
+ std::get<1>(c) == Size2D(weights->dimension(idx_w), weights->dimension(idx_h)) &&
+ std::get<2>(c) == Size2D(weights->dimension(idx_c), weights->dimension(3)) &&
+ info.pad_top() == conv_info.pad_top() && info.pad_right() == conv_info.pad_right() &&
+ info.pad_bottom() == conv_info.pad_bottom() && info.pad_left() == conv_info.pad_left() &&
+ info.stride() == conv_info.stride();
+ };
+
+ bool found_bad = std::find_if(known_bad_winograd_f16_with_fastmath_configs.begin(),
+ known_bad_winograd_f16_with_fastmath_configs.end(),
+ find_conv_config) != known_bad_winograd_f16_with_fastmath_configs.end();
+ if (found_bad)
+ {
+ return ConvolutionMethod::GEMM;
+ }
+ }
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+ // For 1x1 convolutions run the default GEMM
+ if (weights->dimension(idx_w) == 1 && weights->dimension(idx_h) == 1)
+ {
+ return ConvolutionMethod::GEMM;
+ }
+
+ if (bool(CpuWinogradConv2d::validate(input, weights, nullptr, output, conv_info, act_info, enable_fast_math)))
+ {
+ return ConvolutionMethod::WINOGRAD;
+ }
+ if (gemmDirectConv2d_validates)
+ {
+ return ConvolutionMethod::GEMM_CONV2D;
+ }
+ return ConvolutionMethod::GEMM;
+ }
+}
+
+void CpuConv2d::run(ITensorPack &tensors)
+{
+ prepare(tensors);
+ _function->run(tensors);
+}
+
+void CpuConv2d::prepare(ITensorPack &tensors)
+{
+ _function->prepare(tensors);
+}
+
+experimental::MemoryRequirements CpuConv2d::workspace() const
+{
+ return _aux_mem;
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/operators/CpuConv2d.h b/src/cpu/operators/CpuConv2d.h
new file mode 100644
index 0000000000..71b9e15dc1
--- /dev/null
+++ b/src/cpu/operators/CpuConv2d.h
@@ -0,0 +1,169 @@
+/*
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+
+#include "src/core/common/Macros.h"
+#include "src/cpu/ICpuOperator.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+/** Basic function to simulate a convolution layer. This function calls one of the following functions:
+ * -# @ref CpuGemm (executed only in case GEMM is required for the operation)
+ * -# @ref CpuWinogradConv2d (executed only in case Winograd is required for the operation)
+ * -# @ref CpuDirectConv2d (executed only in case Direct Convolution is required for the operation)
+ *
+ *
+ * The function selects one of the algorithms mentioned above based on:
+ * - The size of the kernel
+ * - Number of input/output feature maps
+ * - Amount of memory needed
+ *
+ * Generally GEMM-based convolution is executed when neither Winograd nor FFT nor Direct convolution can be performed.
+ *
+ * FP32 Algorithm| Filter Size | Input/Output feature maps |
+ * --------------|----------------------------------------------------|-------------------------------------------|
+ * Winograd | 3x3 1x3 3x1 5x1 1x5 5x5(fast maths) 7x1 1x7 | Input channels is greater than 3 |
+ * FFT | Squared kernels and greater than 9x9 | Input feature maps > Output feature maps |
+ * DirectConv | 9x9 | |
+ * GEMM | Any size | |
+ *
+ * Winograd 5x5 requires fast maths enabled.
+ *
+ * FP16 Algorithm| Filter Size |
+ * --------------|------------------|
+ * Winograd | Not supported |
+ * FFT | Not supported |
+ * DirectConv | 9x9 |
+ * GEMM | Any size |
+ *
+ *
+ */
+class CpuConv2d : public ICpuOperator
+{
+public:
+ /** Constructor */
+ CpuConv2d();
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuConv2d);
+ /** Default destructor */
+ ~CpuConv2d();
+ /** Set the input and output tensors.
+ *
+ * Valid data layouts:
+ * - NHWC
+ * - NCHW
+ *
+ * Valid data type configurations:
+ * |src0 |src1 |src2 |dst |
+ * |:--------------|:------------------|:------|:--------------|
+ * |F16 |F16 |F16 |F16 |
+ * |F32 |F32 |F32 |F32 |
+ * |QASYMM8 |QASYMM8 |S32 |QASYMM8 |
+ * |QASYMM8 |QSYMM8_PER_CHANNEL |S32 |QASYMM8 |
+ * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |QASYMM8_SIGNED |
+ * |QASYMM8_SIGNED |QSYMM8_PER_CHANNEL |S32 |QASYMM8_SIGNED |
+ *
+ * @param[in] src Source tensor info. 3 lower dimensions represent a single input [width, height, IFM],
+ * while every optional dimension from 4 and above represent a batch of inputs.
+ * Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+ * @param[in] weights Weights tensor info. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
+ * Data type supported: Same as @p src, also could be QSYMM8_PER_CHANNEL if input is QASYMM8/QASYMM8_SIGNED.
+ * @param[in] biases Biases tensor info. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
+ * Data type supported: Same as @p src, except for input of QASYMM8/QASYMM8_SIGNED type where biases should be of S32 type.
+ * @param[out] dst Destination tensor info. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
+ * Data types supported: Same as @p src.
+ * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
+ * @param[in] weights_info Specifies if the weights tensor has been reshaped with NEWeightsReshapeKernel. If this is not part of the fully connected layer the weights
+     *                          tensor has also been transposed with cpu::kernels::CpuGemmTranspose1xWKernel. Data type supported: Same as @p src.
+ * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
+ * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported.
+     * @param[in]  enable_fast_math (Optional) Enable fast math computation. If this flag is set, the function may dispatch the fastest implementation
+     *                             available, which may also reduce accuracy. Default is false
+ * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is not supported
+ */
+ void configure(ITensorInfo *src,
+ ITensorInfo *weights,
+ const ITensorInfo *biases,
+ ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info = WeightsInfo(),
+ const Size2D &dilation = Size2D(1U, 1U),
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ bool enable_fast_math = false,
+ unsigned int num_groups = 1);
+ /** Static function to check if given info will lead to a valid configuration of @ref CpuConv2d
+ *
+ * Similar to CpuConv2d::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info = WeightsInfo(),
+ const Size2D &dilation = Size2D(1U, 1U),
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ bool enable_fast_math = false,
+ unsigned int num_groups = 1);
+ /** Static function to check if given info will return the convolution called by @ref CpuConv2d
+ *
+     * @param[in] src              Source tensor info. 3 lower dimensions represent a single input [width, height, IFM],
+     *                            while every optional dimension from 4 and above represents a batch of inputs.
+     *                            Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+     * @param[in] weights          Weights tensor info. Weights are a 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
+     *                            Data type supported: Same as @p src; QSYMM8_PER_CHANNEL is also supported when @p src is QASYMM8/QASYMM8_SIGNED.
+     * @param[in] dst              Destination tensor info. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent a batch of outputs.
+     *                            Data types supported: Same as @p src.
+     * @param[in] conv_info        Contains padding and stride information described in @ref PadStrideInfo.
+     * @param[in] weights_info     Specifies if the weights tensor has been reshaped with NEWeightsReshapeKernel. If this is not part of the fully connected layer, the weights
+     *                            tensor has also been transposed with cpu::kernels::CpuGemmTranspose1xWKernel. Data type supported: Same as @p src.
+     * @param[in] dilation         (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
+     * @param[in] act_info         (Optional) Activation layer information in case of a fused activation.
+     * @param[in] enable_fast_math (Optional) Enable fast math computation. If this flag is set, the function may dispatch the fastest implementation
+     *                             available, which may reduce accuracy. Default is false
+ *
+ * @return the Convolution Method Hint
+ */
+ static ConvolutionMethod get_convolution_method(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info = WeightsInfo(),
+ const Size2D &dilation = Size2D(1U, 1U),
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ bool enable_fast_math = false);
+ // Inherited methods overridden:
+ void run(ITensorPack &tensors) override;
+ void prepare(ITensorPack &constants) override;
+ experimental::MemoryRequirements workspace() const override;
+
+private:
+ std::unique_ptr<ICpuOperator> _function;
+ experimental::MemoryRequirements _aux_mem{};
+};
+} // namespace cpu
+} // namespace arm_compute
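A configure-time usage sketch for the operator above, assuming the stateless experimental-operator pattern (ITensorInfo descriptors at configure time, an ITensorPack of real tensors at run time). The NHWC shapes, the example_conv2d_setup name and the include paths relative to the library source tree are illustrative assumptions, not part of this patch:

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "src/cpu/operators/CpuConv2d.h"

    using namespace arm_compute;

    void example_conv2d_setup()
    {
        // NHWC descriptors (TensorShape order is [C, W, H, N]); all values are illustrative.
        TensorInfo src(TensorShape(64U, 56U, 56U, 1U), 1, DataType::F32);
        TensorInfo weights(TensorShape(64U, 3U, 3U, 128U), 1, DataType::F32);
        TensorInfo biases(TensorShape(128U), 1, DataType::F32);
        TensorInfo dst(TensorShape(128U, 56U, 56U, 1U), 1, DataType::F32);
        src.set_data_layout(DataLayout::NHWC);
        weights.set_data_layout(DataLayout::NHWC);
        dst.set_data_layout(DataLayout::NHWC);

        const PadStrideInfo conv_info(1, 1, 1, 1); // stride 1, pad 1 -> same spatial size

        // validate() mirrors configure() and should be checked first.
        if (!bool(cpu::CpuConv2d::validate(&src, &weights, &biases, &dst, conv_info)))
        {
            return;
        }

        // Ask which backend the heuristic would pick (e.g. GEMM, GEMM_CONV2D, DIRECT, WINOGRAD).
        const ConvolutionMethod method = cpu::CpuConv2d::get_convolution_method(&src, &weights, &dst, conv_info);
        (void)method;

        cpu::CpuConv2d conv;
        conv.configure(&src, &weights, &biases, &dst, conv_info);

        // At run time: bind ACL_SRC_0/1/2 and ACL_DST_0 in an ITensorPack, allocate the
        // auxiliary buffers reported by conv.workspace(), call prepare() once, then run().
    }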
diff --git a/src/cpu/operators/CpuConvertFullyConnectedWeights.cpp b/src/cpu/operators/CpuConvertFullyConnectedWeights.cpp
new file mode 100644
index 0000000000..49e31926e3
--- /dev/null
+++ b/src/cpu/operators/CpuConvertFullyConnectedWeights.cpp
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/operators/CpuConvertFullyConnectedWeights.h"
+
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+#include "src/common/utils/Log.h"
+#include "src/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void CpuConvertFullyConnectedWeights::configure(const ITensorInfo *src,
+ ITensorInfo *dst,
+ const TensorShape &original_src_shape,
+ DataLayout data_layout)
+{
+ ARM_COMPUTE_LOG_PARAMS(src, dst, original_src_shape, data_layout);
+ auto k = std::make_unique<kernels::CpuConvertFullyConnectedWeightsKernel>();
+ k->configure(src, dst, original_src_shape, data_layout);
+ _kernel = std::move(k);
+}
+
+Status CpuConvertFullyConnectedWeights::validate(const ITensorInfo *src,
+ const ITensorInfo *dst,
+ const TensorShape &original_src_shape,
+ DataLayout data_layout)
+{
+ return kernels::CpuConvertFullyConnectedWeightsKernel::validate(src, dst, original_src_shape, data_layout);
+}
+
+void CpuConvertFullyConnectedWeights::run(ITensorPack &tensors)
+{
+ NEScheduler::get().schedule_op(_kernel.get(), Window::DimZ, _kernel->window(), tensors);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/operators/CpuConvertFullyConnectedWeights.h b/src/cpu/operators/CpuConvertFullyConnectedWeights.h
new file mode 100644
index 0000000000..e208cca3a0
--- /dev/null
+++ b/src/cpu/operators/CpuConvertFullyConnectedWeights.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_CONVERT_FULLYCONNECTED_WEIGHTS_H
+#define ARM_COMPUTE_CPU_CONVERT_FULLYCONNECTED_WEIGHTS_H
+
+#include "src/cpu/ICpuOperator.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+/** Basic function to run @ref kernels::CpuConvertFullyConnectedWeightsKernel */
+class CpuConvertFullyConnectedWeights : public ICpuOperator
+{
+public:
+ /** Configure operator for a given list of arguments
+ *
+     * @param[in]  src                Source weights tensor info to convert. Data types supported: All
+     * @param[out] dst                Destination tensor info. Data types supported: Same as @p src
+ * @param[in] original_src_shape Shape of the original src tensor (the one entering fully connected layer).
+ * @param[in] data_layout The data layout the weights have been trained in.
+ */
+ void
+ configure(const ITensorInfo *src, ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref CpuConvertFullyConnectedWeights::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *dst,
+ const TensorShape &original_src_shape,
+ DataLayout data_layout);
+ // Inherited methods overridden:
+ void run(ITensorPack &tensors) override;
+};
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_CONVERT_FULLYCONNECTED_WEIGHTS_H */
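A short sketch of how this converter is typically driven: weights of a fully connected layer that were trained against an NCHW feature map need their rows reshuffled before they can be consumed with NHWC input (and vice versa). The shapes, the example name and the assumption that the kernel consumes the usual ACL_SRC/ACL_DST slots are illustrative:

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "src/cpu/operators/CpuConvertFullyConnectedWeights.h"

    using namespace arm_compute;

    void example_convert_fc_weights()
    {
        // 2D weights of a fully connected layer: [num_inputs, num_outputs] = [4*4*64, 10].
        TensorInfo weights(TensorShape(1024U, 10U), 1, DataType::F32);
        TensorInfo converted(TensorShape(1024U, 10U), 1, DataType::F32);

        // Shape of the tensor that fed the fully connected layer before flattening,
        // in the layout the weights were trained with (here assumed NCHW, 4x4x64).
        const TensorShape original_src_shape(4U, 4U, 64U);

        if (bool(cpu::CpuConvertFullyConnectedWeights::validate(&weights, &converted, original_src_shape,
                                                                DataLayout::NCHW)))
        {
            cpu::CpuConvertFullyConnectedWeights convert;
            convert.configure(&weights, &converted, original_src_shape, DataLayout::NCHW);
            // run() then takes an ITensorPack with the trained weights bound to ACL_SRC and
            // the converted weights buffer bound to ACL_DST.
        }
    }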
diff --git a/src/cpu/operators/CpuCopy.cpp b/src/cpu/operators/CpuCopy.cpp
new file mode 100644
index 0000000000..92c19d4df2
--- /dev/null
+++ b/src/cpu/operators/CpuCopy.cpp
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/operators/CpuCopy.h"
+
+#include "src/common/utils/Log.h"
+#include "src/cpu/kernels/CpuCopyKernel.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void CpuCopy::configure(const ITensorInfo *src, ITensorInfo *dst)
+{
+ ARM_COMPUTE_LOG_PARAMS(src, dst);
+ auto k = std::make_unique<kernels::CpuCopyKernel>();
+ k->configure(src, dst);
+ _kernel = std::move(k);
+}
+
+Status CpuCopy::validate(const ITensorInfo *src, const ITensorInfo *dst)
+{
+ return kernels::CpuCopyKernel::validate(src, dst);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/operators/CpuCopy.h b/src/cpu/operators/CpuCopy.h
new file mode 100644
index 0000000000..9ffde4e781
--- /dev/null
+++ b/src/cpu/operators/CpuCopy.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_COPY_H
+#define ARM_COMPUTE_CPU_COPY_H
+
+#include "src/cpu/ICpuOperator.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+/** Basic function to run @ref kernels::CpuCopyKernel */
+class CpuCopy : public ICpuOperator
+{
+public:
+ /** Configure operator for a given list of arguments
+ *
+ * @param[in] src Source tensor info. Data type supported: All
+     * @param[out] dst Destination tensor info. Data type supported: Same as @p src
+ */
+ void configure(const ITensorInfo *src, ITensorInfo *dst);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref CpuCopy::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
+};
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_COPY_H */
diff --git a/src/cpu/operators/CpuDepthwiseConv2d.cpp b/src/cpu/operators/CpuDepthwiseConv2d.cpp
new file mode 100644
index 0000000000..54075f2afa
--- /dev/null
+++ b/src/cpu/operators/CpuDepthwiseConv2d.cpp
@@ -0,0 +1,568 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/operators/CpuDepthwiseConv2d.h"
+
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/utils/misc/InfoHelpers.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+#include "src/common/utils/Log.h"
+#include "src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace
+{
+Status validate_arguments_optimized(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const ConvolutionInfo &info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::F16, DataType::F32);
+ if (!is_data_type_quantized_per_channel(weights->data_type()))
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights);
+ }
+ ARM_COMPUTE_RETURN_ERROR_ON(src->data_layout() == DataLayout::UNKNOWN);
+ ARM_COMPUTE_RETURN_ERROR_ON(info.dilation.x() < 1 || info.dilation.y() < 1);
+ const size_t idx_w = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::WIDTH);
+ const size_t idx_h = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::HEIGHT);
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) + (weights->dimension(idx_w) - 1) * (info.dilation.x() - 1) >
+ src->dimension(idx_w) + info.pad_stride_info.pad_left() +
+ info.pad_stride_info.pad_right());
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_h) + (weights->dimension(idx_h) - 1) * (info.dilation.y() - 1) >
+ src->dimension(idx_h) + info.pad_stride_info.pad_top() +
+ info.pad_stride_info.pad_bottom());
+
+ if (biases != nullptr)
+ {
+ const unsigned int channel_idx =
+ get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::CHANNEL);
+ ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(channel_idx));
+ }
+
+ ARM_COMPUTE_RETURN_ON_ERROR(CpuDepthwiseConv2dAssemblyDispatch::validate(src, weights, biases, dst, info));
+
+ // Validate Activation Layer
+ if (info.act_info.enabled() && !CpuDepthwiseConv2dAssemblyDispatch::is_activation_supported(info.act_info))
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(CpuActivation::validate(dst, nullptr, info.act_info));
+ }
+ return Status{};
+}
+} // namespace
+
+void CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::configure(ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ ITensorInfo *dst,
+ const ConvolutionInfo &info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
+ // Perform validation step
+    ARM_COMPUTE_ERROR_THROW_ON(CpuDepthwiseConv2dOptimizedInternal::validate(src, weights, biases, dst, info));
+
+ _is_quantized = is_data_type_quantized_asymmetric(src->data_type());
+ _has_bias = biases != nullptr;
+ _is_nchw = src->data_layout() == DataLayout::NCHW;
+ _permute = _is_nchw;
+ _is_prepared = false;
+ _are_weights_const = weights->are_values_constant();
+
+ // Configure pipeline
+ _is_activationlayer_enabled =
+ info.act_info.enabled() && !CpuDepthwiseConv2dAssemblyDispatch::is_activation_supported(info.act_info);
+
+ _dwc_optimized_func = std::make_unique<CpuDepthwiseConv2dAssemblyDispatch>();
+ if (_is_nchw)
+ {
+ _permute_input = std::make_unique<cpu::CpuPermute>();
+ _permute_weights = std::make_unique<cpu::CpuPermute>();
+ _permute_output = std::make_unique<cpu::CpuPermute>();
+
+ auto input_perm = std::make_unique<TensorInfo>();
+ auto weights_perm = std::make_unique<TensorInfo>();
+ auto output_perm = std::make_unique<TensorInfo>();
+
+ // Configure the function to transform the input tensor from NCHW -> NHWC
+ _permute_input->configure(src, input_perm.get(), PermutationVector(2U, 0U, 1U));
+ input_perm->set_data_layout(DataLayout::NHWC);
+
+ // Configure the function to transform the weights tensor from IHW -> HWI
+ _permute_weights->configure(weights, weights_perm.get(), PermutationVector(2U, 0U, 1U));
+ weights_perm->set_data_layout(DataLayout::NHWC);
+
+ output_perm->set_data_layout(DataLayout::NHWC);
+ output_perm->set_quantization_info(dst->quantization_info());
+
+ // Configure optimized depthwise
+ _dwc_optimized_func->configure(input_perm.get(), weights_perm.get(), biases, output_perm.get(), info);
+
+        // Configure the function to transform the convolved output to ACL's native ordering format NCHW
+ output_perm->set_data_layout(DataLayout::NHWC);
+ _permute_output->configure(output_perm.get(), dst, PermutationVector(1U, 2U, 0U));
+ }
+ else
+ {
+ _dwc_optimized_func->configure(src, weights, biases, dst, info);
+ }
+
+ // Configure activation
+ if (_is_activationlayer_enabled)
+ {
+ _activationlayer_function = std::make_unique<cpu::CpuActivation>();
+ _activationlayer_function->configure(dst, nullptr, info.act_info);
+ }
+}
+
+Status CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const ConvolutionInfo &info)
+{
+ return validate_arguments_optimized(src, weights, biases, dst, info);
+}
+
+void CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::run(ITensorPack &tensors)
+{
+ ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided");
+ prepare(tensors);
+
+ auto bias = tensors.get_const_tensor(TensorType::ACL_SRC_2);
+ auto dst = tensors.get_tensor(TensorType::ACL_DST_0);
+ auto workspace = tensors.get_tensor(TensorType::ACL_INT_3);
+ auto packed_weights = tensors.get_tensor(TensorType::ACL_INT_4);
+
+ // Permute input
+ if (_permute)
+ {
+ ITensorPack pack;
+ auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0);
+ auto src_perm = tensors.get_tensor(TensorType::ACL_INT_0);
+ pack.add_tensor(TensorType::ACL_SRC, src);
+ pack.add_tensor(TensorType::ACL_DST, src_perm);
+ _permute_input->run(pack);
+ }
+
+ // Run assembly function
+ if (_is_nchw)
+ {
+ auto src_perm = tensors.get_tensor(TensorType::ACL_INT_0);
+ auto weights_perm = tensors.get_tensor(TensorType::ACL_INT_1);
+ auto dst_perm = tensors.get_tensor(TensorType::ACL_INT_2);
+
+ ITensorPack pack;
+ pack.add_tensor(TensorType::ACL_SRC_0, src_perm);
+ pack.add_tensor(TensorType::ACL_SRC_1, weights_perm);
+ pack.add_tensor(TensorType::ACL_SRC_2, bias);
+ pack.add_tensor(TensorType::ACL_INT_0, workspace);
+ pack.add_tensor(TensorType::ACL_INT_1, packed_weights);
+ pack.add_tensor(TensorType::ACL_DST, dst_perm);
+ _dwc_optimized_func->run(pack);
+ }
+ else
+ {
+ auto src = tensors.get_tensor(TensorType::ACL_SRC_0);
+ auto weights = tensors.get_tensor(TensorType::ACL_SRC_1);
+ auto dst = tensors.get_tensor(TensorType::ACL_DST);
+
+ ITensorPack pack;
+ pack.add_tensor(TensorType::ACL_SRC_0, src);
+ pack.add_tensor(TensorType::ACL_SRC_1, weights);
+ pack.add_tensor(TensorType::ACL_SRC_2, bias);
+ pack.add_tensor(TensorType::ACL_INT_0, workspace);
+ pack.add_tensor(TensorType::ACL_INT_1, packed_weights);
+ pack.add_tensor(TensorType::ACL_DST, dst);
+ _dwc_optimized_func->run(pack);
+ }
+
+ // Permute output
+ if (_is_nchw)
+ {
+ ITensorPack pack;
+ auto dst_perm = tensors.get_tensor(TensorType::ACL_INT_2);
+ pack.add_tensor(TensorType::ACL_SRC, dst_perm);
+ pack.add_tensor(TensorType::ACL_DST, dst);
+ _permute_output->run(pack);
+ }
+
+ // Run activation
+ if (_is_activationlayer_enabled)
+ {
+ ITensorPack pack;
+ pack.add_tensor(TensorType::ACL_SRC, dst);
+ pack.add_tensor(TensorType::ACL_DST, dst);
+ _activationlayer_function->run(pack);
+ }
+}
+
+void CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::prepare(ITensorPack &tensors)
+{
+    // If the weights are not constant we need to re-pack on every run so that
+    // they can be updated in place
+ if (!_are_weights_const)
+ {
+ auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1);
+ auto bias = tensors.get_const_tensor(TensorType::ACL_SRC_2);
+ auto packed_weights = tensors.get_tensor(TensorType::ACL_INT_4);
+
+ ITensorPack pack_opt;
+ pack_opt.add_tensor(TensorType::ACL_SRC_1, weights);
+ pack_opt.add_tensor(TensorType::ACL_SRC_2, bias);
+ pack_opt.add_tensor(TensorType::ACL_INT_1, packed_weights);
+
+ // Prepare optimized function
+ _dwc_optimized_func->prepare(pack_opt);
+
+ return;
+ }
+
+ if (!_is_prepared)
+ {
+ auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1);
+ auto bias = tensors.get_const_tensor(TensorType::ACL_SRC_2);
+ auto packed_weights = tensors.get_tensor(TensorType::ACL_INT_4);
+
+ // Permute weights
+ if (_permute)
+ {
+ auto permuted_weights = tensors.get_tensor(TensorType::ACL_INT_1);
+
+ ITensorPack pack;
+ pack.add_tensor(TensorType::ACL_SRC, weights);
+ pack.add_tensor(TensorType::ACL_DST, permuted_weights);
+ _permute_weights->run(pack);
+
+ weights->mark_as_unused();
+
+ ITensorPack pack_opt;
+ pack_opt.add_const_tensor(TensorType::ACL_SRC_1, permuted_weights);
+ pack_opt.add_tensor(TensorType::ACL_SRC_2, bias);
+ pack_opt.add_tensor(TensorType::ACL_INT_1, packed_weights);
+
+ // Prepare optimized function
+ _dwc_optimized_func->prepare(pack_opt);
+ }
+ else
+ {
+ ITensorPack pack_opt;
+ pack_opt.add_tensor(TensorType::ACL_SRC_1, weights);
+ pack_opt.add_tensor(TensorType::ACL_SRC_2, bias);
+ pack_opt.add_tensor(TensorType::ACL_INT_1, packed_weights);
+
+ // Prepare optimized function
+ _dwc_optimized_func->prepare(pack_opt);
+ }
+
+ _is_prepared = true;
+ }
+}
+
+void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::configure(ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ ITensorInfo *dst,
+ const ConvolutionInfo &info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
+    ARM_COMPUTE_ERROR_THROW_ON(CpuDepthwiseConv2d::validate(src, weights, biases, dst, info));
+
+ _is_nchw = src->data_layout() == DataLayout::NCHW;
+ _is_prepared = !_is_nchw;
+
+ ITensorInfo *input_to_use = src;
+ const ITensorInfo *weights_to_use = weights;
+ ITensorInfo *output_to_use = dst;
+
+ auto input_perm = std::make_unique<TensorInfo>();
+ auto weights_perm = std::make_unique<TensorInfo>();
+ auto output_perm = std::make_unique<TensorInfo>(
+ dst->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(TensorShape()));
+
+ if (_is_nchw)
+ {
+ _permute_input = std::make_unique<cpu::CpuPermute>();
+ _permute_weights = std::make_unique<cpu::CpuPermute>();
+
+ _permute_input->configure(src, input_perm.get(), PermutationVector(2U, 0U, 1U));
+ input_perm->set_data_layout(DataLayout::NHWC);
+ input_to_use = input_perm.get();
+
+ _permute_weights->configure(weights, weights_perm.get(), PermutationVector(2U, 0U, 1U));
+ weights_perm->set_data_layout(DataLayout::NHWC);
+ weights_to_use = weights_perm.get();
+
+ output_to_use = output_perm.get();
+ }
+
+ _depthwise_conv_kernel = std::make_unique<cpu::kernels::CpuDepthwiseConv2dNativeKernel>();
+ _depthwise_conv_kernel->configure(input_to_use, weights_to_use, biases, output_to_use, info);
+
+ if (_is_nchw)
+ {
+ _permute_output = std::make_unique<cpu::CpuPermute>();
+ _permute_output->configure(output_perm.get(), dst, PermutationVector(1U, 2U, 0U));
+ output_perm->set_data_layout(DataLayout::NHWC);
+ }
+
+    // Configure Activation Layer
+ _is_activationlayer_enabled = info.act_info.enabled();
+ if (_is_activationlayer_enabled)
+ {
+ _activationlayer_function = std::make_unique<cpu::CpuActivation>();
+ _activationlayer_function->configure(dst, nullptr, info.act_info);
+ }
+}
+
+Status CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const ConvolutionInfo &info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst);
+ if (src->data_layout() == DataLayout::NCHW)
+ {
+ TensorShape permuted_input_shape = src->tensor_shape();
+ TensorShape permuted_weights_shape = weights->tensor_shape();
+ TensorShape permuted_output_shape =
+ misc::shape_calculator::compute_depthwise_convolution_shape(*src, *weights, info);
+ permute(permuted_input_shape, PermutationVector(2U, 0U, 1U));
+ permute(permuted_weights_shape, PermutationVector(2U, 0U, 1U));
+ permute(permuted_output_shape, PermutationVector(2U, 0U, 1U));
+
+ const TensorInfo permuted_input = TensorInfo(src->clone()
+ ->set_is_resizable(true)
+ .reset_padding()
+ .set_tensor_shape(permuted_input_shape)
+ .set_data_layout(DataLayout::NHWC));
+ const TensorInfo permuted_weights = TensorInfo(weights->clone()
+ ->set_is_resizable(true)
+ .reset_padding()
+ .set_tensor_shape(permuted_weights_shape)
+ .set_data_layout(DataLayout::NHWC));
+ const TensorInfo permuted_output = TensorInfo(dst->clone()
+ ->set_is_resizable(true)
+ .reset_padding()
+ .set_tensor_shape(permuted_output_shape)
+ .set_data_layout(DataLayout::NCHW));
+
+ ARM_COMPUTE_RETURN_ON_ERROR(CpuPermute::validate(src, &permuted_input, PermutationVector(2U, 0U, 1U)));
+ ARM_COMPUTE_RETURN_ON_ERROR(CpuPermute::validate(weights, &permuted_weights, PermutationVector(2U, 0U, 1U)));
+ ARM_COMPUTE_RETURN_ON_ERROR(CpuPermute::validate(&permuted_output, dst, PermutationVector(1U, 2U, 0U)));
+
+ ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuDepthwiseConv2dNativeKernel::validate(
+ &permuted_input, &permuted_weights, biases, &permuted_output, info));
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ cpu::kernels::CpuDepthwiseConv2dNativeKernel::validate(src, weights, biases, dst, info));
+ }
+
+ // Validate Activation Layer
+ if (info.act_info.enabled() && !CpuDepthwiseConv2dAssemblyDispatch::is_activation_supported(info.act_info))
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(CpuActivation::validate(dst, nullptr, info.act_info));
+ }
+
+ return Status{};
+}
+
+void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::run(ITensorPack &tensors)
+{
+ auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0);
+ auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1);
+ auto biases = tensors.get_const_tensor(TensorType::ACL_SRC_2);
+ auto dst = tensors.get_tensor(TensorType::ACL_DST_0);
+
+ if (_is_nchw)
+ {
+ prepare(tensors);
+ auto src_perm = tensors.get_tensor(TensorType::ACL_INT_0);
+ auto weights_perm = tensors.get_tensor(TensorType::ACL_INT_1);
+ auto dst_perm = tensors.get_tensor(TensorType::ACL_INT_2);
+
+ ITensorPack pack;
+ pack.add_tensor(TensorType::ACL_SRC, src);
+ pack.add_tensor(TensorType::ACL_DST, src_perm);
+ _permute_input->run(pack);
+
+ ITensorPack pack_depth;
+ pack_depth.add_const_tensor(TensorType::ACL_SRC_0, src_perm);
+ pack_depth.add_const_tensor(TensorType::ACL_SRC_1, weights_perm);
+ pack_depth.add_tensor(TensorType::ACL_SRC_2, biases);
+ pack_depth.add_tensor(TensorType::ACL_DST, dst_perm);
+ NEScheduler::get().schedule_op(_depthwise_conv_kernel.get(), Window::DimY, _depthwise_conv_kernel->window(),
+ pack_depth);
+ }
+ else
+ {
+ ITensorPack pack_depth;
+ pack_depth.add_tensor(TensorType::ACL_SRC_0, src);
+ pack_depth.add_tensor(TensorType::ACL_SRC_1, weights);
+ pack_depth.add_tensor(TensorType::ACL_SRC_2, biases);
+ pack_depth.add_tensor(TensorType::ACL_DST, dst);
+ NEScheduler::get().schedule_op(_depthwise_conv_kernel.get(), Window::DimY, _depthwise_conv_kernel->window(),
+ pack_depth);
+ }
+
+ if (_is_nchw)
+ {
+ ITensorPack pack;
+ auto dst_perm = tensors.get_tensor(TensorType::ACL_INT_2);
+ pack.add_tensor(TensorType::ACL_SRC, dst_perm);
+ pack.add_tensor(TensorType::ACL_DST, dst);
+ _permute_output->run(pack);
+ }
+
+ if (_is_activationlayer_enabled)
+ {
+ ITensorPack pack;
+ pack.add_tensor(TensorType::ACL_SRC, dst);
+ pack.add_tensor(TensorType::ACL_DST, dst);
+ _activationlayer_function->run(pack);
+ }
+}
+
+void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::prepare(ITensorPack &tensors)
+{
+ if (!_is_prepared)
+ {
+ auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1);
+ auto weights_perm = tensors.get_tensor(TensorType::ACL_INT_1);
+
+ ARM_COMPUTE_ERROR_ON(!weights->is_used());
+
+ ITensorPack pack;
+ pack.add_tensor(TensorType::ACL_SRC, weights);
+ pack.add_tensor(TensorType::ACL_DST, weights_perm);
+
+ _permute_weights->run(pack);
+ weights->mark_as_unused();
+ _is_prepared = true;
+ }
+}
+
+void CpuDepthwiseConv2d::configure(ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ ITensorInfo *dst,
+ const ConvolutionInfo &info)
+{
+ ARM_COMPUTE_LOG_PARAMS(src, weights, biases, dst, info);
+
+    _depth_conv_func = get_depthwiseconvolution_function(src, weights, biases, dst, info);
+ switch (_depth_conv_func)
+ {
+ case DepthwiseConvolutionFunction::OPTIMIZED:
+ _func_optimized.configure(src, weights, biases, dst, info);
+ break;
+ case DepthwiseConvolutionFunction::GENERIC:
+ _func_generic.configure(src, weights, biases, dst, info);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported DepthwiseConvolutionFunction");
+ }
+}
+
+Status CpuDepthwiseConv2d::validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const ConvolutionInfo &info)
+{
+ DepthwiseConvolutionFunction depth_conv_func = get_depthwiseconvolution_function(src, weights, biases, dst, info);
+ switch (depth_conv_func)
+ {
+ case DepthwiseConvolutionFunction::OPTIMIZED:
+ return CpuDepthwiseConv2dOptimizedInternal::validate(src, weights, biases, dst, info);
+ break;
+ case DepthwiseConvolutionFunction::GENERIC:
+ return CpuDepthwiseConv2dGeneric::validate(src, weights, biases, dst, info);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported DepthwiseConvolutionFunction");
+ }
+}
+
+DepthwiseConvolutionFunction CpuDepthwiseConv2d::get_depthwiseconvolution_function(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const ConvolutionInfo &info)
+{
+ if (bool(CpuDepthwiseConv2dOptimizedInternal::validate(src, weights, biases, dst, info)))
+ {
+ return DepthwiseConvolutionFunction::OPTIMIZED;
+ }
+ else
+ {
+ return DepthwiseConvolutionFunction::GENERIC;
+ }
+}
+
+void CpuDepthwiseConv2d::run(ITensorPack &tensors)
+{
+ switch (_depth_conv_func)
+ {
+ case DepthwiseConvolutionFunction::OPTIMIZED:
+ _func_optimized.run(tensors);
+ break;
+ case DepthwiseConvolutionFunction::GENERIC:
+ _func_generic.run(tensors);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("DepthwiseConvolutionFunction not properly configured");
+ }
+}
+
+void CpuDepthwiseConv2d::prepare(ITensorPack &tensors)
+{
+ switch (_depth_conv_func)
+ {
+ case DepthwiseConvolutionFunction::OPTIMIZED:
+ _func_optimized.prepare(tensors);
+ break;
+ case DepthwiseConvolutionFunction::GENERIC:
+ _func_generic.prepare(tensors);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("DepthwiseConvolutionFunction not properly configured");
+ }
+}
+} // namespace cpu
+} // namespace arm_compute
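The selection logic above reduces to: accept the assembly-backed OPTIMIZED path whenever its validate() passes, otherwise fall back to the GENERIC native kernel, with NCHW inputs additionally permuted to NHWC and back. A configure-time sketch of driving that choice, with illustrative NHWC shapes and only the ConvolutionInfo members exercised in this file set explicitly:

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "src/cpu/operators/CpuDepthwiseConv2d.h"

    using namespace arm_compute;

    void example_depthwise_setup()
    {
        // NHWC descriptors: 16 channels, 32x32 spatial, 3x3 depthwise kernel (shape order [C, W, H, N]).
        TensorInfo src(TensorShape(16U, 32U, 32U, 1U), 1, DataType::F32);
        TensorInfo weights(TensorShape(16U, 3U, 3U), 1, DataType::F32);
        TensorInfo biases(TensorShape(16U), 1, DataType::F32);
        TensorInfo dst(TensorShape(16U, 32U, 32U, 1U), 1, DataType::F32);
        src.set_data_layout(DataLayout::NHWC);
        weights.set_data_layout(DataLayout::NHWC);
        dst.set_data_layout(DataLayout::NHWC);

        ConvolutionInfo info{};
        info.pad_stride_info = PadStrideInfo(1, 1, 1, 1); // stride 1, pad 1 -> same spatial size
        info.dilation        = Size2D(1, 1);

        if (!bool(cpu::CpuDepthwiseConv2d::validate(&src, &weights, &biases, &dst, info)))
        {
            return;
        }

        // OPTIMIZED is returned whenever the assembly dispatch accepts the configuration.
        const DepthwiseConvolutionFunction func =
            cpu::CpuDepthwiseConv2d::get_depthwiseconvolution_function(&src, &weights, &biases, &dst, info);
        (void)func;

        cpu::CpuDepthwiseConv2d dwc;
        dwc.configure(&src, &weights, &biases, &dst, info);
        // run() consumes an ITensorPack with ACL_SRC_0/1/2 and ACL_DST_0; the optimized path additionally
        // expects the ACL_INT_* workspace tensors provided by the owning runtime function.
    }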
diff --git a/src/cpu/operators/CpuDepthwiseConv2d.h b/src/cpu/operators/CpuDepthwiseConv2d.h
new file mode 100644
index 0000000000..7eaa0df857
--- /dev/null
+++ b/src/cpu/operators/CpuDepthwiseConv2d.h
@@ -0,0 +1,238 @@
+/*
+ * Copyright (c) 2021-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_DEPTHWISE_CONV2D_H
+#define ARM_COMPUTE_CPU_DEPTHWISE_CONV2D_H
+
+#include "arm_compute/core/experimental/Types.h"
+#include "arm_compute/core/ITensorInfo.h"
+
+#include "src/cpu/ICpuKernel.h"
+#include "src/cpu/ICpuOperator.h"
+#include "src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h"
+#include "src/cpu/operators/CpuActivation.h"
+#include "src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.h"
+#include "src/cpu/operators/CpuPermute.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace cpu
+{
+/** Function to execute a depthwise convolution.
+ */
+class CpuDepthwiseConv2d : public ICpuOperator
+{
+public:
+ /** Default constructor */
+ CpuDepthwiseConv2d() = default;
+ /** Initialize the function's source, destination, weights and convolution information.
+ *
+     * @param[in, out] src     Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32
+     * @param[in]      weights Weights tensor info. These are 3D tensor infos with shape [kernel_x, kernel_y, IFM].
+     *                         Data type supported: Same as @p src or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p src is QASYMM8/QASYMM8_SIGNED.
+     * @param[in]      biases  Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
+     *                         Data type supported: Same as @p src, S32 when @p src is QASYMM8/QASYMM8_SIGNED.
+     * @param[out]     dst     Destination tensor info. Data type supported: same as @p src.
+     * @param[in]      info    Depthwise convolution meta-data.
+ */
+ void configure(ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ ITensorInfo *dst,
+ const ConvolutionInfo &info);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to CpuDepthwiseConv2d::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const ConvolutionInfo &info);
+ /** Static function to choose the best depthwise convolution function for @ref CpuDepthwiseConv2d
+ *
+ * @param[in] src Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32
+ * @param[in] weights Weights tensor info. These are 3D tensors with shape [kernel_x, kernel_y, IFM].
+ * Data type supported: Same as @p src or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p src is QASYMM8/QASYMM8_SIGNED.
+ * @param[in] biases Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
+ * Data type supported: Same as @p src, S32 when src is QASYMM8/QASYMM8_SIGNED.
+ * @param[in] dst Destination tensor. Data type supported: same as @p src.
+ * @param[in] info Depthwise convolution meta-data.
+ *
+ * @return a Depthwise Convolution Function
+ */
+ static DepthwiseConvolutionFunction get_depthwiseconvolution_function(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const ConvolutionInfo &info);
+
+    // Inherited methods overridden:
+ void run(ITensorPack &tensors) override;
+ void prepare(ITensorPack &tensors) override;
+
+private:
+ /** Basic function to execute optimized depthwise convolution routines. This function calls the following kernels:
+ *
+     * @note At the moment only 3x3 and 5x5 convolutions with stride 1 or 2 are supported
+ *
+ * -# @ref NEFillBorderKernel (if pad_x or pad_y > 0) and no assembly kernel implementation is present
+ * -# @ref CpuDepthwiseConv2d3x3Kernel if 3x3 and no assembly kernel implementation is present
+ * -# @ref CpuDepthwiseConv2dAssemblyDispatch if assembly kernel implementation is present
+ * -# @ref CpuActivation if fused activation is required
+ *
+ */
+ class CpuDepthwiseConv2dOptimizedInternal : public ICpuOperator
+ {
+ public:
+ /** Default constructor */
+ CpuDepthwiseConv2dOptimizedInternal() = default;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CpuDepthwiseConv2dOptimizedInternal(const CpuDepthwiseConv2dOptimizedInternal &) = delete;
+ /** Default move constructor */
+ CpuDepthwiseConv2dOptimizedInternal(CpuDepthwiseConv2dOptimizedInternal &&) = default;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CpuDepthwiseConv2dOptimizedInternal &operator=(const CpuDepthwiseConv2dOptimizedInternal &) = delete;
+ /** Default move assignment operator */
+ CpuDepthwiseConv2dOptimizedInternal &operator=(CpuDepthwiseConv2dOptimizedInternal &&) = default;
+ /** Default destructor */
+ ~CpuDepthwiseConv2dOptimizedInternal() = default;
+        /** Initialize the function's source, destination, weights and convolution information.
+ *
+ * @param[in, out] src Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32. (Written to only for border filling).
+ * @param[in] weights Weights tensor info. These are 3D tensors with shape [kernel_x, kernel_y, IFM]. Data type supported: Same as @p src.
+ * @param[in] biases Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
+ * Data type supported: Same as @p src, S32 when src is QASYMM8/QASYMM8_SIGNED.
+ * @param[out] dst Destination tensor info. Data type supported: same as @p src.
+ * @param[in] info Depthwise convolution meta-data.
+ */
+ void configure(ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ ITensorInfo *dst,
+ const ConvolutionInfo &info);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to CpuDepthwiseConv2dOptimizedInternal::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const ConvolutionInfo &info);
+
+        // Inherited methods overridden:
+ void run(ITensorPack &tensors) override;
+ void prepare(ITensorPack &tensors) override;
+
+ private:
+ std::unique_ptr<CpuDepthwiseConv2dAssemblyDispatch> _dwc_optimized_func{nullptr};
+ std::unique_ptr<CpuPermute> _permute_input{nullptr};
+ std::unique_ptr<CpuPermute> _permute_weights{nullptr};
+ std::unique_ptr<CpuPermute> _permute_output{nullptr};
+ std::unique_ptr<CpuActivation> _activationlayer_function{nullptr};
+ bool _has_bias{false};
+ bool _is_quantized{false};
+ bool _is_nchw{true};
+ bool _permute{false};
+ bool _is_activationlayer_enabled{false};
+ bool _is_prepared{false};
+ bool _are_weights_const{true};
+ };
+
+ /** Basic function to execute a generic depthwise convolution. This function calls the following kernel:
+ *
+ * -# @ref CpuDepthwiseConv2dNativeKernel
+ *
+ */
+ class CpuDepthwiseConv2dGeneric : public ICpuOperator
+ {
+ public:
+ /** Default constructor */
+ CpuDepthwiseConv2dGeneric() = default;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CpuDepthwiseConv2dGeneric(const CpuDepthwiseConv2dGeneric &) = delete;
+ /** Default move constructor */
+ CpuDepthwiseConv2dGeneric(CpuDepthwiseConv2dGeneric &&) = default;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CpuDepthwiseConv2dGeneric &operator=(const CpuDepthwiseConv2dGeneric &) = delete;
+ /** Default move assignment operator */
+ CpuDepthwiseConv2dGeneric &operator=(CpuDepthwiseConv2dGeneric &&) = default;
+ /** Default destructor */
+ ~CpuDepthwiseConv2dGeneric() = default;
+ /** Initialize the function's source, destination, weights and convolution information.
+ *
+         * @param[in, out] src     Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32. (Written to only for border filling).
+         * @param[in]      weights Weights tensor info. These are 3D tensors with shape [kernel_x, kernel_y, IFM].
+         *                         Data type supported: Same as @p src or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p src is QASYMM8/QASYMM8_SIGNED.
+         * @param[in]      biases  Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
+         *                         Data type supported: Same as @p src, S32 when @p src is QASYMM8/QASYMM8_SIGNED.
+         * @param[out]     dst     Destination tensor info. Data type supported: same as @p src.
+         * @param[in]      info    Depthwise convolution meta-data.
+ */
+ void configure(ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ ITensorInfo *dst,
+ const ConvolutionInfo &info);
+
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to CpuDepthwiseConv2dGeneric::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const ConvolutionInfo &info);
+
+ // Inherited methods overridden:
+ void run(ITensorPack &tensors) override;
+ void prepare(ITensorPack &tensors) override;
+
+ private:
+ std::unique_ptr<kernels::CpuDepthwiseConv2dNativeKernel> _depthwise_conv_kernel{nullptr};
+ std::unique_ptr<CpuPermute> _permute_input{nullptr};
+ std::unique_ptr<CpuPermute> _permute_weights{nullptr};
+ std::unique_ptr<CpuPermute> _permute_output{nullptr};
+ std::unique_ptr<CpuActivation> _activationlayer_function{nullptr};
+ bool _is_nchw{true};
+ bool _is_prepared{false};
+ bool _is_activationlayer_enabled{false};
+ };
+
+ DepthwiseConvolutionFunction _depth_conv_func{DepthwiseConvolutionFunction::GENERIC};
+ CpuDepthwiseConv2dOptimizedInternal _func_optimized{};
+ CpuDepthwiseConv2dGeneric _func_generic{};
+};
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_DEPTHWISE_CONV2D_H */
diff --git a/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp b/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp
new file mode 100644
index 0000000000..7fe9011da1
--- /dev/null
+++ b/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp
@@ -0,0 +1,149 @@
+/*
+ * Copyright (c) 2019-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.h"
+
+#include "arm_compute/core/ITensorInfo.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+#include "src/common/utils/Log.h"
+#include "src/core/CPP/Validate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/utils/AssemblyUtils.h"
+#include "src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+struct CpuDepthwiseConv2dAssemblyDispatch::LocalImpl
+{
+ std::unique_ptr<kernels::CpuDepthwiseConv2dAssemblyWrapperKernel> asm_kernel{nullptr};
+ bool is_prepared{false};
+ bool are_weights_const{true};
+ experimental::MemoryRequirements mem_req{};
+};
+
+#ifndef DOXYGEN_SKIP_THIS
+CpuDepthwiseConv2dAssemblyDispatch::CpuDepthwiseConv2dAssemblyDispatch() : _pImpl(std::make_unique<LocalImpl>())
+{
+}
+#endif /* DOXYGEN_SKIP_THIS */
+
+CpuDepthwiseConv2dAssemblyDispatch::~CpuDepthwiseConv2dAssemblyDispatch() = default;
+
+void CpuDepthwiseConv2dAssemblyDispatch::configure(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *bias,
+ ITensorInfo *dst,
+ const ConvolutionInfo &info)
+{
+ ARM_COMPUTE_LOG_PARAMS(src, weights, bias, dst, info);
+ const CPUInfo &ci = NEScheduler::get().cpu_info();
+ const unsigned int num_threads = NEScheduler::get().num_threads();
+ _pImpl->is_prepared = false;
+ _pImpl->are_weights_const = weights->are_values_constant();
+
+ // If we don't support a combination of data types, silently return: it is the caller's responsibility to check if configure() was successful via is_configured()
+ if (!CpuDepthwiseConv2dAssemblyDispatch::validate(src, weights, bias, dst, info))
+ {
+ return;
+ }
+
+ auto dwc_wrapper = std::make_unique<kernels::CpuDepthwiseConv2dAssemblyWrapperKernel>();
+ ARM_COMPUTE_ERROR_ON(dwc_wrapper == nullptr);
+ dwc_wrapper->configure(src, weights, bias, dst, info, ci);
+
+ // Compute memory requirements for assembly kernels
+ constexpr size_t alignment = 4096;
+ _pImpl->mem_req.push_back({TensorType::ACL_INT_0, dwc_wrapper->get_working_size(num_threads), alignment});
+ _pImpl->mem_req.push_back({TensorType::ACL_INT_1, dwc_wrapper->get_storage_size(), alignment});
+ _pImpl->asm_kernel = std::move(dwc_wrapper);
+}
+
+Status CpuDepthwiseConv2dAssemblyDispatch::validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *bias,
+ const ITensorInfo *dst,
+ const ConvolutionInfo &info)
+{
+ return kernels::CpuDepthwiseConv2dAssemblyWrapperKernel::validate(src, weights, bias, dst, info);
+}
+
+experimental::MemoryRequirements CpuDepthwiseConv2dAssemblyDispatch::workspace() const
+{
+ return _pImpl->mem_req;
+}
+
+bool CpuDepthwiseConv2dAssemblyDispatch::is_activation_supported(const ActivationLayerInfo &activation)
+{
+ arm_gemm::Activation act = assembly_utils::map_to_arm_gemm_activation(activation);
+ return act.type != arm_gemm::Activation::Type::None;
+}
+
+void CpuDepthwiseConv2dAssemblyDispatch::run(ITensorPack &tensors)
+{
+ ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided");
+
+ prepare(tensors);
+
+ // Split over rows (z) if there's more than 1, otherwise batches (w). This logic
+ // corresponds to the threading strategy in DepthFirstDriver::execute_internal
+ auto split_dimension = _pImpl->asm_kernel->window().num_iterations(Window::DimZ) != 1 ? Window::DimZ : Window::DimW;
+
+ NEScheduler::get().schedule_op(_pImpl->asm_kernel.get(), split_dimension, _pImpl->asm_kernel->window(), tensors);
+}
+
+void CpuDepthwiseConv2dAssemblyDispatch::prepare(ITensorPack &tensors)
+{
+ const ITensor *weights = tensors.get_const_tensor(TensorType::ACL_SRC_1);
+
+ if ((!_pImpl->are_weights_const && weights != nullptr) || !_pImpl->is_prepared)
+ {
+ // Pack weights and bias
+ const ITensor *bias = tensors.get_const_tensor(TensorType::ACL_SRC_2);
+ ITensor *storage = tensors.get_tensor(TensorType::ACL_INT_1);
+
+ const auto weights_ptr = weights->buffer() + weights->info()->offset_first_element_in_bytes();
+ const auto bias_ptr = (bias) ? bias->buffer() + bias->info()->offset_first_element_in_bytes() : nullptr;
+ auto parameters_ptr = storage->buffer() + storage->info()->offset_first_element_in_bytes();
+
+ const auto weights_shape = weights->info()->tensor_shape();
+ const auto weights_padding = weights->info()->padding();
+
+ const size_t ld_weights_col = weights_shape[0] + weights_padding.left + weights_padding.right;
+ const size_t ld_weights_row =
+ ld_weights_col * (weights_shape[1] + weights_padding.top + weights_padding.bottom);
+ _pImpl->asm_kernel->pack_parameters(parameters_ptr, bias_ptr, weights_ptr, ld_weights_col, ld_weights_row);
+
+ weights->mark_as_unused();
+ if (bias != nullptr)
+ {
+ bias->mark_as_unused();
+ }
+ _pImpl->is_prepared = true;
+ }
+}
+} // namespace cpu
+} // namespace arm_compute
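configure() above records two auxiliary buffers in workspace(): ACL_INT_0 for the per-thread working space and ACL_INT_1 for the packed parameters written by prepare(). The sketch below shows one way a caller could honour those requirements; the allocate_workspace helper is hypothetical, tensor setup is elided, and a production integration would also honour the 4096-byte alignment recorded in each MemoryInfo instead of relying on the default allocator:

    #include <memory>
    #include <vector>

    #include "arm_compute/core/ITensorPack.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/Tensor.h"
    #include "src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.h"

    using namespace arm_compute;

    // Hypothetical helper: binds the operator's workspace requirements to freshly
    // allocated U8 tensors and adds them to the pack under their advertised slots.
    std::vector<std::unique_ptr<Tensor>> allocate_workspace(const cpu::CpuDepthwiseConv2dAssemblyDispatch &op,
                                                            ITensorPack                                   &pack)
    {
        std::vector<std::unique_ptr<Tensor>> aux;
        for (const auto &req : op.workspace())
        {
            if (req.size == 0)
            {
                continue;
            }
            auto t = std::make_unique<Tensor>();
            t->allocator()->init(TensorInfo(TensorShape(req.size), 1, DataType::U8));
            t->allocator()->allocate();
            pack.add_tensor(req.slot, t.get()); // e.g. ACL_INT_0 (working space), ACL_INT_1 (packed weights)
            aux.emplace_back(std::move(t));
        }
        return aux;
    }

    // Usage outline (src/weights/bias/dst are assumed to be already allocated ITensors
    // matching the infos passed to configure()):
    //
    //   ITensorPack pack;
    //   pack.add_const_tensor(TensorType::ACL_SRC_0, &src);
    //   pack.add_const_tensor(TensorType::ACL_SRC_1, &weights);
    //   pack.add_const_tensor(TensorType::ACL_SRC_2, &bias);
    //   pack.add_tensor(TensorType::ACL_DST, &dst);
    //   auto aux = allocate_workspace(dwc_asm, pack);
    //   dwc_asm.prepare(pack); // packs weights/bias into ACL_INT_1 (once, unless weights are non-constant)
    //   dwc_asm.run(pack);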
diff --git a/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.h b/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.h
new file mode 100644
index 0000000000..f1816625d2
--- /dev/null
+++ b/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.h
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_DEPTHWISE_CONV2D_ASSEMBLY_DISPATCH_H
+#define ARM_COMPUTE_CPU_DEPTHWISE_CONV2D_ASSEMBLY_DISPATCH_H
+
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+
+#include "src/core/common/Macros.h"
+#include "src/cpu/ICpuOperator.h"
+
+namespace arm_compute
+{
+struct ConvolutionInfo;
+
+namespace cpu
+{
+/** Depthwise convolution assembly kernel glue */
+class CpuDepthwiseConv2dAssemblyDispatch : public ICpuOperator
+{
+public:
+ CpuDepthwiseConv2dAssemblyDispatch();
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuDepthwiseConv2dAssemblyDispatch);
+ ~CpuDepthwiseConv2dAssemblyDispatch();
+    /** Initialize the function's source, destination, weights and convolution information.
+ *
+ * @note Supports only NHWC format
+ *
+ * @param[in] src Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+ * @param[in] weights Weights tensor info. These are 3D tensors with shape [W, H, IFM].
+ * Data type supported: same as @p src or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p src is QASYMM8/QASYMM8_SIGNED.
+ * @param[in] bias (Optional) Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
+ * Data type supported: same as @p src or S32 if @p src is quantized.
+ * @param[out] dst Destination tensor info. Data type supported: same as @p src.
+ * @param[in] info Depthwise convolution meta-data.
+ */
+ void configure(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *bias,
+ ITensorInfo *dst,
+ const ConvolutionInfo &info);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to CpuDepthwiseConv2dAssemblyDispatch::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *bias,
+ const ITensorInfo *dst,
+ const ConvolutionInfo &info);
+ /** Checks if activation is supported by the assembly kernels
+ *
+ * @param[in] activation Activation to check
+ *
+ * @return True if activation is supported else false
+ */
+ static bool is_activation_supported(const ActivationLayerInfo &activation);
+
+ // Inherited methods overridden:
+ void run(ITensorPack &tensors) override;
+ void prepare(ITensorPack &tensors) override;
+ experimental::MemoryRequirements workspace() const override;
+
+private:
+ struct LocalImpl;
+ std::unique_ptr<LocalImpl> _pImpl;
+};
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_DEPTHWISE_CONV2D_ASSEMBLY_DISPATCH_H */
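is_activation_supported() is what lets callers choose between fusing the activation into the assembly kernel and appending a separate CpuActivation stage, mirroring the checks in CpuDepthwiseConv2d. A condensed sketch of that decision (the helper name is illustrative):

    #include "arm_compute/function_info/ActivationLayerInfo.h"
    #include "src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.h"

    using namespace arm_compute;

    // Returns true when a separate CpuActivation stage is needed because the assembly
    // kernels cannot fuse the requested activation (same test used in CpuDepthwiseConv2d).
    bool needs_separate_activation(const ActivationLayerInfo &act_info)
    {
        return act_info.enabled() && !cpu::CpuDepthwiseConv2dAssemblyDispatch::is_activation_supported(act_info);
    }

When this returns true, the depthwise operator configures a CpuActivation on the destination tensor, exactly as CpuDepthwiseConv2dOptimizedInternal::configure() does.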
diff --git a/src/cpu/operators/CpuDequantize.cpp b/src/cpu/operators/CpuDequantize.cpp
new file mode 100644
index 0000000000..c05a23f3a7
--- /dev/null
+++ b/src/cpu/operators/CpuDequantize.cpp
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/operators/CpuDequantize.h"
+
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+#include "src/common/utils/Log.h"
+#include "src/cpu/kernels/CpuDequantizeKernel.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void CpuDequantize::configure(const ITensorInfo *src, ITensorInfo *dst)
+{
+ ARM_COMPUTE_LOG_PARAMS(src, dst);
+ auto k = std::make_unique<kernels::CpuDequantizeKernel>();
+ k->configure(src, dst);
+ _kernel = std::move(k);
+}
+
+Status CpuDequantize::validate(const ITensorInfo *src, const ITensorInfo *dst)
+{
+ return kernels::CpuDequantizeKernel::validate(src, dst);
+}
+
+void CpuDequantize::run(ITensorPack &tensors)
+{
+ ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided");
+ prepare(tensors);
+ NEScheduler::get().schedule_op(_kernel.get(), Window::DimY, _kernel->window(), tensors);
+}
+} // namespace cpu
+} // namespace arm_compute
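Because the operator is a thin wrapper around the kernel, an end-to-end sketch stays short: describe the quantized source and the F32 destination, then bind real tensors through an ITensorPack. The 0.5/10 quantization parameters and the assumption that the kernel uses the ACL_SRC/ACL_DST slots are illustrative:

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/Tensor.h"
    #include "src/cpu/operators/CpuDequantize.h"

    using namespace arm_compute;

    void example_dequantize()
    {
        // QASYMM8 source with scale 0.5 and zero-point 10, dequantized to F32.
        TensorInfo src_info(TensorShape(16U, 16U), 1, DataType::QASYMM8, QuantizationInfo(0.5f, 10));
        TensorInfo dst_info(TensorShape(16U, 16U), 1, DataType::F32);

        if (!bool(cpu::CpuDequantize::validate(&src_info, &dst_info)))
        {
            return;
        }

        cpu::CpuDequantize dequantize;
        dequantize.configure(&src_info, &dst_info);

        Tensor src, dst;
        src.allocator()->init(src_info);
        dst.allocator()->init(dst_info);
        src.allocator()->allocate();
        dst.allocator()->allocate();
        // ... fill src with quantized values ...

        ITensorPack pack;
        pack.add_const_tensor(TensorType::ACL_SRC, &src);
        pack.add_tensor(TensorType::ACL_DST, &dst);
        dequantize.run(pack);
    }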
diff --git a/src/cpu/operators/CpuDequantize.h b/src/cpu/operators/CpuDequantize.h
new file mode 100644
index 0000000000..dbfc0c612a
--- /dev/null
+++ b/src/cpu/operators/CpuDequantize.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_DEQUANTIZE_H
+#define ARM_COMPUTE_CPU_DEQUANTIZE_H
+
+#include "src/cpu/ICpuOperator.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+/** Basic function to run @ref kernels::CpuDequantizeKernel that dequantizes an input tensor */
+class CpuDequantize : public ICpuOperator
+{
+public:
+ /** Configure the kernel.
+ *
+ * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/QSYMM8/QSYMM16.
+     * @param[out] dst Destination tensor info with the same dimensions as @p src. Data type supported: F16/F32.
+ */
+ void configure(const ITensorInfo *src, ITensorInfo *dst);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref CpuDequantize::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
+
+ // Inherited methods overridden:
+ void run(ITensorPack &tensors) override;
+};
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_DEQUANTIZE_H */
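
The dequantize operator above is the simplest instance of the pattern every operator in this patch follows: a stateless configure()/validate() pair that works purely on ITensorInfo, and a run() that receives the actual tensors through an ITensorPack. Below is a minimal usage sketch, not part of the patch: the shapes and quantization parameters are made up, the ACL_SRC/ACL_DST pack slots are assumed from the usual single-input kernel convention, and in practice this operator is normally driven through the public NEDequantizationLayer runtime function rather than called directly.

#include "arm_compute/core/Error.h"
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/Tensor.h"

#include "src/cpu/operators/CpuDequantize.h"

using namespace arm_compute;

void dequantize_sketch()
{
    // Hypothetical 1D tensor of 16 QASYMM8 values with scale 0.5 and offset 10
    TensorInfo src_info(TensorShape(16U), 1, DataType::QASYMM8, QuantizationInfo(0.5f, 10));
    TensorInfo dst_info(TensorShape(16U), 1, DataType::F32);

    if (cpu::CpuDequantize::validate(&src_info, &dst_info).error_code() != ErrorCode::OK)
    {
        return;
    }
    cpu::CpuDequantize dequantize;
    dequantize.configure(&src_info, &dst_info);

    // The caller owns the backing memory and hands it over on every run() call
    Tensor src;
    Tensor dst;
    src.allocator()->init(src_info);
    dst.allocator()->init(dst_info);
    src.allocator()->allocate();
    dst.allocator()->allocate();

    ITensorPack pack;
    pack.add_const_tensor(TensorType::ACL_SRC, &src);
    pack.add_tensor(TensorType::ACL_DST, &dst);
    dequantize.run(pack);
}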
diff --git a/src/cpu/operators/CpuDirectConv2d.cpp b/src/cpu/operators/CpuDirectConv2d.cpp
new file mode 100644
index 0000000000..135a3bb2b9
--- /dev/null
+++ b/src/cpu/operators/CpuDirectConv2d.cpp
@@ -0,0 +1,170 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/operators/CpuDirectConv2d.h"
+
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+#include "src/common/utils/Log.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+CpuDirectConv2d::~CpuDirectConv2d() = default;
+
+CpuDirectConv2d::CpuDirectConv2d(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)),
+ _output_stage_kernel(),
+ _conv_kernel(),
+ _input_border_handler(),
+ _activationlayer_function(),
+ _accumulator(),
+ _has_bias(false),
+ _is_activationlayer_enabled(false),
+ _dim_split(Window::DimZ),
+ _is_padding_required()
+{
+}
+
+void CpuDirectConv2d::configure(ITensorInfo *src,
+ ITensorInfo *weights,
+ const ITensorInfo *bias,
+ ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info)
+{
+ ARM_COMPUTE_ERROR_ON(src->data_layout() == DataLayout::UNKNOWN);
+ ARM_COMPUTE_LOG_PARAMS(src, weights, bias, dst, conv_info, act_info);
+
+ _output_stage_kernel = std::make_unique<kernels::CpuDirectConv2dOutputStageKernel>();
+ _conv_kernel = std::make_unique<kernels::CpuDirectConv2dKernel>();
+ _input_border_handler = std::make_unique<NEFillBorderKernel>();
+
+ // Free accumulator
+ if (_accumulator.buffer() != nullptr)
+ {
+ _accumulator.allocator()->free();
+ }
+
+ _dim_split = src->data_layout() == DataLayout::NCHW ? Window::DimZ : Window::DimY;
+
+ // Check if bias should be added in the convolution result
+ _has_bias = (bias != nullptr);
+
+ _conv_kernel->configure(src, weights, dst, conv_info);
+ if (_has_bias)
+ {
+ _output_stage_kernel->configure(dst, bias);
+ }
+ _is_padding_required = !_conv_kernel->border_size().empty();
+
+ if (_is_padding_required)
+ {
+ // Add zero padding XY
+ _input_border_handler->configure(src, _conv_kernel->border_size(), BorderMode::CONSTANT,
+ PixelValue(static_cast<float>(0.f)));
+ }
+
+    // Configure Activation Layer
+ _is_activationlayer_enabled = act_info.enabled();
+ if (_is_activationlayer_enabled)
+ {
+ _activationlayer_function = std::make_unique<CpuActivation>();
+ _activationlayer_function->configure(dst, dst, act_info);
+ }
+}
+
+Status CpuDirectConv2d::validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *bias,
+ const ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst);
+
+ // output might not be initialized since it can be an intermediate tensor of another layer
+ DataType data_type = src->data_type();
+ TensorInfo accumulator(dst->clone()->set_is_resizable(true).reset_padding().set_data_type(data_type));
+
+ // Validate Convolution kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuDirectConv2dKernel::validate(src, weights, &accumulator, conv_info));
+
+ if (bias != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, bias);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(bias->dimension(0) != weights->dimension(3),
+ "Biases size and number of input feature maps should match");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(bias->num_dimensions() > 1, "Biases should be one dimensional");
+ }
+
+ // Validate bias kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuDirectConv2dOutputStageKernel::validate(&accumulator, bias, dst));
+
+ if (act_info.enabled())
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(CpuActivation::validate(dst, nullptr, act_info));
+ }
+
+ return Status{};
+}
+
+void CpuDirectConv2d::run(ITensorPack &tensors)
+{
+ MemoryGroupResourceScope scope_mg(_memory_group);
+
+ auto src = tensors.get_tensor(TensorType::ACL_SRC_0);
+ auto bias = tensors.get_const_tensor(TensorType::ACL_SRC_2);
+ auto dst = tensors.get_tensor(TensorType::ACL_DST);
+
+ if (_is_padding_required)
+ {
+ ITensorPack pack;
+ pack.add_tensor(TensorType::ACL_SRC_DST, src);
+ NEScheduler::get().schedule_op(_input_border_handler.get(), Window::DimZ, _input_border_handler->window(),
+ pack);
+ }
+ NEScheduler::get().schedule_op(_conv_kernel.get(), _dim_split, _conv_kernel->window(), tensors);
+ if (_has_bias)
+ {
+ ITensorPack pack;
+ pack.add_tensor(TensorType::ACL_SRC_0, dst);
+ pack.add_tensor(TensorType::ACL_SRC_1, bias);
+ pack.add_tensor(TensorType::ACL_DST, dst);
+ NEScheduler::get().schedule_op(_output_stage_kernel.get(), Window::DimY, _output_stage_kernel->window(), pack);
+ }
+
+ if (_is_activationlayer_enabled)
+ {
+ ITensorPack pack;
+ pack.add_tensor(TensorType::ACL_SRC, dst);
+ pack.add_tensor(TensorType::ACL_DST, dst);
+ _activationlayer_function->run(pack);
+ }
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/operators/CpuDirectConv2d.h b/src/cpu/operators/CpuDirectConv2d.h
new file mode 100644
index 0000000000..73c85f2dcd
--- /dev/null
+++ b/src/cpu/operators/CpuDirectConv2d.h
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_DIRECTCONV2D_H
+#define ARM_COMPUTE_CPU_DIRECTCONV2D_H
+
+#include "arm_compute/core/experimental/Types.h"
+#include "arm_compute/core/ITensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IMemoryManager.h"
+#include "arm_compute/runtime/MemoryGroup.h"
+#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
+#include "arm_compute/runtime/Tensor.h"
+
+#include "src/core/NEON/kernels/NEFillBorderKernel.h"
+#include "src/cpu/ICpuKernel.h"
+#include "src/cpu/ICpuOperator.h"
+#include "src/cpu/kernels/CpuDirectConv2dKernel.h"
+#include "src/cpu/kernels/CpuDirectConv2dOutputStageKernel.h"
+#include "src/cpu/operators/CpuActivation.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace cpu
+{
+/** Function to run the direct convolution.
+ *
+ * This function calls the following kernels:
+ *
+ * -# @ref NEFillBorderKernel for the input
+ * -# @ref kernels::CpuDirectConv2dOutputStageKernel
+ * -# @ref kernels::CpuDirectConv2dKernel
+ */
+class CpuDirectConv2d : public ICpuOperator
+{
+public:
+ CpuDirectConv2d(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+ ~CpuDirectConv2d();
+ /** Set the input, weights, biases and output tensors.
+ *
+     * @note DirectConvolution only works in the following configurations:
+     *       1x1 convolution with stride_x = 1/2/3, stride_y = 1/2/3, data type = F16/F32
+     *       3x3 convolution with stride_x = 1/2/3, stride_y = 1/2/3, data type = F16/F32
+     *       5x5 convolution with stride_x = 1/2/3, stride_y = 1/2/3, data type = F32
+ *
+ * @param[in, out] src Input tensor info. Data types supported: F16/F32.
+ * @param[in] weights Set of kernels to convolve the input volume.
+ * Supported sizes: 1x1, 3x3 and 5x5.
+ * The 3rd dimension must be the same as the input's volume 3rd dimension.
+ * Data type supported: Same as @p src.
+ * @param[in] bias Set of biases. Can be nullptr. Data type supported: Same as @p src.
+ * @param[out] dst Output tensor info.
+     *                           The 3rd dimension must be equal to the 4th dimension of the @p weights tensor. Data types supported: Same as @p src.
+ * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
+ * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
+ */
+ void configure(ITensorInfo *src,
+ ITensorInfo *weights,
+ const ITensorInfo *bias,
+ ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to CpuDirectConv2d::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *bias,
+ const ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+ // Inherited methods overridden:
+ void run(ITensorPack &tensors) override;
+
+private:
+ MemoryGroup _memory_group;
+ std::unique_ptr<kernels::CpuDirectConv2dOutputStageKernel> _output_stage_kernel;
+ std::unique_ptr<kernels::CpuDirectConv2dKernel> _conv_kernel;
+ std::unique_ptr<NEFillBorderKernel> _input_border_handler;
+ std::unique_ptr<CpuActivation> _activationlayer_function;
+ Tensor _accumulator;
+ bool _has_bias{false};
+ bool _is_activationlayer_enabled{false};
+ unsigned int _dim_split{0};
+ bool _is_padding_required{false};
+};
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_DIRECTCONV2D_H */
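
The run() method above shows the pack layout this operator expects at execution time: ACL_SRC_0 for the input, ACL_SRC_2 for the optional bias and ACL_DST for the output, with the convolution kernel itself scheduled on the full pack. A configuration sketch with made-up NCHW shapes follows; it reuses the includes and caller-owned tensor handling from the dequantize sketch above plus arm_compute/core/Types.h and src/cpu/operators/CpuDirectConv2d.h, and the ACL_SRC_1 slot for the weights is an assumption taken from the convolution kernel's convention rather than something visible in this file.

void direct_conv2d_sketch()
{
    // 32x32x16 F32 input, eight 3x3 kernels, unit stride, no padding, fused ReLU
    TensorInfo src_info(TensorShape(32U, 32U, 16U), 1, DataType::F32);
    TensorInfo wei_info(TensorShape(3U, 3U, 16U, 8U), 1, DataType::F32); // [kW, kH, IFM, num_kernels]
    TensorInfo bia_info(TensorShape(8U), 1, DataType::F32);
    TensorInfo dst_info(TensorShape(30U, 30U, 8U), 1, DataType::F32);
    src_info.set_data_layout(DataLayout::NCHW); // configure() rejects DataLayout::UNKNOWN

    const PadStrideInfo       conv_info(1, 1, 0, 0);
    const ActivationLayerInfo act(ActivationLayerInfo::ActivationFunction::RELU);
    if (cpu::CpuDirectConv2d::validate(&src_info, &wei_info, &bia_info, &dst_info, conv_info, act).error_code() !=
        ErrorCode::OK)
    {
        return; // illustrative shapes only
    }
    cpu::CpuDirectConv2d conv;
    conv.configure(&src_info, &wei_info, &bia_info, &dst_info, conv_info, act);

    Tensor src, weights, bias, dst;
    src.allocator()->init(src_info);
    weights.allocator()->init(wei_info);
    bias.allocator()->init(bia_info);
    dst.allocator()->init(dst_info);
    src.allocator()->allocate();
    weights.allocator()->allocate();
    bias.allocator()->allocate();
    dst.allocator()->allocate();

    ITensorPack pack;
    pack.add_tensor(TensorType::ACL_SRC_0, &src);
    pack.add_tensor(TensorType::ACL_SRC_1, &weights); // slot assumed for the weights
    pack.add_const_tensor(TensorType::ACL_SRC_2, &bias);
    pack.add_tensor(TensorType::ACL_DST, &dst);
    conv.run(pack);
}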
diff --git a/src/cpu/operators/CpuDirectConv3d.cpp b/src/cpu/operators/CpuDirectConv3d.cpp
new file mode 100644
index 0000000000..626f1c6775
--- /dev/null
+++ b/src/cpu/operators/CpuDirectConv3d.cpp
@@ -0,0 +1,112 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/operators/CpuDirectConv3d.h"
+
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+#include "src/common/utils/Log.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+CpuDirectConv3d::~CpuDirectConv3d() = default;
+
+CpuDirectConv3d::CpuDirectConv3d(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)),
+ _conv_kernel(),
+ _activationlayer_function(),
+ _accumulator(),
+ _is_activationlayer_enabled(false),
+ _dim_split(Window::DimZ)
+{
+}
+
+void CpuDirectConv3d::configure(
+ ITensorInfo *src0, ITensorInfo *src1, const ITensorInfo *src2, ITensorInfo *dst, const Conv3dInfo conv_info)
+{
+ ARM_COMPUTE_LOG_PARAMS(src0, src1, src2, dst, conv_info);
+ ARM_COMPUTE_ERROR_ON(src0->data_layout() != DataLayout::NDHWC);
+
+ _conv_kernel = std::make_unique<kernels::CpuDirectConv3dKernel>();
+
+ // Free accumulator
+ if (_accumulator.buffer() != nullptr)
+ {
+ _accumulator.allocator()->free();
+ }
+
+ _dim_split = Window::DimY;
+
+ _conv_kernel->configure(src0, src1, src2, dst, conv_info);
+
+    // Configure Activation Layer
+ _is_activationlayer_enabled = conv_info.act_info.enabled();
+ if (_is_activationlayer_enabled)
+ {
+ _activationlayer_function = std::make_unique<CpuActivation>();
+ _activationlayer_function->configure(dst, dst, conv_info.act_info);
+ }
+}
+
+Status CpuDirectConv3d::validate(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ const Conv3dInfo conv_info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst);
+
+ // Validate Convolution kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuDirectConv3dKernel::validate(src0, src1, src2, dst, conv_info));
+
+ if (conv_info.act_info.enabled())
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(CpuActivation::validate(dst, nullptr, conv_info.act_info));
+ }
+
+ return Status{};
+}
+
+void CpuDirectConv3d::run(ITensorPack &tensors)
+{
+ MemoryGroupResourceScope scope_mg(_memory_group);
+
+ auto dst = tensors.get_tensor(TensorType::ACL_DST);
+
+ NEScheduler::get().schedule_op(_conv_kernel.get(), _dim_split, _conv_kernel->window(), tensors);
+
+ if (_is_activationlayer_enabled)
+ {
+ ITensorPack pack;
+ pack.add_tensor(TensorType::ACL_SRC, dst);
+ pack.add_tensor(TensorType::ACL_DST, dst);
+ _activationlayer_function->run(pack);
+ }
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/operators/CpuDirectConv3d.h b/src/cpu/operators/CpuDirectConv3d.h
new file mode 100644
index 0000000000..3ad1e09a14
--- /dev/null
+++ b/src/cpu/operators/CpuDirectConv3d.h
@@ -0,0 +1,107 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_DIRECTCONV3D_H
+#define ARM_COMPUTE_CPU_DIRECTCONV3D_H
+
+#include "arm_compute/core/experimental/Types.h"
+#include "arm_compute/core/ITensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/FunctionDescriptors.h"
+#include "arm_compute/runtime/IMemoryManager.h"
+#include "arm_compute/runtime/MemoryGroup.h"
+#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
+#include "arm_compute/runtime/Tensor.h"
+
+#include "src/core/NEON/kernels/NEFillBorderKernel.h"
+#include "src/cpu/ICpuKernel.h"
+#include "src/cpu/ICpuOperator.h"
+#include "src/cpu/kernels/CpuDirectConv3dKernel.h"
+#include "src/cpu/operators/CpuActivation.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace cpu
+{
+/** Function to run the direct convolution.
+ *
+ * This function calls the following kernels:
+ *
+ * -# @ref kernels::CpuDirectConv3dKernel
+ */
+class CpuDirectConv3d : public ICpuOperator
+{
+public:
+ CpuDirectConv3d(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+ ~CpuDirectConv3d();
+ /** Set the input, weights, biases and output tensor info.
+ *
+ * Valid data layouts:
+ * - NDHWC
+ *
+ * Valid data type configurations:
+ * |src0 |src1 |src2 |dst |
+ * |:--------------|:------------------|:------|:--------------|
+ * |F16 |F16 |F16 |F16 |
+ * |F32 |F32 |F32 |F32 |
+ * |QASYMM8 |QASYMM8 |S32 |QASYMM8 |
+ * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |QASYMM8_SIGNED |
+ *
+ * @param[in, out] src0 Input tensor info.
+ * @param[in] src1 Set of kernels to convolve the input volume.
+ * The 2nd dimension must be the same as the src0's volume 1st dimension.
+ * @param[in] src2 Set of biases. Can be nullptr.
+ * @param[out] dst Output tensor info.
+     *                           The 1st dimension must be equal to the 1st dimension of the @p src1 tensor.
+     * @param[in]      conv_info Contains padding, stride and activation information.
+ */
+ void configure(
+ ITensorInfo *src0, ITensorInfo *src1, const ITensorInfo *src2, ITensorInfo *dst, const Conv3dInfo conv_info);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to CpuDirectConv3d::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ const Conv3dInfo conv_info);
+
+ // Inherited methods overridden:
+ void run(ITensorPack &tensors) override;
+
+private:
+ MemoryGroup _memory_group;
+ std::unique_ptr<kernels::CpuDirectConv3dKernel> _conv_kernel;
+ std::unique_ptr<CpuActivation> _activationlayer_function;
+ Tensor _accumulator;
+ bool _is_activationlayer_enabled{false};
+ unsigned int _dim_split{0};
+};
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_DIRECTCONV3D_H */
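
The 3D variant mirrors the 2D one but drops the border handler and bias stage: everything goes through a single CpuDirectConv3dKernel, with the fused activation read out of Conv3dInfo::act_info as configure() above shows. A short sketch with made-up NDHWC shapes, under the same setup as the earlier sketches and assuming Conv3dInfo can be default-constructed with unit strides and zero padding (only its public act_info member is exercised here, which matches how the sources above use it):

void direct_conv3d_sketch()
{
    // NDHWC input [C = 8, W = 16, H = 16, D = 4, N = 1] and four hypothetical 3x3x3 kernels
    TensorInfo src_info(TensorShape(8U, 16U, 16U, 4U, 1U), 1, DataType::F32);
    src_info.set_data_layout(DataLayout::NDHWC); // configure() asserts NDHWC
    TensorInfo wei_info(TensorShape(4U, 8U, 3U, 3U, 3U), 1, DataType::F32); // dim 0 = num kernels, dim 1 = src channels
    TensorInfo dst_info(TensorShape(4U, 14U, 14U, 2U, 1U), 1, DataType::F32);

    Conv3dInfo conv_info{}; // assumed default-constructible: unit stride, no padding
    conv_info.act_info = ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU); // fused activation

    if (cpu::CpuDirectConv3d::validate(&src_info, &wei_info, nullptr, &dst_info, conv_info).error_code() !=
        ErrorCode::OK)
    {
        return; // illustrative shapes only
    }
    cpu::CpuDirectConv3d conv;
    conv.configure(&src_info, &wei_info, nullptr, &dst_info, conv_info);
}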
diff --git a/src/cpu/operators/CpuElementwise.cpp b/src/cpu/operators/CpuElementwise.cpp
new file mode 100644
index 0000000000..c2ae8773c6
--- /dev/null
+++ b/src/cpu/operators/CpuElementwise.cpp
@@ -0,0 +1,138 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/operators/CpuElementwise.h"
+
+#include "src/common/utils/Log.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/cpu/kernels/CpuElementwiseKernel.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void CpuElementwiseBase::run(ITensorPack &tensors)
+{
+ // If the kernel has been configured, use the window from the kernel.
+ if (_kernel->is_window_configured())
+ {
+ ICpuOperator::run(tensors);
+ return;
+ }
+
+ auto src0_info = tensors.get_const_tensor(TensorType::ACL_SRC_0)->info();
+ auto src1_info = tensors.get_const_tensor(TensorType::ACL_SRC_1)->info();
+ auto shape_and_window = compute_output_shape_and_window(src0_info->tensor_shape(), src1_info->tensor_shape());
+ ICpuOperator::run(tensors, shape_and_window.second);
+}
+
+template <ArithmeticOperation op>
+void CpuElementwiseArithmetic<op>::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst)
+{
+ ARM_COMPUTE_LOG_PARAMS(src0, src1, dst);
+ auto k = std::make_unique<kernels::CpuArithmeticKernel>();
+ k->configure(op, src0, src1, dst);
+ _kernel = std::move(k);
+}
+
+template <ArithmeticOperation op>
+Status CpuElementwiseArithmetic<op>::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst)
+{
+ return kernels::CpuArithmeticKernel::validate(op, src0, src1, dst);
+}
+
+template class CpuElementwiseArithmetic<ArithmeticOperation::MAX>;
+template class CpuElementwiseArithmetic<ArithmeticOperation::MIN>;
+template class CpuElementwiseArithmetic<ArithmeticOperation::SQUARED_DIFF>;
+template class CpuElementwiseArithmetic<ArithmeticOperation::PRELU>;
+
+void CpuElementwiseDivision::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst)
+{
+ ARM_COMPUTE_LOG_PARAMS(src0, src1, dst);
+ auto k = std::make_unique<kernels::CpuDivisionKernel>();
+ k->configure(src0, src1, dst);
+ _kernel = std::move(k);
+}
+
+Status CpuElementwiseDivision::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst)
+{
+ return kernels::CpuDivisionKernel::validate(src0, src1, dst);
+}
+
+void CpuElementwisePower::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst)
+{
+ ARM_COMPUTE_LOG_PARAMS(src0, src1, dst);
+ auto k = std::make_unique<kernels::CpuPowerKernel>();
+ k->configure(src0, src1, dst);
+ _kernel = std::move(k);
+}
+
+Status CpuElementwisePower::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst)
+{
+ return kernels::CpuPowerKernel::validate(src0, src1, dst);
+}
+
+template <ComparisonOperation COP>
+void CpuElementwiseComparisonStatic<COP>::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst)
+{
+ ARM_COMPUTE_LOG_PARAMS(src0, src1, dst);
+ auto k = std::make_unique<kernels::CpuComparisonKernel>();
+ k->configure(COP, src0, src1, dst);
+ _kernel = std::move(k);
+}
+
+template <ComparisonOperation COP>
+Status
+CpuElementwiseComparisonStatic<COP>::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst)
+{
+ return kernels::CpuComparisonKernel::validate(COP, src0, src1, dst);
+}
+
+void CpuElementwiseComparison::configure(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ ITensorInfo *dst,
+ ComparisonOperation op)
+{
+ ARM_COMPUTE_LOG_PARAMS(src0, src1, dst);
+ auto k = std::make_unique<kernels::CpuComparisonKernel>();
+ k->configure(op, src0, src1, dst);
+ _kernel = std::move(k);
+}
+
+Status CpuElementwiseComparison::validate(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *dst,
+ ComparisonOperation op)
+{
+ return kernels::CpuComparisonKernel::validate(op, src0, src1, dst);
+}
+
+// Supported Specializations
+template class CpuElementwiseComparisonStatic<ComparisonOperation::Equal>;
+template class CpuElementwiseComparisonStatic<ComparisonOperation::NotEqual>;
+template class CpuElementwiseComparisonStatic<ComparisonOperation::Greater>;
+template class CpuElementwiseComparisonStatic<ComparisonOperation::GreaterEqual>;
+template class CpuElementwiseComparisonStatic<ComparisonOperation::Less>;
+template class CpuElementwiseComparisonStatic<ComparisonOperation::LessEqual>;
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/operators/CpuElementwise.h b/src/cpu/operators/CpuElementwise.h
new file mode 100644
index 0000000000..5db53c8026
--- /dev/null
+++ b/src/cpu/operators/CpuElementwise.h
@@ -0,0 +1,186 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_ELEMENTWISE_H
+#define ARM_COMPUTE_CPU_ELEMENTWISE_H
+
+#include "src/cpu/ICpuOperator.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+class CpuElementwiseBase : public ICpuOperator
+{
+public:
+ // Inherited methods overridden:
+ void run(ITensorPack &tensors) override;
+};
+/** Class to run @ref cpu::kernels::CpuArithmeticKernel for all arithmetic operations except division and power
+ *
+ * @note Max/Min/Squared difference support input data types of QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32
+ * @note PRelu supports input data types of QASYMM8/QASYMM8_SIGNED/F16/F32.
+ */
+template <ArithmeticOperation op>
+class CpuElementwiseArithmetic : public CpuElementwiseBase
+{
+public:
+ /** Configure the operator
+ *
+ * @param[in] src0 The first source tensor information.
+ * @param[in] src1 The second source tensor information. With PRelu, this is used as alpha tensor.
+ * @param[out] dst The output tensor information.
+ */
+ void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref CpuElementwiseArithmetic::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst);
+};
+
+/** Class to run @ref cpu::kernels::CpuArithmeticKernel for the maximum operation */
+using CpuElementwiseMax = CpuElementwiseArithmetic<ArithmeticOperation::MAX>;
+/** Class to run @ref cpu::kernels::CpuArithmeticKernel for the minimum operation */
+using CpuElementwiseMin = CpuElementwiseArithmetic<ArithmeticOperation::MIN>;
+/** Class to run @ref cpu::kernels::CpuArithmeticKernel for the squared difference operation */
+using CpuElementwiseSquaredDiff = CpuElementwiseArithmetic<ArithmeticOperation::SQUARED_DIFF>;
+
+/** Basic function to run @ref cpu::kernels::CpuArithmeticKernel for division
+ *
+ * @note The tensor data type for the inputs must be S32/F16/F32.
+ * @note The function performs a division operation between two tensors (i.e., out[i] = in1[i] / in2[i])
+ */
+class CpuElementwiseDivision : public CpuElementwiseBase
+{
+public:
+ /** Initialise the kernel's inputs, dst and conversion policy.
+ *
+ * @param[in, out] src0 First tensor input info. Data types supported: S32/F16/F32.
+ * @param[in, out] src1 Second tensor input info. Data types supported: Same as @p src0.
+ * @param[out] dst Output tensor info. Data types supported: Same as @p src0.
+ */
+ void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref CpuElementwiseDivision::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst);
+};
+
+/** Basic function to run @ref cpu::kernels::CpuArithmeticKernel for power
+ *
+ * @note The tensor data type for the inputs must be F16/F32.
+ * @note The function performs an elementwise power of in1 to in2 (i.e., out[i] = in1[i] ^ in2[i])
+ * @note For an exponent that is a float, this function will only work with a positive base.
+ */
+class CpuElementwisePower : public CpuElementwiseBase
+{
+public:
+ /** Initialise the kernel's inputs, dst and conversion policy.
+ *
+ * @param[in, out] src0 First tensor input info. Data types supported: F16/F32.
+ * @param[in, out] src1 Second tensor input info. Data types supported: Same as @p src0.
+ * @param[out] dst Output tensor info. Data types supported: Same as @p src0.
+ */
+ void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref CpuElementwisePower::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst);
+};
+
+/** Basic function to run @ref cpu::kernels::CpuComparisonKernel.
+ *
+ * @note The tensor data type for the inputs must be QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
+ * @note The function performs a comparison operation between two tensors.
+ */
+class CpuElementwiseComparison : public CpuElementwiseBase
+{
+public:
+ /** Initialise the kernel's inputs, dst and conversion policy.
+ *
+ * @param[in, out] src0 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
+ * @param[in, out] src1 Second tensor input info. Data types supported: Same as @p src0.
+ * @param[out] dst Output tensor info. Data types supported: U16/U32.
+ * @param[in] op Comparison Operation to be performed.
+ */
+ void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, ComparisonOperation op);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref CpuElementwiseComparison::configure()
+ *
+ * @return a status
+ */
+ static Status
+ validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ComparisonOperation op);
+};
+
+/** Basic function to run @ref cpu::kernels::CpuComparisonKernel
+ *
+ * @note The tensor data type for the inputs must be QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
+ * @note The function performs a comparison operation between two tensors.
+ */
+template <ComparisonOperation op>
+class CpuElementwiseComparisonStatic : public CpuElementwiseBase
+{
+public:
+ /** Initialise the kernel's inputs, dst and conversion policy.
+ *
+ * @param[in, out] src0 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
+ * @param[in, out] src1 Second tensor input info. Data types supported: Same as @p src0.
+ * @param[out] dst Output tensor info. Data types supported: U16/U32.
+ */
+ void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref CpuElementwiseComparisonStatic::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst);
+};
+
+/** Basic function to run equal comparison. */
+using NEEqual = CpuElementwiseComparisonStatic<ComparisonOperation::Equal>;
+/** Basic function to run not equal comparison. */
+using NENotEqual = CpuElementwiseComparisonStatic<ComparisonOperation::NotEqual>;
+/** Basic function to run greater comparison. */
+using NEGreater = CpuElementwiseComparisonStatic<ComparisonOperation::Greater>;
+/** Basic function to run greater-equal comparison. */
+using NEGreaterEqual = CpuElementwiseComparisonStatic<ComparisonOperation::GreaterEqual>;
+/** Basic function to run less comparison. */
+using NELess = CpuElementwiseComparisonStatic<ComparisonOperation::Less>;
+/** Basic function to run less-equal comparison. */
+using NELessEqual = CpuElementwiseComparisonStatic<ComparisonOperation::LessEqual>;
+} // namespace cpu
+} // namespace arm_compute
+
+#endif /* ARM_COMPUTE_CPU_ELEMENTWISE_H */
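
Taken together, the header above exposes one template for the plain arithmetic operations (with CpuElementwiseMax/Min/SquaredDiff and the PRELU instantiation as its public faces), dedicated classes for division and power because those map to different kernels, and two comparison flavours with the operation either baked in as a template parameter or passed at configure() time. A brief configuration sketch, reusing the includes and tensor handling from the earlier sketches plus src/cpu/operators/CpuElementwise.h:

void elementwise_sketch()
{
    TensorInfo a_info(TensorShape(64U, 8U), 1, DataType::F32);
    TensorInfo b_info(TensorShape(64U, 8U), 1, DataType::F32);
    TensorInfo out_info(TensorShape(64U, 8U), 1, DataType::F32);

    // Element-wise maximum through the template alias
    cpu::CpuElementwiseMax max_op; // CpuElementwiseArithmetic<ArithmeticOperation::MAX>
    if (cpu::CpuElementwiseMax::validate(&a_info, &b_info, &out_info).error_code() == ErrorCode::OK)
    {
        max_op.configure(&a_info, &b_info, &out_info);
    }

    // Division has its own class because it is backed by CpuDivisionKernel
    cpu::CpuElementwiseDivision div_op;
    if (cpu::CpuElementwiseDivision::validate(&a_info, &b_info, &out_info).error_code() == ErrorCode::OK)
    {
        div_op.configure(&a_info, &b_info, &out_info);
    }

    // At run time both are fed ACL_SRC_0 / ACL_SRC_1 / ACL_DST packs, handled by CpuElementwiseBase::run()
}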
diff --git a/src/cpu/operators/CpuElementwiseUnary.cpp b/src/cpu/operators/CpuElementwiseUnary.cpp
new file mode 100644
index 0000000000..04ab7bf8f5
--- /dev/null
+++ b/src/cpu/operators/CpuElementwiseUnary.cpp
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/operators/CpuElementwiseUnary.h"
+
+#include "src/common/utils/Log.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/cpu/kernels/CpuElementwiseUnaryKernel.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+using KernelType = kernels::CpuElementwiseUnaryKernel;
+
+void CpuElementwiseUnary::configure(ElementWiseUnary op, const ITensorInfo &src, ITensorInfo &dst)
+{
+ ARM_COMPUTE_LOG_PARAMS(op, src, dst);
+ auto k = std::make_unique<KernelType>();
+ k->configure(op, src, dst);
+ _kernel = std::move(k);
+}
+
+Status CpuElementwiseUnary::validate(ElementWiseUnary op, const ITensorInfo &src, const ITensorInfo &dst)
+{
+ return KernelType::validate(op, src, dst);
+}
+
+void CpuElementwiseUnary::run(ITensorPack &tensors)
+{
+ if (_kernel->is_window_configured())
+ {
+ ICpuOperator::run(tensors);
+ return;
+ }
+
+ auto src_info = tensors.get_const_tensor(TensorType::ACL_SRC)->info();
+ ICpuOperator::run(tensors, compute_output_shape_and_window(src_info->tensor_shape()).second);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/operators/CpuElementwiseUnary.h b/src/cpu/operators/CpuElementwiseUnary.h
new file mode 100644
index 0000000000..1e51bfaa1c
--- /dev/null
+++ b/src/cpu/operators/CpuElementwiseUnary.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_ELEMENTWISE_UNARY_H
+#define ARM_COMPUTE_CPU_ELEMENTWISE_UNARY_H
+
+#include "arm_compute/core/Types.h"
+
+#include "src/cpu/ICpuOperator.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+class CpuElementwiseUnary : public ICpuOperator
+{
+public:
+ /** Initialize the function
+ *
+ * @param[in] op Unary operation to execute
+     * @param[in]  src Input tensor information. Data types supported: F16/F32 (F16/F32/S32 for NEG/ABS operations).
+ * @param[out] dst Output tensor information. Data types supported: Same as @p src.
+ */
+ void configure(ElementWiseUnary op, const ITensorInfo &src, ITensorInfo &dst);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref CpuElementwiseUnary::configure()
+ *
+ * @return a status
+ */
+ static Status validate(ElementWiseUnary op, const ITensorInfo &src, const ITensorInfo &dst);
+
+ // Inherited methods overridden:
+ void run(ITensorPack &tensors) override;
+};
+
+} // namespace cpu
+} // namespace arm_compute
+
+#endif /* ARM_COMPUTE_CPU_ELEMENTWISE_UNARY_H */
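
Two details of this operator are easy to miss: configure() and validate() take ITensorInfo references rather than pointers, and run() in the .cpp above falls back to deriving the execution window from the source shape when the kernel was not configured with a fixed window, which is what makes it usable with shapes that are only known at run time. A minimal sketch, under the same assumptions as the earlier sketches plus src/cpu/operators/CpuElementwiseUnary.h:

void elementwise_unary_sketch()
{
    TensorInfo src_info(TensorShape(128U), 1, DataType::F32);
    TensorInfo dst_info(TensorShape(128U), 1, DataType::F32);

    cpu::CpuElementwiseUnary neg_op;
    if (cpu::CpuElementwiseUnary::validate(ElementWiseUnary::NEG, src_info, dst_info).error_code() == ErrorCode::OK)
    {
        neg_op.configure(ElementWiseUnary::NEG, src_info, dst_info); // note: references, not pointers
    }
}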
diff --git a/src/cpu/operators/CpuFill.cpp b/src/cpu/operators/CpuFill.cpp
new file mode 100644
index 0000000000..1890d0b916
--- /dev/null
+++ b/src/cpu/operators/CpuFill.cpp
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/operators/CpuFill.h"
+
+#include "src/common/utils/Log.h"
+#include "src/cpu/kernels/CpuFillKernel.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void CpuFill::configure(const ITensorInfo *tensor, PixelValue constant_value)
+{
+ ARM_COMPUTE_LOG_PARAMS(tensor, constant_value);
+ auto k = std::make_unique<kernels::CpuFillKernel>();
+ k->configure(tensor, constant_value);
+ _kernel = std::move(k);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/operators/CpuFill.h b/src/cpu/operators/CpuFill.h
new file mode 100644
index 0000000000..cb83745d29
--- /dev/null
+++ b/src/cpu/operators/CpuFill.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2021,2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_FILL_H
+#define ARM_COMPUTE_CPU_FILL_H
+
+#include "arm_compute/core/PixelValue.h"
+
+#include "src/cpu/ICpuOperator.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+/** Basic function to run @ref kernels::CpuFillKernel */
+class CpuFill : public ICpuOperator
+{
+public:
+ /** Configure operator for a given list of arguments
+ *
+ * @param[in,out] tensor Tensor to fill. Supported data types: All
+ * @param[in] constant_value The value used to fill the planes of the tensor
+ */
+ void configure(const ITensorInfo *tensor, PixelValue constant_value);
+};
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_FILL_H */
diff --git a/src/cpu/operators/CpuFlatten.cpp b/src/cpu/operators/CpuFlatten.cpp
new file mode 100644
index 0000000000..2609d44590
--- /dev/null
+++ b/src/cpu/operators/CpuFlatten.cpp
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/operators/CpuFlatten.h"
+
+#include "src/common/utils/Log.h"
+#include "src/cpu/operators/CpuReshape.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+CpuFlatten::CpuFlatten() : _reshape(nullptr)
+{
+}
+
+CpuFlatten::~CpuFlatten() = default;
+
+void CpuFlatten::configure(const ITensorInfo *src, ITensorInfo *dst)
+{
+ ARM_COMPUTE_LOG_PARAMS(src, dst);
+ _reshape = std::make_unique<CpuReshape>();
+ _reshape->configure(src, dst);
+}
+
+Status CpuFlatten::validate(const ITensorInfo *src, const ITensorInfo *dst)
+{
+ return CpuReshape::validate(src, dst);
+}
+
+void CpuFlatten::run(ITensorPack &tensors)
+{
+ _reshape->run(tensors);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/operators/CpuFlatten.h b/src/cpu/operators/CpuFlatten.h
new file mode 100644
index 0000000000..911760dd95
--- /dev/null
+++ b/src/cpu/operators/CpuFlatten.h
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_CPU_OPERATORS_CPUFLATTEN_H
+#define ACL_SRC_CPU_OPERATORS_CPUFLATTEN_H
+
+#include "src/cpu/ICpuOperator.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+class CpuReshape;
+/** Basic function to flatten a given input */
+class CpuFlatten : public ICpuOperator
+{
+public:
+ /** Constructor */
+ CpuFlatten();
+ /** Destructor */
+ ~CpuFlatten();
+ /** Configure operator for a given list of arguments
+ *
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------------|:--------------|
+ * |All |All |
+ *
+ * @param[in] src Source tensor to flatten with at least 3 dimensions.
+ * The dimensions above the third will be interpreted as batches. Data types supported: All
+     * @param[out] dst Destination tensor with shape [w*h*d, input_batches] where:
+     *                 w = input tensor width, h = input tensor height and d = input tensor depth.
+ * Data type supported: same as @p src
+ */
+ void configure(const ITensorInfo *src, ITensorInfo *dst);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref CpuFlatten::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
+
+ // Inherited methods overridden:
+ void run(ITensorPack &tensors) override;
+
+private:
+ std::unique_ptr<CpuReshape> _reshape;
+};
+} // namespace cpu
+} // namespace arm_compute
+#endif // ACL_SRC_CPU_OPERATORS_CPUFLATTEN_H
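
CpuFlatten is a thin wrapper over CpuReshape, so the only contract worth illustrating is the destination shape rule quoted in the header: [w*h*d, input_batches]. For instance, flattening an 8x4x2 volume with 3 batches gives a [64, 3] destination (8*4*2 = 64). A sketch under the same assumptions as the earlier ones:

void flatten_sketch()
{
    TensorInfo src_info(TensorShape(8U, 4U, 2U, 3U), 1, DataType::F32);
    TensorInfo dst_info(TensorShape(64U, 3U), 1, DataType::F32);

    cpu::CpuFlatten flatten;
    if (cpu::CpuFlatten::validate(&src_info, &dst_info).error_code() == ErrorCode::OK)
    {
        flatten.configure(&src_info, &dst_info); // delegates to CpuReshape
    }
}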
diff --git a/src/cpu/operators/CpuFloor.cpp b/src/cpu/operators/CpuFloor.cpp
new file mode 100644
index 0000000000..a107393b01
--- /dev/null
+++ b/src/cpu/operators/CpuFloor.cpp
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/operators/CpuFloor.h"
+
+#include "src/common/utils/Log.h"
+#include "src/cpu/kernels/CpuFloorKernel.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void CpuFloor::configure(const ITensorInfo *src, ITensorInfo *dst)
+{
+ ARM_COMPUTE_LOG_PARAMS(src, dst);
+ auto k = std::make_unique<kernels::CpuFloorKernel>();
+ k->configure(src, dst);
+ _kernel = std::move(k);
+}
+
+Status CpuFloor::validate(const ITensorInfo *src, const ITensorInfo *dst)
+{
+ return kernels::CpuFloorKernel::validate(src, dst);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/operators/CpuFloor.h b/src/cpu/operators/CpuFloor.h
new file mode 100644
index 0000000000..6082f98867
--- /dev/null
+++ b/src/cpu/operators/CpuFloor.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_FLOOR_H
+#define ARM_COMPUTE_CPU_FLOOR_H
+
+#include "src/cpu/ICpuOperator.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+/** Basic function to run @ref kernels::CpuFloorKernel */
+class CpuFloor : public ICpuOperator
+{
+public:
+ /** Configure operator for a given list of arguments
+ *
+ * @param[in] src Source tensor info. Data types supported: F16/F32.
+     * @param[out] dst Destination tensor info. Data type supported: same as @p src
+ */
+ void configure(const ITensorInfo *src, ITensorInfo *dst);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref CpuFloor::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
+};
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_FLOOR_H */
diff --git a/src/cpu/operators/CpuFullyConnected.cpp b/src/cpu/operators/CpuFullyConnected.cpp
new file mode 100644
index 0000000000..85a0b0311b
--- /dev/null
+++ b/src/cpu/operators/CpuFullyConnected.cpp
@@ -0,0 +1,590 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/operators/CpuFullyConnected.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensorPack.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+#include "src/common/utils/Log.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/MemoryHelpers.h"
+#include "src/core/utils/quantization/AsymmHelpers.h"
+#include "src/cpu/kernels/CpuTransposeKernel.h"
+#include "src/cpu/operators/CpuConvertFullyConnectedWeights.h"
+#include "src/cpu/operators/CpuFlatten.h"
+#include "src/cpu/operators/CpuGemm.h"
+#include "src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h"
+#include "src/cpu/utils/CpuAuxTensorHandler.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+using namespace arm_compute::experimental;
+using namespace arm_compute::misc::shape_calculator;
+
+namespace
+{
+Status get_gemmlowp_output_stage_info(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *dst,
+ const ActivationLayerInfo &act,
+ GEMMLowpOutputStageInfo &gemmlowp_output_stage_info)
+{
+ const auto data_type = src->data_type();
+ const QuantizationInfo oq_info = dst->quantization_info();
+ const UniformQuantizationInfo iq_unif = src->quantization_info().uniform();
+ const UniformQuantizationInfo wq_unif = weights->quantization_info().uniform();
+ const UniformQuantizationInfo oq_unif = oq_info.uniform();
+
+ float multiplier = (iq_unif.scale * wq_unif.scale) / oq_unif.scale;
+ int32_t output_multiplier;
+ int32_t output_shift;
+
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift));
+
+ int32_t type_min = 0;
+ int32_t type_max = 0;
+ std::tie(type_min, type_max) = quantization::get_quantized_asymmetric_output_min_max(oq_info, act, data_type);
+
+ gemmlowp_output_stage_info.gemmlowp_multiplier = output_multiplier;
+ gemmlowp_output_stage_info.gemmlowp_shift = output_shift;
+ gemmlowp_output_stage_info.gemmlowp_offset = oq_unif.offset;
+ gemmlowp_output_stage_info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
+ gemmlowp_output_stage_info.gemmlowp_min_bound = type_min;
+ gemmlowp_output_stage_info.gemmlowp_max_bound = type_max;
+
+ return Status{};
+}
+
+Status validate_mm(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const ActivationLayerInfo &act,
+ bool enable_fast_math,
+ WeightFormat weight_format)
+{
+ if (is_data_type_quantized_asymmetric(src->data_type()))
+ {
+        // Since we need negative offsets for computing the matrix multiplication, we need to change QuantizationInfo()
+ // Extract and negate src and weights offset
+ const QuantizationInfo src_quantization_info(src->quantization_info().uniform().scale,
+ -src->quantization_info().uniform().offset);
+ const QuantizationInfo weights_quantization_info(weights->quantization_info().uniform().scale,
+ -weights->quantization_info().uniform().offset);
+
+ GEMMLowpOutputStageInfo gemmlowp_output_stage_info;
+ ARM_COMPUTE_RETURN_ON_ERROR(get_gemmlowp_output_stage_info(src, weights, dst, act, gemmlowp_output_stage_info));
+
+ GEMMInfo gemm_info;
+ gemm_info.set_gemmlowp_output_stage(gemmlowp_output_stage_info);
+ gemm_info.set_fast_math(enable_fast_math);
+
+ // Validate gemmlowp function
+ TensorInfo src_info = src->clone()->set_quantization_info(src_quantization_info);
+ TensorInfo weights_info = weights->clone()->set_quantization_info(weights_quantization_info);
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CpuGemmLowpMatrixMultiplyCore::validate(&src_info, &weights_info, biases, dst, gemm_info));
+ }
+ else
+ {
+ GEMMInfo gemm_info;
+ gemm_info.set_weight_format(weight_format);
+ gemm_info.set_fixed_format(weight_format != WeightFormat::UNSPECIFIED);
+ gemm_info.set_fast_math(enable_fast_math);
+ ARM_COMPUTE_RETURN_ON_ERROR(CpuGemm::validate(src, weights, biases, dst, 1.f, 1.0f, gemm_info));
+ }
+
+ return Status{};
+}
+} // namespace
+
+CpuFullyConnected::CpuFullyConnected()
+ : _flatten(nullptr),
+ _convert_weights(nullptr),
+ _transpose_weights(nullptr),
+ _mm_gemm(nullptr),
+ _mm_gemmlowp(nullptr),
+ _flattened_src(),
+ _converted_weights(),
+ _reshaped_weights(),
+ _trans_weights(),
+ _trans_weights_idx(AuxTensorIdx::Count),
+ _aux_mem(Count),
+ _needs_weights_conversion(false),
+ _needs_weights_reshape(false),
+ _is_fc_after_conv(false),
+ _is_quantized_asymmetric(false),
+ _is_prepared(false),
+ _enable_fast_math(false),
+ _fixed_format(false),
+ _weight_format(arm_compute::WeightFormat::UNSPECIFIED),
+ _dynamic_weights(false)
+{
+}
+
+CpuFullyConnected::~CpuFullyConnected() = default;
+
+void CpuFullyConnected::configure_mm(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ ITensorInfo *dst,
+ const ActivationLayerInfo &act)
+{
+ if (_is_quantized_asymmetric)
+ {
+ // The quantized matrix multiplication requires negated offsets, so the QuantizationInfo must be adjusted
+ // Extract and negate the src and weights offsets
+ const QuantizationInfo src_quantization_info(src->quantization_info().uniform().scale,
+ -src->quantization_info().uniform().offset);
+ const QuantizationInfo weights_quantization_info(weights->quantization_info().uniform().scale,
+ -weights->quantization_info().uniform().offset);
+
+ TensorInfo src_info = src->clone()->set_quantization_info(src_quantization_info);
+ TensorInfo weights_info = weights->clone()->set_quantization_info(weights_quantization_info);
+
+ // Configure gemmlowp function and output stage for asymmetric quantized types
+ GEMMLowpOutputStageInfo gemmlowp_output_stage_info;
+ const Status status =
+ get_gemmlowp_output_stage_info(&src_info, &weights_info, dst, act, gemmlowp_output_stage_info);
+ ARM_COMPUTE_ERROR_ON(status.error_code() != ErrorCode::OK);
+
+ GEMMInfo gemm_info;
+ gemm_info.set_gemmlowp_output_stage(gemmlowp_output_stage_info);
+ gemm_info.set_activation_info(act);
+ gemm_info.set_fast_math(_enable_fast_math);
+ _mm_gemmlowp = std::make_unique<CpuGemmLowpMatrixMultiplyCore>();
+ _mm_gemmlowp->configure(&src_info, &weights_info, biases, dst, gemm_info);
+ }
+ else
+ {
+ // Configure matrix multiply kernel
+ GEMMInfo gemm_info;
+ gemm_info.set_activation_info(act);
+ gemm_info.set_fast_math(_enable_fast_math);
+ gemm_info.set_fixed_format(_fixed_format);
+ gemm_info.set_weight_format(_weight_format);
+ _mm_gemm = std::make_unique<CpuGemm>();
+ _mm_gemm->configure(src, weights, biases, dst, 1.f, 1.0f, gemm_info);
+ }
+}
+
+void CpuFullyConnected::configure_conv_fc(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ ITensorInfo *dst,
+ const ActivationLayerInfo &act)
+{
+ ARM_COMPUTE_ERROR_ON((weights->dimension(1) != (src->dimension(0) * src->dimension(1) * src->dimension(2))));
+
+ // If the fully connected layer is called after a convolution layer, the src tensor must be linearized
+
+ // Initialize output tensor for flatten
+ auto_init_if_empty(_flattened_src, src->clone()->set_tensor_shape(compute_flatten_shape(src)));
+
+ _flatten = std::make_unique<CpuFlatten>();
+ _flatten->configure(src, &_flattened_src);
+
+ // Configure matrix multiply kernel
+ configure_mm(&_flattened_src, weights, biases, dst, act);
+}
+
+void CpuFullyConnected::configure_fc_fc(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ ITensorInfo *dst,
+ const ActivationLayerInfo &act)
+{
+ ARM_COMPUTE_ERROR_ON(src->dimension(0) != weights->dimension(1));
+
+ // Configure matrix multiply kernel
+ configure_mm(src, weights, biases, dst, act);
+}
+
+void CpuFullyConnected::configure(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ ITensorInfo *dst,
+ FullyConnectedLayerInfo fc_info,
+ const WeightsInfo &weights_info)
+{
+ // Perform validate step
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
+ ARM_COMPUTE_ERROR_THROW_ON(CpuFullyConnected::validate(src, weights, biases, dst, fc_info, weights_info));
+ ARM_COMPUTE_LOG_PARAMS(src, weights, biases, dst, fc_info);
+
+ _needs_weights_conversion = false;
+ _needs_weights_reshape = fc_info.transpose_weights ? !fc_info.are_weights_reshaped : false;
+ _needs_weights_reshape = _needs_weights_reshape && !fc_info.retain_internal_weights;
+ _is_fc_after_conv = true;
+ _is_quantized_asymmetric = is_data_type_quantized_asymmetric(src->data_type());
+ _is_prepared = false;
+ _trans_weights_idx = AuxTensorIdx::Count;
+ _enable_fast_math = fc_info.enable_fast_math;
+ _fixed_format = weights_info.weight_format() != WeightFormat::UNSPECIFIED;
+ _weight_format = weights_info.weight_format();
+ _dynamic_weights = !weights->are_values_constant() && _needs_weights_reshape;
+
+ // With the Fully Connected layer we can have 4 different cases:
+ // 1) Convolution layer -> Fully Connected layer without batches
+ // 2) Fully Connected layer -> Fully Connected layer without batches
+ // 3) Convolution layer -> Fully Connected layer with batches
+ // 4) Fully Connected layer -> Fully Connected layer with batches
+
+ const ITensorInfo *weights_to_use = weights;
+
+ // Check if we have a fully connected layer with batches
+ const bool is_batched_fc_layer = dst->dimension(1) > 1;
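+ // In the batched case the src is considered to come from a convolution layer when its dimensions from index 3
+ // onwards match the dst dimensions from index 1 onwards (i.e. the batch dimensions agree)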
+ if (is_batched_fc_layer)
+ {
+ _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) &&
+ (std::equal(src->tensor_shape().cbegin() + 3, src->tensor_shape().cend(),
+ dst->tensor_shape().cbegin() + 1));
+ }
+ else
+ {
+ _is_fc_after_conv = src->num_dimensions() > 1;
+ }
+
+ // Reshape weights if needed
+ if (_needs_weights_reshape)
+ {
+ // Reshape the weights
+ _transpose_weights = std::make_unique<kernels::CpuTransposeKernel>();
+ _transpose_weights->configure(weights, &_reshaped_weights);
+ _reshaped_weights.set_are_values_constant(weights->are_values_constant());
+
+ weights_to_use = &_reshaped_weights;
+ _trans_weights_idx = AuxTensorIdx::TransposedWeights;
+ }
+
+ // Convert weights if needed
+ if (_is_fc_after_conv && (src->data_layout() != fc_info.weights_trained_layout))
+ {
+ // Convert weights
+ _convert_weights = std::make_unique<CpuConvertFullyConnectedWeights>();
+ _convert_weights->configure(weights_to_use, &_converted_weights, src->tensor_shape(),
+ fc_info.weights_trained_layout);
+ _converted_weights.set_are_values_constant(weights_to_use->are_values_constant());
+
+ weights_to_use = &_converted_weights;
+ _needs_weights_conversion = true;
+ _trans_weights_idx = AuxTensorIdx::ConvertedWeights;
+ }
+
+ if (_is_fc_after_conv)
+ {
+ // Fully Connected layer after a Convolution Layer without batches
+ configure_conv_fc(src, weights_to_use, biases, dst, fc_info.activation_info);
+ }
+ else
+ {
+ // Fully Connected layer after a Fully Connected Layer without batches
+ configure_fc_fc(src, weights_to_use, biases, dst, fc_info.activation_info);
+ }
+
+ // Retain the tensorinfo with the weights to use
+ if (_needs_weights_reshape || _needs_weights_conversion)
+ {
+ _trans_weights = *weights_to_use;
+ }
+
+ // Set auxiliary memory requirements
+ auto gemm_mem_req = (_is_quantized_asymmetric) ? _mm_gemmlowp->workspace() : _mm_gemm->workspace();
+ for (unsigned int i = 0; i < gemm_mem_req.size(); ++i)
+ {
+ _aux_mem[i] = gemm_mem_req[i];
+ }
+
+ if (_aux_mem[Pretranspose].size > 0)
+ {
+ // Release permuted weights at the end of prepare as they are further transposed by the assembly dispatch
+ // Do not release them if biases are dynamic and data type is quantized, since the weights tensor will be used for biases offset calculation
+ // Keep all the auxiliary tensors in case of dynamic weights as they are recalculated every time.
+ _aux_mem[TransposedWeights] = MemoryInfo(
+ offset_int_vec(TransposedWeights),
+ _dynamic_weights ? MemoryLifetime::Temporary
+ : (_is_quantized_asymmetric && biases && !(biases->are_values_constant())) ? MemoryLifetime::Persistent
+ : MemoryLifetime::Prepare,
+ _reshaped_weights.total_size());
+
+ _aux_mem[ConvertedWeights] = MemoryInfo(offset_int_vec(ConvertedWeights),
+ _dynamic_weights ? MemoryLifetime::Temporary : MemoryLifetime::Prepare,
+ _converted_weights.total_size());
+ }
+ else
+ {
+ _aux_mem[TransposedWeights] = MemoryInfo(offset_int_vec(TransposedWeights),
+ _dynamic_weights ? MemoryLifetime::Temporary
+ : _needs_weights_conversion ? MemoryLifetime::Prepare
+ : MemoryLifetime::Persistent,
+ _reshaped_weights.total_size());
+
+ _aux_mem[ConvertedWeights] = MemoryInfo(
+ offset_int_vec(ConvertedWeights), _dynamic_weights ? MemoryLifetime::Temporary : MemoryLifetime::Persistent,
+ _converted_weights.total_size());
+ }
+ _aux_mem[FlattenedSrc] =
+ MemoryInfo(offset_int_vec(FlattenedSrc), MemoryLifetime::Temporary, _flattened_src.total_size());
+}
+
+Status CpuFullyConnected::has_opt_impl(arm_compute::WeightFormat &expected_weight_format,
+ const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ FullyConnectedLayerInfo fc_info,
+ WeightsInfo weights_info)
+{
+ GEMMInfo gemm_info;
+ gemm_info.set_activation_info(fc_info.activation_info);
+ gemm_info.set_fast_math(fc_info.enable_fast_math);
+ gemm_info.set_fixed_format(weights_info.weight_format() != WeightFormat::UNSPECIFIED);
+ gemm_info.set_weight_format(weights_info.weight_format());
+
+ return CpuGemm::has_opt_impl(expected_weight_format, src, weights, biases, dst, gemm_info);
+}
+
+Status CpuFullyConnected::validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ FullyConnectedLayerInfo fc_info,
+ const WeightsInfo &weights_info)
+{
+ ARM_COMPUTE_UNUSED(fc_info.retain_internal_weights);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::F16, DataType::F32);
+
+ if (is_fixed_format_fast_math(weights_info.weight_format()))
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(src, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(weights, DataType::BFLOAT16);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(dst, DataType::F32);
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights, dst);
+ }
+
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 2);
+ ARM_COMPUTE_RETURN_ERROR_ON(
+ fc_info.activation_info.enabled() && is_data_type_quantized(src->data_type()) &&
+ fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::RELU &&
+ fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::BOUNDED_RELU &&
+ fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU);
+
+ bool weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true;
+ bool is_fc_after_conv = true;
+
+ const ITensorInfo &flatten_src =
+ TensorInfo(src->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_flatten_shape(src)));
+ const ITensorInfo &reshaped_weights = TensorInfo(
+ weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_transposed_shape(*weights)));
+ const ITensorInfo &converted_weights = weights_reshaped
+ ? TensorInfo(weights->clone()->set_is_resizable(true).reset_padding())
+ : TensorInfo(*reshaped_weights.clone());
+
+ // With the Fully Connected layer we can have 4 different cases:
+ // 1) Convolution layer -> Fully Connected layer without batches
+ // 2) Fully Connected layer -> Fully Connected layer without batches
+ // 3) Convolution layer -> Fully Connected layer with batches
+ // 4) Fully Connected layer -> Fully Connected layer with batches
+
+ const ITensorInfo *src_to_use = src;
+ const ITensorInfo *weights_to_use = weights;
+
+ // Check if we have a fully connected layer with batches
+ const bool is_batched_fc_layer = dst->dimension(1) > 1;
+
+ if (biases != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
+ if (is_data_type_quantized(src->data_type()))
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32);
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, biases);
+ }
+ }
+
+ if (is_batched_fc_layer)
+ {
+ is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) &&
+ (std::equal(src->tensor_shape().cbegin() + 3, src->tensor_shape().cend(),
+ dst->tensor_shape().cbegin() + 1));
+ }
+ else
+ {
+ is_fc_after_conv = src->num_dimensions() > 1;
+ }
+
+ if (!weights_reshaped)
+ {
+ // Validate reshape weights kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuTransposeKernel::validate(weights, &reshaped_weights));
+ weights_to_use = &reshaped_weights;
+ }
+
+ if (is_fc_after_conv && (src->data_layout() != fc_info.weights_trained_layout))
+ {
+ // Validate convert weights kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(CpuConvertFullyConnectedWeights::validate(
+ weights_to_use, &converted_weights, src->tensor_shape(), fc_info.weights_trained_layout));
+ weights_to_use = &converted_weights;
+ }
+
+ if (is_fc_after_conv)
+ {
+ // Fully Connected layer after a Convolution Layer without batches
+ ARM_COMPUTE_RETURN_ERROR_ON(
+ (weights_to_use->dimension(1) != (src->dimension(0) * src->dimension(1) * src->dimension(2))));
+
+ // Validate flatten kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(CpuFlatten::validate(src, &flatten_src));
+ src_to_use = &flatten_src;
+ }
+ else
+ {
+ // Fully Connected layer after a Fully Connected Layer without batches
+ ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != weights_to_use->dimension(1));
+ }
+ // Validate matrix multiply kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(src_to_use, weights_to_use, biases, dst, fc_info.activation_info,
+ fc_info.enable_fast_math, weights_info.weight_format()));
+
+ return Status{};
+}
+
+void CpuFullyConnected::run(ITensorPack &tensors)
+{
+ prepare(tensors);
+
+#ifdef ARM_COMPUTE_ASSERTS_ENABLED
+ ++_asrt_run_count;
+ ARM_COMPUTE_ERROR_ON(_dynamic_weights && _asrt_prepare_count != _asrt_run_count);
+#endif // ARM_COMPUTE_ASSERTS_ENABLED
+
+ auto src = tensors.get_const_tensor(ACL_SRC_0);
+
+ CpuAuxTensorHandler flattened_src(offset_int_vec(FlattenedSrc), _flattened_src, tensors, false);
+ CpuAuxTensorHandler transformed_wei(offset_int_vec(_trans_weights_idx), _trans_weights, tensors, false);
+
+ // Linearize src if it comes from a convolutional layer
+ if (_is_fc_after_conv)
+ {
+ ITensorPack flatten_pack{{ACL_SRC, src}, {ACL_DST, flattened_src.get()}};
+ _flatten->run(flatten_pack);
+ }
+
+ ITensorPack gemm_pack = tensors;
+ gemm_pack.add_const_tensor(ACL_SRC_0, (_is_fc_after_conv) ? flattened_src.get() : src);
+ if (_needs_weights_reshape || _needs_weights_conversion)
+ {
+ gemm_pack.add_const_tensor(ACL_SRC_1, transformed_wei.get());
+ }
+
+ // Run matrix multiply
+ if (_is_quantized_asymmetric)
+ {
+ _mm_gemmlowp->run(gemm_pack);
+ }
+ else
+ {
+ _mm_gemm->run(gemm_pack);
+ }
+}
+
+void CpuFullyConnected::prepare(ITensorPack &tensors)
+{
+ if (!_is_prepared || _dynamic_weights)
+ {
+#ifdef ARM_COMPUTE_ASSERTS_ENABLED
+ ++_asrt_prepare_count;
+ ARM_COMPUTE_ERROR_ON(!_dynamic_weights && _asrt_prepare_count > 1);
+#endif // ARM_COMPUTE_ASSERTS_ENABLED
+
+ auto weights = tensors.get_const_tensor(ACL_SRC_1);
+
+ CpuAuxTensorHandler reshaped_weights(offset_int_vec(TransposedWeights), _reshaped_weights, tensors, false);
+ CpuAuxTensorHandler converted_weights(offset_int_vec(ConvertedWeights), _converted_weights, tensors, false);
+
+ // Pointer to current weights
+ const ITensor *cur_weights = weights;
+
+ // Reshape of the weights (happens only once)
+ if (_needs_weights_reshape)
+ {
+ // Run reshape weights kernel and mark weights as unused
+ ITensorPack transpose_pack{{ACL_SRC, weights}, {ACL_DST, reshaped_weights.get()}};
+ NEScheduler::get().schedule_op(_transpose_weights.get(), Window::DimY, _transpose_weights->window(),
+ transpose_pack);
+
+ cur_weights->mark_as_unused();
+ cur_weights = reshaped_weights.get();
+ }
+
+ // Convert weights if needed (happens only once)
+ if (_needs_weights_conversion)
+ {
+ ITensorPack convert_pack{{ACL_SRC, cur_weights}, {ACL_DST, converted_weights.get()}};
+ _convert_weights->run(convert_pack);
+
+ cur_weights->mark_as_unused();
+ cur_weights = converted_weights.get();
+ }
+
+ ITensorPack gemm_pack = tensors;
+ gemm_pack.add_const_tensor(ACL_SRC_1, cur_weights);
+
+ // Run the GEMM's prepare step and release unused weights
+ if (!_is_quantized_asymmetric)
+ {
+ _mm_gemm->prepare(gemm_pack);
+ }
+ else
+ {
+ _mm_gemmlowp->prepare(gemm_pack);
+ }
+
+ _is_prepared = true;
+ }
+}
+
+experimental::MemoryRequirements CpuFullyConnected::workspace() const
+{
+ return _aux_mem;
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/operators/CpuFullyConnected.h b/src/cpu/operators/CpuFullyConnected.h
new file mode 100644
index 0000000000..b72f77e5c4
--- /dev/null
+++ b/src/cpu/operators/CpuFullyConnected.h
@@ -0,0 +1,194 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_CPU_OPERATORS_CPUFULLYCONNECTED_H
+#define ACL_SRC_CPU_OPERATORS_CPUFULLYCONNECTED_H
+
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/function_info/FullyConnectedLayerInfo.h"
+
+#include "src/cpu/ICpuOperator.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace cpu
+{
+// Forward declarations
+class CpuConvertFullyConnectedWeights;
+class CpuFlatten;
+class CpuGemm;
+class CpuGemmLowpMatrixMultiplyCore;
+namespace kernels
+{
+class CpuTransposeKernel;
+} // namespace kernels
+/** Basic function to compute a Fully Connected layer. This function calls the following kernels:
+ * -# @ref CpuFlatten (called when the input comes from a convolutional layer)
+ * -# @ref kernels::CpuTransposeKernel (if @p are_weights_reshaped is set to false and transpose_weights is set to true) (called once)
+ * -# @ref CpuGemm or @ref CpuGemmLowpMatrixMultiplyCore (if quantized asymmetric)
+ * -# @ref kernels::CpuGemmMatrixAdditionKernel or @ref CpuGemmLowpOutputStage (if quantized asymmetric) (if @p biases is not equal to nullptr)
+ *
+ * @note The fully connected layer accepts "weights" tensors only with 2 dimensions.
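+ *
+ * @note At run time the ITensorPack passed to run() is expected to map ACL_SRC_0 to the source, ACL_SRC_1 to the weights,
+ *       ACL_SRC_2 to the biases (if any) and ACL_DST to the destination, in addition to any auxiliary tensors described by workspace().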
+ */
+class CpuFullyConnected : public ICpuOperator
+{
+public:
+ /** Constructor */
+ CpuFullyConnected();
+ /** Destructor */
+ ~CpuFullyConnected();
+ /** Set the input and output tensors.
+ *
+ * Valid data layouts:
+ * - NHWC
+ * - NCHW
+ *
+ * Valid data type configurations:
+ * |src0 |src1 |src2 |dst |
+ * |:--------------|:------------------|:------|:--------------|
+ * |F16 |F16 |F16 |F16 |
+ * |F32 |F32 |F32 |F32 |
+ * |QASYMM8 |QASYMM8 |S32 |QASYMM8 |
+ * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |QASYMM8_SIGNED |
+ *
+ * @param[in] src Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+ * @param[in] weights Weights tensor info. The weights must be 2 dimensional.
+ * If this function is called after a Convolution Layer, the (transposed) weights will have as many rows as the product of the first 3 input's dimensions.
+ * If it is called after another FullyConnected Layer, the (transposed) weights will have as many rows as the input's first dimension.
+ * Data type supported: Same as @p src.
+ * @param[in] biases Bias tensor info. Can be nullptr. Data type supported: Same as @p weights, S32 if @p weights is QASYMM8/QASYMM8_SIGNED.
+ * @param[out] dst Destination tensor info. Its shape should be equal to the output of a matrix multiplication between:
+ * - The output of im2col on the input and the (transposed) 2D weights, if the function is called after a Convolution Layer
+ * - The input tensor and the (transposed) 2D weights, if the function is called after another FullyConnected Layer.
+ * Data type supported: Same as @p src.
+ * @param[in] fc_info (Optional) Fully connected layer additional info
+ * @param[in] weights_info (Optional) Stores necessary compute information when weights are already reshaped
+ */
+ void configure(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ ITensorInfo *dst,
+ FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo(),
+ const WeightsInfo &weights_info = WeightsInfo());
+ /** Static function to check if given info will lead to a valid configuration of @ref CpuFullyConnected
+ *
+ * Similar to @ref CpuFullyConnected::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo(),
+ const WeightsInfo &weights_info = WeightsInfo());
+
+ /** Static function that queries whether a fixed-format kernel exists and, if so, returns in the first argument the format in which
+ * the weights are expected to be reshaped, as defined by the WeightFormat class. Apart from the first argument, the remaining arguments are the same
+ * as in @ref CpuFullyConnected::validate(), except that all arguments are required.
+ *
+ * @return a status
+ */
+ static Status has_opt_impl(arm_compute::WeightFormat &expected_weight_format,
+ const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ FullyConnectedLayerInfo fc_info,
+ WeightsInfo weights_info);
+
+ // Inherited methods overridden:
+ void run(ITensorPack &tensors) override;
+ void prepare(ITensorPack &tensors) override;
+ experimental::MemoryRequirements workspace() const override;
+
+private:
+ void configure_fc_fc(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ ITensorInfo *dst,
+ const ActivationLayerInfo &act);
+ void configure_conv_fc(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ ITensorInfo *dst,
+ const ActivationLayerInfo &act);
+ void configure_mm(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ ITensorInfo *dst,
+ const ActivationLayerInfo &act);
+
+ enum AuxTensorIdx
+ {
+ AsmGemmWorkspace = 0,
+ Pretranspose,
+ GemmTemp1,
+ GemmTemp2,
+ GemmTemp3,
+ GemmTemp4,
+ GemmTemp5,
+ GemmTemp6,
+ GemmTemp7,
+ GemmTemp8,
+ // Slots above (0-9) reserved for either CpuGemm or CpuGemmLowpMatrixMultiplyCore
+ TransposedWeights,
+ ConvertedWeights,
+ FlattenedSrc,
+ Count
+ };
+
+ std::unique_ptr<CpuFlatten> _flatten;
+ std::unique_ptr<CpuConvertFullyConnectedWeights> _convert_weights;
+ std::unique_ptr<kernels::CpuTransposeKernel> _transpose_weights;
+ std::unique_ptr<CpuGemm> _mm_gemm;
+ std::unique_ptr<CpuGemmLowpMatrixMultiplyCore> _mm_gemmlowp;
+
+ TensorInfo _flattened_src;
+ TensorInfo _converted_weights;
+ TensorInfo _reshaped_weights;
+ TensorInfo _trans_weights;
+ AuxTensorIdx _trans_weights_idx;
+
+ experimental::MemoryRequirements _aux_mem;
+
+ bool _needs_weights_conversion;
+ bool _needs_weights_reshape;
+ bool _is_fc_after_conv;
+ bool _is_quantized_asymmetric;
+ bool _is_prepared;
+ bool _enable_fast_math;
+ bool _fixed_format;
+ arm_compute::WeightFormat _weight_format;
+ bool _dynamic_weights;
+
+#ifdef ARM_COMPUTE_ASSERTS_ENABLED
+ int _asrt_run_count{};
+ int _asrt_prepare_count{};
+#endif // ARM_COMPUTE_ASSERTS_ENABLED
+};
+} // namespace cpu
+} // namespace arm_compute
+#endif // ACL_SRC_CPU_OPERATORS_CPUFULLYCONNECTED_H
diff --git a/src/cpu/operators/CpuGemm.cpp b/src/cpu/operators/CpuGemm.cpp
new file mode 100644
index 0000000000..905e86c185
--- /dev/null
+++ b/src/cpu/operators/CpuGemm.cpp
@@ -0,0 +1,567 @@
+/*
+ * Copyright (c) 2021-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/operators/CpuGemm.h"
+
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+#include "src/common/utils/Log.h"
+#include "src/core/CPP/Validate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/MemoryHelpers.h"
+#include "src/cpu/utils/CpuAuxTensorHandler.h"
+
+using namespace arm_compute::experimental;
+using namespace arm_compute::misc::shape_calculator;
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace
+{
+cpu::AsmGemmInfo init_assembly_metadata(const GEMMInfo &info)
+{
+ cpu::AsmGemmInfo asm_info;
+ asm_info.method = cpu::AsmConvMethod::Im2Col;
+ asm_info.reinterpret_input_as_3d = info.reinterpret_input_as_3d();
+ asm_info.depth_output_gemm3d = info.depth_output_gemm3d();
+ asm_info.activation_info = info.activation_info();
+ asm_info.fast_mode = info.fast_math();
+ asm_info.fixed_format = info.fixed_format();
+ asm_info.weight_format = info.weight_format();
+ asm_info.accumulate = info.accumulate();
+ asm_info.transpose_b =
+ info.pretranspose_B(); // The "pretranspose_B" flag is not the same as the pretranspose_B_array method: it signals to
+ // pretranspose_B_array whether an additional transpose of B should be performed before the pretransposition itself
+
+ return asm_info;
+}
+} // namespace
+
+void CpuGemm::configure(const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ ITensorInfo *d,
+ float alpha,
+ float beta,
+ const GEMMInfo &gemm_info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, d);
+ ARM_COMPUTE_ERROR_THROW_ON(CpuGemm::validate(a, b, c, d, alpha, beta, gemm_info));
+ ARM_COMPUTE_LOG_PARAMS(a, b, c, d, alpha, beta, gemm_info);
+
+ const cpu::AsmGemmInfo asm_info = init_assembly_metadata(gemm_info);
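+ // When beta == 1 the C matrix acts as a bias: it is either fused into the assembly dispatch or added with CpuAdd,
+ // while for any other non-zero beta a separate CpuGemmMatrixAdditionKernel scales C by beta and adds it to the result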
+ const bool is_c_bias = beta == 1 && c != nullptr;
+ const bool run_optimised =
+ bool(cpu::CpuGemmAssemblyDispatch::validate(a, b, (is_c_bias) ? c : nullptr, d, asm_info)) &&
+ (c == nullptr || beta == 0.f || beta == 1.f) && // Optimized GeMM doesn't support beta coefficient.
+ !(!b->are_values_constant() &&
+ b->tensor_shape().z() > 1); // Disable batch matmul as optimized GeMM handles batching differently.
+
+ // Check if we need to reshape the matrix B only on the first run
+ _is_prepared = false;
+ _reshape_b_only_on_first_run = b->are_values_constant();
+ _run_vector_matrix_multiplication = a->dimension(1) < 2;
+ _run_alpha_scale = alpha != 1.f;
+ _run_bias_addition = is_c_bias;
+ _run_addition = beta != 0 && beta != 1 && c != nullptr;
+ _run_activation =
+ gemm_info.activation_info().enabled() &&
+ (!run_optimised ||
+ (run_optimised && !cpu::CpuGemmAssemblyDispatch::is_activation_supported(gemm_info.activation_info())));
+
+ if (run_optimised)
+ {
+ _run_interleave_transpose = false;
+ const ITensorInfo *c_to_use = is_c_bias ? c : nullptr;
+ _asm_glue = std::make_unique<cpu::CpuGemmAssemblyDispatch>();
+ _asm_glue->configure(a, b, c_to_use, d, asm_info);
+ ARM_COMPUTE_ERROR_ON(!_asm_glue->is_configured());
+
+ const auto asm_mem_req = _asm_glue->workspace();
+ for (unsigned int slot = 0; slot < asm_mem_req.size(); ++slot)
+ {
+ _aux_mem[slot] = asm_mem_req[slot];
+ }
+
+ // Scale product by alpha
+ if (_run_alpha_scale)
+ {
+ _alpha_scale_func = std::make_unique<cpu::CpuActivation>();
+ _alpha_scale_func->configure(
+ d, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LINEAR, alpha, 0.f));
+ }
+ }
+ else
+ {
+ _run_interleave_transpose = !_run_vector_matrix_multiplication;
+ // Pick output tensor in case bias addition should be performed
+ ITensorInfo *gemm_output_to_use = (_run_bias_addition) ? &_tmp_d : d;
+ // Pick b tensor in case pretranspose should be performed
+ const ITensorInfo *b_to_use = b;
+
+ _mm_kernel = std::make_unique<cpu::kernels::CpuGemmMatrixMultiplyKernel>();
+
+ // Configure rhs pretranspose
+ if (gemm_info.pretranspose_B())
+ {
+ _pretranspose_b_func = std::make_unique<CpuTranspose>();
+ _pretranspose_b_func->configure(b_to_use, &_pretransposed_b);
+ MemoryLifetime lifetime;
+ if (_reshape_b_only_on_first_run)
+ {
+ if (_run_interleave_transpose)
+ {
+ // PreTransposedRHS tensor is only used in prepare(), but is then succeeded by Transposed1xWRHS
+ // So PreTransposedRHS can be freed inside prepare()
+ lifetime = MemoryLifetime::Prepare;
+ }
+ else
+ {
+ // PreTransposedRHS tensor is only used in prepare(), but is the final transformation of rhs
+ // So PreTransposedRHS needs to persist beyond prepare()
+ lifetime = MemoryLifetime::Persistent;
+ }
+ }
+ else
+ {
+ // PreTransposedRHS tensor is always used in run() and doesn't need to persist
+ lifetime = MemoryLifetime::Temporary;
+ }
+ _aux_mem[PreTransposedRHS] =
+ MemoryInfo(offset_int_vec(PreTransposedRHS), lifetime, _pretransposed_b.total_size());
+ b_to_use = &_pretransposed_b;
+ }
+
+ // Select between GEMV and GEMM
+ if (_run_vector_matrix_multiplication)
+ {
+ // Configure the matrix multiply kernel
+ _mm_kernel->configure(a, b_to_use, gemm_output_to_use, alpha, false);
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR_ON(!_run_interleave_transpose);
+ // Configure interleave kernel
+ _interleave_kernel = std::make_unique<cpu::kernels::CpuGemmInterleave4x4Kernel>();
+ _interleave_kernel->configure(a, &_tmp_a);
+ _aux_mem[InterleavedLHS] =
+ MemoryInfo(offset_int_vec(InterleavedLHS), MemoryLifetime::Temporary, _tmp_a.total_size());
+
+ // Configure rhs transpose1xw kernel
+ _transpose1xW_b_kernel = std::make_unique<cpu::kernels::CpuGemmTranspose1xWKernel>();
+ _transpose1xW_b_kernel->configure(b_to_use, &_tmp_b);
+ _aux_mem[Transposed1xWRHS] =
+ MemoryInfo(offset_int_vec(Transposed1xWRHS), MemoryLifetime::Persistent, _tmp_b.total_size());
+
+ // Use a and b here instead of _tmp_a and _tmp_b because CpuGemmMatrixMultiplyKernel requires the original m,n,k in case of interleaved a and transposed1xw b
+ const int m = a->dimension(1);
+ const int n = b_to_use->dimension(0);
+ const int k = a->dimension(0);
+
+ // Configure matrix multiplication kernel
+ _mm_kernel->configure(&_tmp_a, &_tmp_b, gemm_output_to_use, alpha, _run_interleave_transpose,
+ GEMMReshapeInfo(m, n, k));
+ }
+
+ if (_run_bias_addition)
+ {
+ _add_bias = std::make_unique<cpu::CpuAdd>();
+ _add_bias->configure(gemm_output_to_use, c, d, ConvertPolicy::SATURATE);
+ _aux_mem[TempResult] =
+ MemoryInfo(offset_int_vec(TempResult), MemoryLifetime::Temporary, _tmp_d.total_size());
+ }
+ }
+
+ // Configure matrix addition kernel
+ if (_run_addition)
+ {
+ _ma_kernel = std::make_unique<cpu::kernels::CpuGemmMatrixAdditionKernel>();
+ _ma_kernel->configure(c, d, beta);
+ }
+
+ // Configure activation
+ if (_run_activation)
+ {
+ _activation_func = std::make_unique<cpu::CpuActivation>();
+ _activation_func->configure(d, nullptr, gemm_info.activation_info());
+ }
+}
+
+Status CpuGemm::validate(const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ const ITensorInfo *d,
+ float alpha,
+ float beta,
+ const GEMMInfo &gemm_info)
+{
+ ARM_COMPUTE_UNUSED(alpha);
+ // When using accumulation (in-place summation), for now, the only supported values for alpha and beta are 1 and 0 respectively.
+ // Do the appropriate checks before proceeding.
+ if (gemm_info.accumulate())
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(alpha != 1, "Accumulation is not supported when alpha is different from 1");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ (beta != 0 && c != nullptr),
+ "Accumulation is not supported when beta is different from 0 with a non-null bias matrix c");
+ }
+
+ const bool is_c_bias = beta == 1 && c != nullptr;
+ const bool run_addition = c != nullptr && beta != 0 && beta != 1;
+ // Check if we should use the pretransposed_b or original b
+ // TODO: COMPMID-6597
+ // Note that this check should only apply to the non-optimized path. It is performed here at the beginning, rather than
+ // only on the fallback path, because of the checks performed below, between here and the run_optimised decision
+ // We should simplify this by
+ // 1. Moving the checks between "fix-start" and "fix-end" into their corresponding ops / kernels (e.g. the weights format checks can and should be moved into CpuGemmAssemblyDispatch)
+ // 2. Moving this b_to_use check back into the non-optimized path
+ TensorInfo pretransposed_b = b->clone()->set_tensor_shape(misc::shape_calculator::compute_transposed_shape(*b));
+ const ITensorInfo *b_to_use = gemm_info.pretranspose_B() ? &pretransposed_b : b;
+ // TODO: COMPMID-6597 fix-start
+
+ ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(a);
+ ARM_COMPUTE_RETURN_ERROR_ON_CPU_BF16_UNSUPPORTED(a);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::BFLOAT16, DataType::F16, DataType::F32);
+
+ if (is_fixed_format_fast_math(gemm_info.weight_format()))
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(a, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(b_to_use, DataType::BFLOAT16);
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b_to_use);
+ }
+
+ const int block_by = arm_compute::block_by(gemm_info.weight_format());
+ // test if im2col has changed the dimensions that are needed for padding
+ if (a->dimension(0) != b_to_use->dimension(1) && block_by > 1)
+ {
+ // have to verify bias
+ const size_t dim0_sz = a->dimension(0);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ (dim0_sz % block_by) != 0,
+ ("The matrix A number of columns must be a multiple of block_by=" + std::to_string(block_by)).c_str());
+ // a->dimension(0) = kernel_area * input_channel + kernel_area * input_pad_right
+ // b_to_use->dimension(1) = kernel_area * input_channel
+ // a->dimension(0) = b_to_use->dimension(1) + kernel_area * input_pad_right
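+ // E.g. (illustrative numbers only): with block_by = 4, kernel_area = 9 and input_channel = 3,
+ // b_to_use->dimension(1) = 27 and input_pad_right = 1 give a->dimension(0) = 36, which is a multiple
+ // of block_by and satisfies 36 - 9 * 1 == 27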
+ const size_t input_pad_right = (dim0_sz - b_to_use->dimension(1)) % block_by;
+ const size_t kernel_area = (dim0_sz - b_to_use->dimension(1)) / input_pad_right;
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ (dim0_sz - kernel_area * input_pad_right) != b_to_use->dimension(1),
+ "The product AB is defined only if A number of columns and B number of rows are related");
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ a->dimension(0) != b_to_use->dimension(1),
+ "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
+ }
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");
+ if (a->data_type() != DataType::BFLOAT16)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, d);
+ }
+
+ if (run_addition)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(gemm_info.depth_output_gemm3d() != 0);
+ ARM_COMPUTE_RETURN_ERROR_ON(gemm_info.reinterpret_input_as_3d());
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(c, d);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->dimension(1) != c->dimension(1),
+ "The C matrix must have the same number of rows as the matrix A");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(b_to_use->dimension(0) != c->dimension(0),
+ "The C matrix must have the same number of columns as the matrix B");
+ }
+
+ if (d->total_size() != 0)
+ {
+ // For fixed format we are expecting some kind of blocked format for B/RHS so the dimension won't necessarily match the result matrix any more.
+ ARM_COMPUTE_RETURN_ERROR_ON(!gemm_info.fixed_format() && b_to_use->dimension(0) != d->dimension(0));
+ if (gemm_info.depth_output_gemm3d() != 0)
+ {
+ if (gemm_info.reinterpret_input_as_3d())
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != d->dimension(1));
+ ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(2) != d->dimension(2));
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != d->dimension(1) * d->dimension(2));
+ }
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != d->dimension(1));
+ }
+ }
+ // TODO: COMPMID-6597 fix-end
+
+ // Check if we need to run the optimized assembly kernel
+ cpu::AsmGemmInfo asm_info = init_assembly_metadata(gemm_info);
+
+ // Note we use b instead of b_to_use here because asm_info also captures the pretranspose_b() flag
+ // so we pass the original b to CpuGemmAssemblyDispatch
+ const bool run_optimised =
+ bool(cpu::CpuGemmAssemblyDispatch::validate(a, b, is_c_bias ? c : nullptr, d, asm_info)) &&
+ (c == nullptr || beta == 0.f || beta == 1.f) && // Optimized GeMM doesn't support beta coefficient.
+ !(!b->are_values_constant() &&
+ b->tensor_shape().z() > 1); // Disable batch matmul as optimized GeMM handles batching differently.
+
+ if (!run_optimised)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.reinterpret_input_as_3d(),
+ "CpuGemm cannot reinterpret the input tensor as 3D");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.depth_output_gemm3d() != 0,
+ "CpuGemm cannot reinterpret the output tensor as 3D");
+
+ // Check if the first input tensor is a vector.
+ const bool run_vector_matrix_multiplication = a->dimension(1) < 2;
+ // Check if we need to reshape the matrix A and matrix B
+ const bool run_interleave_transpose = !run_vector_matrix_multiplication;
+
+ // Arguments used by GEMMReshapeInfo
+ // If we pass the matrix A and matrix B reshaped to CpuGemmMatrixMultiplyKernel, we need to pass m, n, k, mult_transpose1xW_width and mult_interleave4x4_height to GEMMReshapeInfo
+ // in order to know how the matrices have been reshaped
+ const int m = a->dimension(1);
+ const int n = b_to_use->dimension(0);
+ const int k = a->dimension(0);
+ int mult_transpose1xW_width = 1;
+ int mult_interleave4x4_height = 1;
+
+ const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(
+ m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, gemm_info.depth_output_gemm3d());
+
+ const ITensorInfo *matrix_a_info = a;
+ const ITensorInfo *matrix_b_info = b_to_use;
+
+ TensorInfo tmp_a_info{};
+ TensorInfo tmp_b_info{};
+ TensorInfo tmp_output_info = *d->clone();
+
+ if (run_interleave_transpose)
+ {
+ matrix_a_info = &tmp_a_info;
+ matrix_b_info = &tmp_b_info;
+
+ // Validate interleave kernel
+ auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_interleaved_shape(
+ *a, mult_interleave4x4_height, gemm_info.reinterpret_input_as_3d())));
+ ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmInterleave4x4Kernel::validate(a, &tmp_a_info));
+
+ // Validate transpose kernel
+ auto_init_if_empty(tmp_b_info,
+ b_to_use->clone()->set_tensor_shape(
+ compute_transpose1xW_with_element_size_shape(*b_to_use, mult_transpose1xW_width)));
+ ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmTranspose1xWKernel::validate(b_to_use, &tmp_b_info));
+ }
+
+ // Validate matrix multiply
+ auto_init_if_empty(tmp_output_info,
+ matrix_a_info->clone()->set_tensor_shape(compute_mm_shape(
+ *matrix_a_info, *matrix_b_info, run_interleave_transpose, reshape_info)));
+ ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmMatrixMultiplyKernel::validate(
+ matrix_a_info, matrix_b_info, &tmp_output_info, alpha, run_interleave_transpose, reshape_info));
+
+ if (is_c_bias)
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuAdd::validate(&tmp_output_info, c, d, ConvertPolicy::SATURATE));
+ }
+ }
+
+ // Validate matrix addition kernel
+ if (run_addition)
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmMatrixAdditionKernel::validate(c, d, beta));
+ }
+
+ // Validate activation
+ const ActivationLayerInfo &activation = gemm_info.activation_info();
+ if (activation.enabled())
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuActivation::validate(d, nullptr, activation));
+ }
+
+ return Status{};
+}
+
+void CpuGemm::run(ITensorPack &tensors)
+{
+ prepare(tensors);
+
+ auto a = tensors.get_const_tensor(ACL_SRC_0);
+ auto b = tensors.get_const_tensor(ACL_SRC_1);
+ auto c = tensors.get_const_tensor(ACL_SRC_2);
+ auto d = tensors.get_tensor(ACL_DST);
+
+ if (_asm_glue && _asm_glue->is_configured())
+ {
+ // Pass c to asm dispatch only if it's the bias tensor
+ ITensorPack asm_pack = tensors;
+ asm_pack.add_const_tensor(ACL_SRC_2, _run_bias_addition ? c : nullptr);
+ _asm_glue->run(asm_pack);
+ if (_run_alpha_scale)
+ {
+ ITensorPack pack{{ACL_SRC, d}, {ACL_DST, d}};
+ _alpha_scale_func->run(pack);
+ }
+ }
+ else
+ {
+ CpuAuxTensorHandler interleaved_a(offset_int_vec(InterleavedLHS), _tmp_a, tensors, true);
+ CpuAuxTensorHandler pretransposed_b(offset_int_vec(PreTransposedRHS), _pretransposed_b, tensors);
+ CpuAuxTensorHandler transposed1xw_b(offset_int_vec(Transposed1xWRHS), _tmp_b, tensors, true);
+ CpuAuxTensorHandler temp_d(offset_int_vec(TempResult), _tmp_d, tensors, true);
+
+ ITensorPack mm_pack{{ACL_SRC_0, a}, {ACL_SRC_1, b}, {ACL_DST, (_run_bias_addition) ? temp_d.get() : d}};
+
+ if (_run_interleave_transpose)
+ {
+ // Run interleave kernel
+ ITensorPack interleave_pack{{ACL_SRC, a}, {ACL_DST, interleaved_a.get()}};
+ NEScheduler::get().schedule_op(_interleave_kernel.get(), Window::DimY, _interleave_kernel->window(),
+ interleave_pack);
+ // Use reshaped matrices
+ mm_pack.add_const_tensor(ACL_SRC_0, interleaved_a.get());
+ }
+
+ const ITensor *b_to_use = b;
+ if (_pretranspose_b_func)
+ {
+ if (!_reshape_b_only_on_first_run)
+ {
+ // Run pretranspose kernel
+ ITensorPack pretranspose_pack{{ACL_SRC, b_to_use}, {ACL_DST, pretransposed_b.get()}};
+ _pretranspose_b_func->run(pretranspose_pack);
+ }
+ b_to_use = pretransposed_b.get();
+ }
+ if (_run_interleave_transpose)
+ {
+ if (!_reshape_b_only_on_first_run)
+ {
+ // Run transpose1xw kernel
+ ITensorPack transpose_pack{{ACL_SRC, b_to_use}, {ACL_DST, transposed1xw_b.get()}};
+ NEScheduler::get().schedule_op(_transpose1xW_b_kernel.get(), Window::DimY,
+ _transpose1xW_b_kernel->window(), transpose_pack);
+ }
+ b_to_use = transposed1xw_b.get();
+ }
+ // Use reshaped matrices
+ mm_pack.add_const_tensor(ACL_SRC_1, b_to_use);
+
+ NEScheduler::get().schedule_op(_mm_kernel.get(),
+ _run_vector_matrix_multiplication ? Window::DimX : Window::DimY,
+ _mm_kernel->window(), mm_pack);
+
+ // Run bias addition kernel
+ if (_run_bias_addition)
+ {
+ ITensorPack pack{{ACL_SRC_0, temp_d.get()}, {ACL_SRC_1, c}, {ACL_DST, d}};
+ _add_bias->run(pack);
+ }
+ }
+
+ // Run matrix addition kernel
+ if (_run_addition)
+ {
+ ITensorPack c_add_pack{{ACL_SRC, c}, {ACL_DST, d}};
+ NEScheduler::get().schedule_op(_ma_kernel.get(), Window::DimY, _ma_kernel->window(), c_add_pack);
+ }
+
+ // Run activation function
+ if (_run_activation)
+ {
+ ITensorPack pack{{ACL_SRC, d}, {ACL_DST, d}};
+ _activation_func->run(pack);
+ }
+}
+
+void CpuGemm::prepare(ITensorPack &tensors)
+{
+ if (!_is_prepared)
+ {
+ if (_asm_glue && _asm_glue->is_configured())
+ {
+ _asm_glue->prepare(tensors);
+ }
+ else if (_reshape_b_only_on_first_run)
+ {
+ const ITensor *b = tensors.get_const_tensor(ACL_SRC_1);
+ const ITensor *b_to_use = b;
+ CpuAuxTensorHandler pretransposed_b(
+ offset_int_vec(PreTransposedRHS), _pretransposed_b, tensors,
+ false /*pack_inject: no need to inject into tensors*/,
+ _pretranspose_b_func ==
+ nullptr /*bypass_alloc: no need to allocate if _pretranspose_b_func is not run*/);
+ CpuAuxTensorHandler transposed1xw_b(offset_int_vec(Transposed1xWRHS), _tmp_b, tensors,
+ false /*pack_inject*/, !_run_interleave_transpose /*bypass_alloc*/);
+
+ if (_pretranspose_b_func)
+ {
+ // Run pretranspose kernel
+ ITensorPack pretranspose_pack{{ACL_SRC, b_to_use}, {ACL_DST, pretransposed_b.get()}};
+ _pretranspose_b_func->run(pretranspose_pack);
+ b_to_use = pretransposed_b.get();
+ }
+ if (_run_interleave_transpose)
+ {
+ // Run transpose kernel
+ ITensorPack transpose_pack{{ACL_SRC, b_to_use}, {ACL_DST, transposed1xw_b.get()}};
+ NEScheduler::get().schedule_op(_transpose1xW_b_kernel.get(), Window::DimY,
+ _transpose1xW_b_kernel->window(), transpose_pack);
+ }
+ }
+ _is_prepared = true;
+ }
+}
+
+experimental::MemoryRequirements CpuGemm::workspace() const
+{
+ return _aux_mem;
+}
+
+Status CpuGemm::has_opt_impl(arm_compute::WeightFormat &expected_weight_format,
+ const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ const ITensorInfo *d,
+ const GEMMInfo &gemm_info)
+{
+ const cpu::AsmGemmInfo asm_info = init_assembly_metadata(gemm_info);
+
+ return CpuGemmAssemblyDispatch::has_opt_impl(expected_weight_format, a, b, c, d, asm_info);
+}
+
+bool CpuGemm::isVarWeightsKernel() const
+{
+ return _asm_glue && _asm_glue->isVarWeightsKernel();
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/operators/CpuGemm.h b/src/cpu/operators/CpuGemm.h
new file mode 100644
index 0000000000..a05258d206
--- /dev/null
+++ b/src/cpu/operators/CpuGemm.h
@@ -0,0 +1,185 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_CPU_OPERATORS_CPUGEMM_H
+#define ACL_SRC_CPU_OPERATORS_CPUGEMM_H
+
+#include "arm_compute/core/ITensorPack.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/function_info/GEMMInfo.h"
+
+#include "src/cpu/ICpuOperator.h"
+#include "src/cpu/kernels/CpuGemmInterleave4x4Kernel.h"
+#include "src/cpu/kernels/CpuGemmMatrixAdditionKernel.h"
+#include "src/cpu/kernels/CpuGemmMatrixMultiplyKernel.h"
+#include "src/cpu/kernels/CpuGemmTranspose1xWKernel.h"
+#include "src/cpu/operators/CpuActivation.h"
+#include "src/cpu/operators/CpuAdd.h"
+#include "src/cpu/operators/CpuTranspose.h"
+#include "src/cpu/operators/internal/CpuGemmAssemblyDispatch.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace cpu
+{
+/** Basic function to execute GEMM. This function calls the following kernels:
+ *
+ * If optimized assembly is available:
+ * -# @ref cpu::CpuGemmAssemblyDispatch
+ * -# @ref cpu::CpuActivation (if alpha != 1.0)
+ * Else:
+ * -# @ref cpu::kernels::CpuGemmInterleave4x4Kernel (if the output tensor is a matrix)
+ * -# @ref cpu::kernels::CpuGemmTranspose1xWKernel (if the output tensor is a matrix)
+ * -# @ref cpu::kernels::CpuGemmMatrixMultiplyKernel
+ * In both cases:
+ * -# @ref cpu::kernels::CpuGemmMatrixAdditionKernel (if c != nullptr and beta != 0.0 and is not reshaped once)
+ * Else:
+ * -# @ref cpu::CpuAdd (if c != nullptr, is reshaped only once and no optimized assembly is in place)
+ *
+ * -# @ref cpu::CpuActivation (if activation is specified in GEMMInfo)
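+ *
+ * A minimal usage sketch (illustrative only: the tensor shapes, the run-time tensor binding and the omission of the
+ * auxiliary workspace handling are assumptions, not a complete example):
+ * @code
+ * CpuGemm    gemm;
+ * TensorInfo a_info(TensorShape(64U, 32U), 1, DataType::F32); // A is 32x64 (M = 32, K = 64)
+ * TensorInfo b_info(TensorShape(16U, 64U), 1, DataType::F32); // B is 64x16 (K = 64, N = 16)
+ * TensorInfo d_info(TensorShape(16U, 32U), 1, DataType::F32); // D is 32x16 (M = 32, N = 16)
+ * gemm.configure(&a_info, &b_info, nullptr, &d_info, 1.f, 0.f);
+ * // a, b and d are assumed to be ITensor pointers backed by allocated memory; auxiliary tensors
+ * // described by gemm.workspace() must also be added to the pack before running.
+ * ITensorPack pack{{ACL_SRC_0, a}, {ACL_SRC_1, b}, {ACL_DST, d}};
+ * gemm.run(pack);
+ * @endcode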
+ */
+class CpuGemm : public ICpuOperator
+{
+public:
+ /** Default constructor */
+ CpuGemm() = default;
+ /** Default destructor */
+ ~CpuGemm() = default;
+ /** Configure operator for a given list of arguments
+ *
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src0 |src1 |src2 |dst |
+ * |:------------|:-----------|:---------|:--------------|
+ * |F32 |F32 |F32 |F32 |
+ * |F16 |F16 |F16 |F16 |
+ * |BFLOAT16 |BFLOAT16 |BFLOAT16 |F32 |
+ *
+ * @note GEMM: General Matrix Multiply - [alpha * A * B + beta * C].
+ * @note GEMM: The tensors a, b, c, d must have the same data type. You should not mix data types when calling this function.
+ *
+ * @note Batched GEMM only supports broadcasting cases where RHS rank < LHS rank but not the other way around
+ *
+ * @param[in] a First input tensor info (Matrix A or Vector A). Data type supported: BFLOAT16/F16/F32
+ * @param[in] b Second input tensor info (Matrix B). Data type supported: same as @p a
+ * @param[in] c Third input tensor info (Matrix C). It can be a nullptr if just the multiplication between @p a and @p b is needed. Data type supported: same as @p a
+ * @param[out] d Output tensor info. Data type supported: same as @p a
+ * @param[in] alpha Weight of the matrix product
+ * @param[in] beta Weight of matrix C
+ * @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and
+ * if the reshape of matrix B should happen only for the first run
+ */
+ void configure(const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ ITensorInfo *d,
+ float alpha,
+ float beta,
+ const GEMMInfo &gemm_info = GEMMInfo());
+ /** Static function to check if given info will lead to a valid configuration of @ref CpuGemm.
+ *
+ * Similar to @ref CpuGemm::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ const ITensorInfo *d,
+ float alpha,
+ float beta,
+ const GEMMInfo &gemm_info = GEMMInfo());
+
+ /** Indicates whether or not there is an optimal assembly implementation that can be used to process the given parameters.
+ *
+ * This method has the same use as @ref
+ * NEGEMMConvolutionLayer::has_opt_impl, with the only caveat that
+ * the value of arm_compute::WeightFormat needs to be passed via the
+ * parameter gemm_info.
+ */
+ static Status has_opt_impl(arm_compute::WeightFormat &weight_format,
+ const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ const ITensorInfo *d,
+ const GEMMInfo &gemm_info = GEMMInfo());
+
+ // Inherited methods overridden:
+ void run(ITensorPack &tensors) override;
+ void prepare(ITensorPack &constants) override;
+ experimental::MemoryRequirements workspace() const override;
+
+ /** Indicates if the convolution executes in variable weights mode.
+ *
+ * When ACL executes convolution in variable weights mode, it does
+ * not perform any processing of the weights tensor. Instead, it
+ * utilizes the data as it is given by the user.
+ */
+ bool isVarWeightsKernel() const;
+
+private:
+ enum AuxTensorIdx
+ {
+ /* Slots 0 - 2 reserved for CpuGemmAssemblyDispatch */
+ InterleavedLHS = 3,
+ PreTransposedRHS,
+ Transposed1xWRHS,
+ TempResult,
+ Count
+ };
+
+ std::unique_ptr<kernels::CpuGemmInterleave4x4Kernel> _interleave_kernel{nullptr};
+ std::unique_ptr<CpuTranspose> _pretranspose_b_func{nullptr};
+ std::unique_ptr<kernels::CpuGemmTranspose1xWKernel> _transpose1xW_b_kernel{nullptr};
+ std::unique_ptr<kernels::CpuGemmMatrixMultiplyKernel> _mm_kernel{nullptr};
+ std::unique_ptr<CpuGemmAssemblyDispatch> _asm_glue{nullptr};
+ std::unique_ptr<kernels::CpuGemmMatrixAdditionKernel> _ma_kernel{nullptr};
+ std::unique_ptr<CpuActivation> _alpha_scale_func{nullptr};
+ std::unique_ptr<CpuAdd> _add_bias{nullptr};
+ std::unique_ptr<CpuActivation> _activation_func{nullptr};
+
+ TensorInfo _tmp_a{};
+ TensorInfo _pretransposed_b{};
+ TensorInfo _tmp_b{};
+ TensorInfo _tmp_d{};
+
+ bool _run_vector_matrix_multiplication{false};
+ bool _run_interleave_transpose{
+ true}; /**< If we run CpuGemmInterleave4x4Kernel on lhs and CpuGemmTranspose1xWKernel on rhs */
+ bool _run_alpha_scale{false};
+ bool _run_addition{false};
+ bool _run_bias_addition{false};
+ bool _run_activation{false};
+ bool _reshape_b_only_on_first_run{false};
+ bool _is_prepared{false};
+
+ experimental::MemoryRequirements _aux_mem{Count};
+};
+} // namespace cpu
+} // namespace arm_compute
+#endif // ACL_SRC_CPU_OPERATORS_CPUGEMM_H
diff --git a/src/cpu/operators/CpuGemmConv2d.cpp b/src/cpu/operators/CpuGemmConv2d.cpp
new file mode 100644
index 0000000000..55d950ff4a
--- /dev/null
+++ b/src/cpu/operators/CpuGemmConv2d.cpp
@@ -0,0 +1,992 @@
+/*
+ * Copyright (c) 2021-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/operators/CpuGemmConv2d.h"
+
+#include "arm_compute/core/Size2D.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+#include "src/common/utils/Log.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/MemoryHelpers.h"
+#include "src/core/helpers/Utils.h"
+#include "src/cpu/kernels/CpuCol2ImKernel.h"
+#include "src/cpu/kernels/CpuIm2ColKernel.h"
+#include "src/cpu/kernels/CpuWeightsReshapeKernel.h"
+#include "src/cpu/operators/CpuGemm.h"
+#include "src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h"
+#include "src/cpu/operators/CpuGemmLowpOutputStage.h"
+#include "src/cpu/operators/CpuReshape.h"
+#include "src/cpu/utils/CpuAuxTensorHandler.h"
+
+#include <set>
+#include <tuple>
+
+using namespace arm_compute::misc::shape_calculator;
+using namespace arm_compute::experimental;
+
+namespace arm_compute
+{
+namespace cpu
+{
+
+/** @section note_CpuGemmConv2d_weight_transformation Weight Transformations in CpuGemmConv2d
+ *
+ * A. Terminology
+ * Throughout CpuGemmConv2d, we use the following terms in ways that may differ from other operators / kernels:
+ * - "Transform" or "Reshape" of the weights: they both mean all the operations that we perform on the weight
+ * tensor up until they are consumed by gemm (CpuGemm or CpuGemmLowpMatrixMultiplyCore)
+ * Note that the specific gemm operator may perform further transformations on the weights, but the
+ * transformations here only mean those performed in CpuGemmConv2d
+ * - "Transpose" of weights: The @ref CpuTranspose operation. I.e. transpose of the weights' lowest two
+ * dimensions
+ *
+ * B. Gemm-based conv2d
+ * We want to convert the 2d convolution op (ignoring bias):
+ * dst = conv2d(src, weight)
+ * into a matrix multiplication op:
+ * gemm_dst = gemm(lhs, rhs)
+ *
+ * E.g.: For data layout NHWC
+ * 3 (hi) <----------> (lo) 0
+ * src.shape = [batch, in_h , in_w, in_c]
+ * weight.shape = [out_c, k_h , k_w, in_c]
+ * dst.shape = [batch, out_h, out_w, out_c]
+ *
+ * This requires three transformations:
+ * * src -> lhs, transform conv input to gemm lhs; gemm_lhs is a 2d matrix where each row (or column,
+ * depending on the convention) is a linearized "patch" of the conv_input that corresponds to
+ * the receptive field of the corresponding output element.
+ * The convention is to use "column", but to disambiguate from the column vector of a matrix,
+ * in this documentation we shall use "patch".
+ * This transform is called im2col (for details see @ref CpuIm2ColKernel)
+ * * weight -> rhs, transform conv weight to gemm rhs, known as weight transform/reshape (wt)
+ * * gemm_dst -> dst, transform gemm output back to conv output, known as col2im (for details see
+ * @ref CpuCol2ImKernel)
+ *
+ * This section focuses on the weight transformation and assumes the im2col is already performed
+ *
+ * C. Weight Transformation
+ * After im2col, assume: lhs.shape = [num_patch, patch_size],
+ * where patch_size is the number of elements in a "patch": patch_size = k_h * k_w * in_c
+ * num_patch is the number of patches; we can ignore it here (for details see @ref CpuIm2ColKernel)
+ *
+ * After wt, rhs should have the shape: rhs = [patch_size, out_c]
+ *
+ * Therefore, the weight transformation consists of two steps:
+ * 1. Collapsing all 3 spatial dimensions: [out_c, k_h, k_w, in_c] -> [out_c, patch_size]
+ * 2. Transpose the collapsed shape: [out_c, patch_size] -> [patch_size, out_c]
+ *
+ * D. Implementation
+ * There are 4 paths for weight transformation
+ *
+ * 1. Path 1: Fixed weight format - no transformation
+ * The underlying gemm kernel may adopt fixed weight format (isVarWeightsKernel() == true), which requires
+ * that no weight transformation shall be performed
+ * Note that this no-transform requirement applies both to this op (CpuGemmConv2d) and the constituent ops, up
+ * until the fixed format kernels themselves
+ *
+ * 2. Path 2: Reinterpret then transpose later
+ * If the weight tensor has no "holes" (see @ref has_holes), there are two optimizations we can apply:
+ * - We can ignore the first step (collapsing of spatial dimensions) by simply re-interpreting the shape
+ * in TensorInfo
+ * - Instead of performing transpose here, we can pass the transpose flag to the underlying gemm. The gemm
+ * may then decide to fuse the transpose with any further transformations
+ *
+ * 3. Path 3: Reshape then transpose later
+ * If the weight tensor has holes, then we use a dedicated @ref CpuReshape, followed by transpose later
+ *
+ * 4. Path 4: Fused reshape and transpose
+ * This is only for quantized types for now (TODO: Remove (COMPMID-6596)). We fall back to a legacy
+ * non-optimized kernel @ref CpuWeightsReshapeKernel to perform a fused reshape + transpose
+ *
+ * Path 1 is the long term solution that we shall migrate to once (if) we adopt fixed weight format for all gemm
+ * kernels.
+ * In the short term, Path 2 is the favored, more performant path.
+ */
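+
+/* Worked example of the shape algebra in section C above. The concrete sizes are illustrative
+ * assumptions chosen for this sketch only:
+ *
+ *     weight.shape = [out_c = 8, k_h = 3, k_w = 3, in_c = 16]
+ *     1. Collapse spatial dims: [8, 3, 3, 16] -> [8, patch_size] with patch_size = 3 * 3 * 16 = 144
+ *     2. Transpose:             [8, 144]      -> [144, 8]
+ *
+ * so that gemm computes gemm_dst[num_patch, out_c] = lhs[num_patch, 144] x rhs[144, 8].
+ */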
+
+namespace
+{
+/** Initialize reshaped / transformed weight info
+ *
+ * @param[in] weights Input weights
+ * @param[out] reshaped_weights Transformed weights
+ */
+void initialize_reshaped_weight_info(const ITensorInfo &weights, ITensorInfo &reshaped_weights)
+{
+ auto_init_if_empty(reshaped_weights, weights);
+ if (is_data_type_quantized(weights.data_type()))
+ {
+ // WT method: FusedReshapeAndTranspose
+ reshaped_weights.set_tensor_shape(compute_weights_reshaped_shape(weights, /* has_bias */ false));
+ }
+ else
+ {
+ TensorShape collapsed_weights = weights.tensor_shape();
+ collapsed_weights.collapse(3);
+ reshaped_weights.set_tensor_shape(collapsed_weights);
+ }
+}
+} // namespace
+
+CpuGemmConv2d::WeightTransformMethod CpuGemmConv2d::get_wt_method(const ITensorInfo &weights)
+{
+ // TODO: Extend ReinterpretThenTranspose support for quantized data types COMPMID-6596
+ if (is_data_type_quantized(weights.data_type()))
+ {
+ return WeightTransformMethod::FusedReshapeAndTranspose;
+ }
+ return has_holes(weights) ? WeightTransformMethod::ReshapeThenTranspose
+ : WeightTransformMethod::ReinterpretThenTranspose;
+}
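+
+// For illustration only (a plain reading of get_wt_method() above, not an additional contract):
+// non-quantized weights (e.g. F32) without holes take ReinterpretThenTranspose, non-quantized weights
+// with holes (e.g. row padding) take ReshapeThenTranspose, and quantized weights (e.g. QASYMM8)
+// currently take FusedReshapeAndTranspose.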
+
+CpuGemmConv2d::SkipInfo CpuGemmConv2d::skip_im_col_info(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const PadStrideInfo &conv_info,
+ const Size2D &dilation,
+ const ActivationLayerInfo &act_info)
+{
+ const DataLayout data_layout = src->data_layout();
+ const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+ const unsigned int kernel_width = weights->dimension(idx_width);
+ const unsigned int kernel_height = weights->dimension(idx_height);
+ unsigned int conv_w = 0;
+ unsigned int conv_h = 0;
+ std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width), src->dimension(idx_height), kernel_width,
+ kernel_height, conv_info, dilation);
+ const bool skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 &&
+ conv_info.stride().first == 1 && conv_info.stride().second == 1);
+
+ if (skip_im2col)
+ {
+ const bool skip_col2im =
+ (data_layout == DataLayout::NHWC &&
+ (bool(CpuGemmConv2d::validate_gemm3d(src, weights, act_info, conv_h, /*skip_im2col*/ true))));
+ if (skip_col2im)
+ {
+ return {true, true};
+ }
+ }
+ else
+ {
+ const bool skip_col2im =
+ (data_layout == DataLayout::NHWC &&
+ (bool(CpuGemmConv2d::validate_gemm3d(src, weights, act_info, conv_h, /*skip_im2col*/ false))));
+ if (skip_col2im)
+ {
+ return {false, true};
+ }
+ }
+
+ // Default case when we cannot reinterpret the input and output as 3D.
+ return {false, false};
+}
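+
+// Illustrative note (an assumed example, not an extra code path): for a 1x1 NHWC convolution with
+// stride 1, each output element's receptive field is exactly the in_c-element channel vector of one
+// input pixel, which is already contiguous in memory, so src can be fed to gemm directly and im2col
+// can be skipped.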
+
+CpuGemmConv2d::CpuGemmConv2d()
+ : _weights_reshape(nullptr),
+ _weights_reshape_and_transpose_kernel(nullptr),
+ _im2col_kernel(),
+ _mm_gemm(),
+ _mm_gemmlowp(),
+ _col2im_kernel(),
+ _reshape(),
+ _im2col_output(),
+ _weights_reshaped(),
+ _gemm_output(),
+ _gemm_output_3d(),
+ _data_layout(DataLayout::NCHW),
+ _skip_im2col(false),
+ _skip_col2im(false),
+ _is_quantized(false),
+ _is_prepared(false),
+ _wt_method(WeightTransformMethod::ReshapeThenTranspose),
+ _run_wt(true),
+ _aux_mem(AuxTensorIdx::Count)
+{
+}
+CpuGemmConv2d::~CpuGemmConv2d() = default;
+
+void CpuGemmConv2d::configure_mm(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ ITensorInfo *dst,
+ const ActivationLayerInfo &act_info,
+ bool enable_fast_math,
+ int gemm_3d_depth,
+ bool fixed_format,
+ arm_compute::WeightFormat weight_format)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_mm(src, weights, biases, dst, act_info, enable_fast_math, gemm_3d_depth,
+ _skip_im2col, fixed_format, weight_format));
+
+ // Supported activations in GEMM
+ const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = {
+ ActivationLayerInfo::ActivationFunction::RELU, ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
+ ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU};
+
+ if (_is_quantized)
+ {
+ TensorInfo tmp_src{*src};
+ TensorInfo tmp_weights{*weights};
+ // Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
+ // Extract and negate input and weights offset
+ const QuantizationInfo iqinfo = src->quantization_info();
+ const QuantizationInfo wqinfo = weights->quantization_info();
+ const QuantizationInfo oqinfo = (dst->total_size() == 0) ? iqinfo : dst->quantization_info();
+ const UniformQuantizationInfo uiqinfo = iqinfo.uniform();
+ const UniformQuantizationInfo uoqinfo = oqinfo.uniform();
+ const DataType data_type = src->data_type();
+
+ tmp_src.set_quantization_info(QuantizationInfo(uiqinfo.scale, -uiqinfo.offset));
+ if (!is_data_type_quantized_per_channel(tmp_weights.data_type()))
+ {
+ const UniformQuantizationInfo uwqinfo = wqinfo.uniform();
+ tmp_weights.set_quantization_info(QuantizationInfo(uwqinfo.scale, -uwqinfo.offset));
+ }
+
+ // Merge activation with output stage
+ PixelValue type_min{};
+ PixelValue type_max{};
+ std::tie(type_min, type_max) = get_min_max(data_type);
+ int32_t min_activation = type_min.get<int32_t>();
+ int32_t max_activation = type_max.get<int32_t>();
+
+ if (supported_acts.count(act_info.activation()) != 0)
+ {
+ std::tie(min_activation, max_activation) = get_quantized_activation_min_max(act_info, data_type, uoqinfo);
+ }
+
+ GEMMLowpOutputStageInfo output_info;
+ output_info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
+ output_info.gemmlowp_offset = uoqinfo.offset;
+ output_info.gemmlowp_min_bound = min_activation;
+ output_info.gemmlowp_max_bound = max_activation;
+ output_info.is_quantized_per_channel = (tmp_weights.data_type() == DataType::QSYMM8_PER_CHANNEL);
+ quantization::calculate_quantized_multipliers(iqinfo, wqinfo, oqinfo, output_info);
+
+ _mm_gemmlowp = std::make_unique<CpuGemmLowpMatrixMultiplyCore>();
+ _mm_gemmlowp->configure(&tmp_src, &tmp_weights, biases, dst,
+ GEMMInfo(false, false, true, gemm_3d_depth, _skip_im2col, false, output_info, false,
+ enable_fast_math, false, act_info, fixed_format, weight_format,
+ false /* pretranspose_B. TODO: COMPMID-6596 */));
+
+ auto mm_mem_req = _mm_gemmlowp->workspace();
+ for (unsigned int cont = 0; cont < mm_mem_req.size(); ++cont)
+ {
+ _aux_mem[cont] = mm_mem_req[cont];
+ }
+ }
+ else
+ {
+ // Create GEMMInfo structure
+ const GEMMInfo &gemm_info =
+ GEMMInfo(false, false, true /* Reshape weights only for the first run */, gemm_3d_depth,
+ _skip_im2col /* Reinterpret the input as 3D if im2col is skipped */, false,
+ GEMMLowpOutputStageInfo(), false, enable_fast_math, false, act_info, fixed_format, weight_format,
+                    true /*pretranspose_B. For fp gemm (wt paths 1 - 3), we always pretranspose B (for wt path 1 this
+                            flag is ignored)*/);
+ // Configure matrix multiply function
+ _mm_gemm = std::make_unique<CpuGemm>();
+ _mm_gemm->configure(src, weights, biases, dst, 1.0f, 1.0f, gemm_info);
+ auto mm_mem_req = _mm_gemm->workspace();
+ for (unsigned int cont = 0; cont < mm_mem_req.size(); ++cont)
+ {
+ _aux_mem[cont] = mm_mem_req[cont];
+ }
+ }
+}
+
+Status CpuGemmConv2d::validate_mm(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const ActivationLayerInfo &act_info,
+ bool enable_fast_math,
+ int gemm_3d_depth,
+ bool skip_im2col,
+ bool fixed_format,
+ arm_compute::WeightFormat weight_format)
+{
+ const DataType data_type = src->data_type();
+ const bool is_quantized = is_data_type_quantized_asymmetric(data_type);
+ const bool is_activation_enabled = act_info.enabled();
+
+ if (is_quantized)
+ {
+ // Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
+ // Extract and negate input and weights offset
+ const QuantizationInfo &iqinfo = src->quantization_info();
+ const QuantizationInfo &wqinfo = weights->quantization_info();
+ const QuantizationInfo &oqinfo = (dst->total_size() == 0) ? iqinfo : dst->quantization_info();
+ const UniformQuantizationInfo uoqinfo = oqinfo.uniform();
+
+ // Merge activation with output stage
+ PixelValue type_min{};
+ PixelValue type_max{};
+ std::tie(type_min, type_max) = get_min_max(data_type);
+ int32_t min_activation = type_min.get<int32_t>();
+ int32_t max_activation = type_max.get<int32_t>();
+
+ const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = {
+ ActivationLayerInfo::ActivationFunction::RELU, ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
+ ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU};
+ if (is_activation_enabled && supported_acts.count(act_info.activation()) != 0)
+ {
+ std::tie(min_activation, max_activation) = get_quantized_activation_min_max(act_info, data_type, uoqinfo);
+ }
+
+ GEMMLowpOutputStageInfo output_info;
+ output_info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
+ output_info.gemmlowp_offset = uoqinfo.offset;
+ output_info.gemmlowp_min_bound = min_activation;
+ output_info.gemmlowp_max_bound = max_activation;
+ output_info.is_quantized_per_channel = (weights->data_type() == DataType::QSYMM8_PER_CHANNEL);
+ ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multipliers(iqinfo, wqinfo, oqinfo, output_info));
+
+ // Perform validation step on GEMMLowp
+ std::unique_ptr<ITensorInfo> input_qa = src->clone();
+ std::unique_ptr<ITensorInfo> weights_qa = weights->clone();
+ input_qa->set_quantization_info(QuantizationInfo(iqinfo.uniform().scale, -iqinfo.uniform().offset));
+ weights_qa->set_quantization_info(QuantizationInfo(wqinfo.uniform().scale, -wqinfo.uniform().offset));
+
+ return CpuGemmLowpMatrixMultiplyCore::validate(input_qa.get(), weights_qa.get(), biases, dst,
+ GEMMInfo(false, false, true, gemm_3d_depth, skip_im2col, false,
+ output_info, false, enable_fast_math, false, act_info,
+ false /* pretranspose_B. TODO: COMPMID-6596 */));
+ }
+ else
+ {
+ // Create GEMMInfo structure
+ const GEMMInfo gemm_info =
+ GEMMInfo(false, false, true /* Reshape weights only for the first run */, gemm_3d_depth,
+ skip_im2col /* Reinterpret the input as 3D if im2col is skipped */, false,
+ GEMMLowpOutputStageInfo(), false, enable_fast_math, false, act_info, fixed_format, weight_format,
+                     true /*pretranspose_B. For fp gemm (wt paths 1 - 3), we always pretranspose B (for wt path 1 this
+                             flag is ignored)*/);
+
+ // Perform validation step on Matrix multiply function
+ return CpuGemm::validate(src, weights, biases, dst, 1.0f, 1.0f, gemm_info);
+ }
+}
+
+Status CpuGemmConv2d::validate_gemm3d(const ITensorInfo *input_info,
+ const ITensorInfo *weights_info,
+ const ActivationLayerInfo &act_info,
+ int gemm_3d_depth,
+ bool skip_im2col)
+{
+ const DataType data_type = input_info->data_type();
+ const unsigned int mult_y = skip_im2col ? 1U : gemm_3d_depth;
+ const unsigned int mult_z = skip_im2col ? gemm_3d_depth : 1U;
+
+ // Set dummy tensor shapes for the validation
+ const TensorInfo dummy_input_info(TensorShape(4U, 4U * mult_y, 1U * mult_z), 1, data_type,
+ input_info->quantization_info());
+ const TensorInfo dummy_weights_info(TensorShape(4U, 4U), 1, data_type, weights_info->quantization_info());
+ const TensorInfo dummy_output_info(TensorShape(4U, 4U, gemm_3d_depth), 1, data_type,
+ input_info->quantization_info());
+
+ return validate_mm(&dummy_input_info, &dummy_weights_info, nullptr, &dummy_output_info, act_info, false,
+ gemm_3d_depth, skip_im2col);
+}
+
+void CpuGemmConv2d::configure(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info,
+ const Size2D &dilation,
+ const ActivationLayerInfo &act_info,
+ bool enable_fast_math,
+ unsigned int num_groups)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
+ ARM_COMPUTE_UNUSED(num_groups, weights_info);
+ ARM_COMPUTE_ERROR_THROW_ON(CpuGemmConv2d::validate(src, weights, biases, dst, conv_info, weights_info, dilation,
+ act_info, enable_fast_math, num_groups));
+ ARM_COMPUTE_LOG_PARAMS(src, weights, biases, dst, conv_info, weights_info, dilation, act_info, enable_fast_math,
+ num_groups);
+
+ const DataType data_type = src->data_type();
+ const DataLayout data_layout = src->data_layout();
+ const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+ const int idx_channel = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+ const int idx_kernels = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES);
+
+ const unsigned int kernel_width = weights->dimension(idx_width);
+ const unsigned int kernel_height = weights->dimension(idx_height);
+
+ _is_prepared = weights_info.retain_internal_weights();
+ _is_quantized = is_data_type_quantized_asymmetric(src->data_type());
+ _data_layout = data_layout;
+ _skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 &&
+ conv_info.stride().first == 1 && conv_info.stride().second == 1);
+
+ const ITensorInfo *gemm_input_to_use = src;
+ ITensorInfo *gemm_output_to_use = dst;
+
+ // Get convolved dimensions
+ unsigned int conv_w = 0;
+ unsigned int conv_h = 0;
+ std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width), src->dimension(idx_height), kernel_width,
+ kernel_height, conv_info, dilation);
+
+ ARM_COMPUTE_ERROR_ON_MSG((dst->dimension(idx_width) != conv_w) || (dst->dimension(idx_height) != conv_h),
+ "Output shape does not match the expected one");
+
+ // Check if GEMM3D is supported
+ const CpuGemmConv2d::SkipInfo skip_info =
+ CpuGemmConv2d::skip_im_col_info(src, weights, conv_info, dilation, act_info);
+ _skip_im2col = skip_info.skip_im2col;
+ _skip_col2im = skip_info.skip_col2im;
+
+ // Get parameters from conv_info
+ unsigned int stride_x = 0;
+ unsigned int stride_y = 0;
+ std::tie(stride_x, stride_y) = conv_info.stride();
+
+ // Initialize reshaped weights
+ initialize_reshaped_weight_info(*weights, _weights_reshaped);
+
+ // Create tensor to store im2col reshaped inputs
+ if (!_skip_im2col)
+ {
+ const int block_by = arm_compute::block_by(weights_info.weight_format());
+ unsigned int input_pad_right = 0;
+ if (block_by > 1)
+ {
+ input_pad_right =
+ (src->dimension(idx_channel) % block_by) == 0 ? 0 : block_by - (src->dimension(idx_channel) % block_by);
+ }
+ // Configure
+ _im2col_kernel = std::make_unique<kernels::CpuIm2ColKernel>();
+ _im2col_kernel->configure(src, &_im2col_output, Size2D(kernel_width, kernel_height), conv_info, false, dilation,
+ num_groups, input_pad_right);
+
+ // Update GEMM input
+ gemm_input_to_use = &_im2col_output;
+ }
+
+ const unsigned int mat_weights_cols = weights->dimension(idx_kernels);
+
+ // Create temporary GEMM output tensor in case we cannot skip col2im
+ const DataType output_data_type = data_type == DataType::BFLOAT16 ? DataType::F32 : data_type;
+ if (!_skip_col2im)
+ {
+ TensorShape shape_gemm;
+
+ // Calculate GEMM output shape
+ shape_gemm = _im2col_output.tensor_shape();
+ shape_gemm.set(0, mat_weights_cols);
+ shape_gemm.set(1, conv_w * conv_h);
+
+ _gemm_output = TensorInfo(shape_gemm, 1, output_data_type);
+ _gemm_output.set_quantization_info(dst->quantization_info()).set_data_layout(src->data_layout());
+ _gemm_output_3d = TensorInfo(_gemm_output);
+
+ // Update GEMM output
+ gemm_output_to_use = &_gemm_output;
+ }
+ else
+ {
+ _gemm_output_3d = TensorInfo(*dst);
+ _gemm_output_3d.set_data_type(output_data_type).set_data_layout(src->data_layout()).set_is_resizable(true);
+ _gemm_output = TensorInfo(_gemm_output_3d);
+
+ // Update GEMM output
+ gemm_output_to_use = &_gemm_output_3d;
+ }
+
+ // Configure GEMM
+    // If col2im is skipped, GEMM3D (gemm_3d_depth != 0) must be used in order to avoid reshaping the output matrix
+ const unsigned int gemm_3d_depth = _skip_col2im ? conv_h : 0;
+ const bool fixed_format = weights_info.weight_format() != arm_compute::WeightFormat::UNSPECIFIED;
+ /** @section note_CpuGemmConv2d_weight_use_in_configure Which weights tensor should we use to configure gemm
+ *
+ * A. The problem:
+ * In principle, we should use the weights tensor corresponding to the weights transformation path. I.e.:
+ * - If no weight transformation (_run_wt == false): Use original weights
+ * - else: Use transformed weights
+ * However in practice we have a dilemma:
+ * - We need to know _run_wt before we can configure gemm with the corresponding weights, but
+ * - _run_wt depends on isVarWeightsKernel(), which is only known after gemm is configured
+ *
+ * B. The decision:
+ * To simplify the matter, we decide to always use the transformed weights, regardless of _run_wt
+ *
+ * This decision requires the following conditions:
+     *      1. The underlying gemm, where isVarWeightsKernel() == true, must guarantee that it:
+     *          A. Ignores the flag to transpose weights (GEMMInfo::pretranspose_B)
+     *          B. Uses the weights/B tensor passed to it at prepare() or run() instead of the one passed at configure()
+     *      2. CpuGemmConv2d, where isVarWeightsKernel() == true, must guarantee that it:
+     *          A. Passes the original weights instead of reshaped or reinterpreted weights
+ *
+ * C. Future actions:
+ * Condition 2 is a given, based on our implementation.
+ * If condition 1 cannot hold, we must make changes to the underlying gemm to:
+ * 1. Either expose isVarWeightsKernel() before gemm is configured somehow, or
+ * 2. Take in an additional "original_weights" tensor info at configure
+ */
+ configure_mm(gemm_input_to_use, &_weights_reshaped, biases, gemm_output_to_use, act_info, enable_fast_math,
+ gemm_3d_depth, fixed_format, weights_info.weight_format());
+
+ // Can only decide isVarWeightsKernel after gemm is configured
+ _run_wt = !isVarWeightsKernel();
+
+ if (!_skip_col2im && _data_layout == DataLayout::NCHW)
+ {
+ // Configure col2im
+ _col2im_kernel = std::make_unique<kernels::CpuCol2ImKernel>();
+ _col2im_kernel->configure(gemm_output_to_use, dst, Size2D(conv_w, conv_h));
+ }
+ else
+ {
+ // Configure reshape layer
+ _reshape = std::make_unique<CpuReshape>();
+ _reshape->configure(gemm_output_to_use, dst);
+ }
+
+    // Set up auxiliary tensor memory requirements and lifetimes
+ _aux_mem[Im2ColOutput] =
+ MemoryInfo(offset_int_vec(Im2ColOutput), MemoryLifetime::Temporary, _im2col_output.total_size());
+ // Add WeightsReshaped memory requirement to workspace
+ // Note that in case of WeightTransformMethod::ReinterpretThenTranspose, we do not need to allocate this memory
+    // However, since we cannot determine the weight transformation method until prepare() (see prepare()), we have to
+    // settle for allocating more
+ if (_run_wt)
+ {
+ // Check if GEMM transforms weights
+ // If weight is further transformed by underlying gemm after ReshapeThenTranspose then we can free
+ // WeightsReshaped in prepare
+ // Otherwise WeightsReshaped is the final transformation of weights and needs to persist
+ bool gemm_trans_wei = _aux_mem[GemmAsmPretransposedRHS].size > 0;
+ gemm_trans_wei = _mm_gemm != nullptr ? _aux_mem[GemmTransposed1xWRHS].size > 0 : gemm_trans_wei;
+ gemm_trans_wei = _mm_gemmlowp != nullptr ? _aux_mem[GemmLowpTransposed1xWRHS].size > 0 : gemm_trans_wei;
+
+ _aux_mem[WeightsReshaped] = MemoryInfo(offset_int_vec(WeightsReshaped),
+ gemm_trans_wei ? MemoryLifetime::Prepare : MemoryLifetime::Persistent,
+ _weights_reshaped.total_size());
+ }
+ _aux_mem[GemmOutput] = MemoryInfo(offset_int_vec(GemmOutput), MemoryLifetime::Temporary, _gemm_output.total_size());
+}
+
+Status CpuGemmConv2d::has_opt_impl(arm_compute::WeightFormat &expected_weight_format,
+ const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info,
+ const Size2D &dilation,
+ const ActivationLayerInfo &act_info,
+ const bool enable_fast_math)
+{
+ const DataLayout data_layout = src->data_layout();
+ const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+ const unsigned int kernel_width = weights->dimension(idx_width);
+ const unsigned int kernel_height = weights->dimension(idx_height);
+ unsigned int conv_w = 0;
+ unsigned int conv_h = 0;
+ std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width), src->dimension(idx_height), kernel_width,
+ kernel_height, conv_info, dilation);
+
+ const CpuGemmConv2d::SkipInfo skip_info =
+ CpuGemmConv2d::skip_im_col_info(src, weights, conv_info, dilation, act_info);
+
+ const bool skip_im2col = skip_info.skip_im2col;
+ const bool skip_col2im = skip_info.skip_col2im;
+ const unsigned int gemm_3d_depth = skip_col2im ? conv_h : 0;
+ const bool fixed_format = weights_info.weight_format() != arm_compute::WeightFormat::UNSPECIFIED;
+
+ /** @section note_CpuGemmConv2d_weight_use_in_has_opt_impl Which weights tensor should we use for has_opt_impl
+ *
+ * For the pretranspose_B flag, this shares a similar problem and thus the same decision as that of
+ * @ref note_CpuGemmConv2d_weight_use_in_configure
+ *
+ * But for the weights, we shall always use the original instead of reshaped weights here
+ */
+ const GEMMInfo gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */, gemm_3d_depth,
+ skip_im2col /* Reinterpret the input as 3D if im2col is skipped */, false,
+ GEMMLowpOutputStageInfo(), false, enable_fast_math, false, act_info,
+ fixed_format, weights_info.weight_format(), true /* pretranspose_B */);
+
+ return CpuGemm::has_opt_impl(expected_weight_format, src, weights, biases, dst, gemm_info);
+}
+
+Status CpuGemmConv2d::validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info,
+ const Size2D &dilation,
+ const ActivationLayerInfo &act_info,
+ bool enable_fast_math,
+ unsigned int num_groups)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights_info.are_reshaped(), "Weights already reshaped are not supported!");
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::BFLOAT16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::QSYMM8_PER_CHANNEL, DataType::BFLOAT16,
+ DataType::F16, DataType::F32);
+
+ if (!is_fixed_format(weights_info.weight_format()))
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, weights);
+ }
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_groups > 1, "Grouping (num_groups != 1) is not supported");
+
+ const DataLayout data_layout = src->data_layout();
+ const DataType data_type = src->data_type();
+ const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+ const int idx_channel = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+ const int idx_kernels = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES);
+
+ const unsigned int kernel_width = weights->dimension(idx_width);
+ const unsigned int kernel_height = weights->dimension(idx_height);
+
+ TensorInfo im2col_reshaped_info{};
+ TensorInfo info_gemm{};
+ TensorInfo tmp_info{};
+ TensorInfo weights_reshaped_info{};
+ const ITensorInfo *gemm_input_to_use = src;
+ const ITensorInfo *gemm_output_to_use = dst;
+ const ITensorInfo *weights_to_use = weights;
+
+ const bool append_bias = false;
+ const bool is_quantized = is_data_type_quantized_asymmetric(data_type);
+ const bool is_bf16 = data_type == DataType::BFLOAT16;
+
+ // Get convolved dimensions
+ unsigned int conv_w = 0;
+ unsigned int conv_h = 0;
+
+ std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width), src->dimension(idx_height), kernel_width,
+ kernel_height, conv_info, dilation);
+
+ // Check if GEMM3D is supported
+ const CpuGemmConv2d::SkipInfo skip_info =
+ CpuGemmConv2d::skip_im_col_info(src, weights, conv_info, dilation, act_info);
+ const bool skip_im2col = skip_info.skip_im2col, skip_col2im = skip_info.skip_col2im;
+
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_channel) != src->dimension(idx_channel));
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
+
+ // Validate biases
+ if (biases != nullptr)
+ {
+ if (is_quantized)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32);
+ }
+ else if (is_bf16)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::F32);
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, biases);
+ }
+ ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != dst->dimension(idx_channel));
+ ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
+ }
+
+ unsigned int mat_weights_cols = weights->dimension(idx_kernels);
+ unsigned int mat_weights_rows =
+ weights->dimension(idx_width) * weights->dimension(idx_height) * weights->dimension(idx_channel);
+
+ // Initialize reshaped weights
+ initialize_reshaped_weight_info(*weights, weights_reshaped_info);
+ // No need to call CpuReshape::validate() or CpuTranspose::validate() as the dst info is auto-configured from the
+ // src
+ weights_to_use = &weights_reshaped_info;
+
+ if (!skip_im2col)
+ {
+ const int block_by = arm_compute::block_by(weights_info.weight_format());
+ int input_pad_right = 0;
+ if (block_by > 1)
+ {
+ input_pad_right =
+ (src->dimension(idx_channel) % block_by) == 0 ? 0 : block_by - (src->dimension(idx_channel) % block_by);
+ mat_weights_rows = weights->dimension(idx_width) * weights->dimension(idx_height) *
+ (weights->dimension(idx_channel) + input_pad_right);
+ }
+
+ // Create tensor info for im2col reshaped inputs
+ // For CPU, the batch size is on the fourth dimension
+ TensorShape shape_im2col = src->tensor_shape();
+ shape_im2col.set(0, mat_weights_rows);
+ shape_im2col.set(1, conv_w * conv_h);
+ shape_im2col.set(2, 1);
+
+ im2col_reshaped_info = TensorInfo(shape_im2col, 1, data_type);
+ im2col_reshaped_info.set_quantization_info(src->quantization_info());
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ kernels::CpuIm2ColKernel::validate(src, &im2col_reshaped_info, Size2D(kernel_width, kernel_height),
+ conv_info, append_bias, dilation, num_groups, input_pad_right));
+ gemm_input_to_use = &im2col_reshaped_info;
+ }
+
+ // Create temporary GEMM output tensor in case we cannot skip col2im
+ const DataType output_data_type = data_type == DataType::BFLOAT16 ? DataType::F32 : data_type;
+ if (!skip_col2im)
+ {
+ TensorShape shape_gemm = gemm_input_to_use->tensor_shape();
+ shape_gemm.set(0, mat_weights_cols);
+ shape_gemm.set(1, conv_w * conv_h);
+ info_gemm = TensorInfo(shape_gemm, 1, output_data_type);
+ }
+ else
+ {
+ info_gemm = TensorInfo(dst->tensor_shape(), 1, output_data_type);
+ }
+ info_gemm.set_quantization_info(dst->quantization_info()).set_data_layout(src->data_layout());
+ gemm_output_to_use = &info_gemm;
+ const bool fixed_format = weights_info.weight_format() != arm_compute::WeightFormat::UNSPECIFIED;
+
+ // See note_CpuGemmConv2d_weight_use_in_configure regarding the choice of the weights
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemm_input_to_use, weights_to_use, biases, gemm_output_to_use, act_info,
+ enable_fast_math, skip_col2im ? conv_h : 0, skip_im2col, fixed_format,
+ weights_info.weight_format()));
+
+ // Validate Col2Im/ReshapeLayer
+ if (!skip_col2im && (data_layout == DataLayout::NCHW))
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ kernels::CpuCol2ImKernel::validate(gemm_output_to_use, dst, Size2D(conv_w, conv_h)));
+ }
+
+ return Status{};
+}
+
+void CpuGemmConv2d::run(ITensorPack &tensors)
+{
+ prepare(tensors);
+
+ auto src = tensors.get_const_tensor(ACL_SRC_0);
+ auto dst = tensors.get_tensor(ACL_DST);
+ auto gemm_input_to_use = src;
+
+ CpuAuxTensorHandler im2col_output(offset_int_vec(Im2ColOutput), _im2col_output, tensors, false);
+ CpuAuxTensorHandler gemm_output(offset_int_vec(GemmOutput), _gemm_output, tensors, false);
+
+ bool out_has_padding = _skip_col2im && (dst->info()->padding().bottom != 0 || dst->info()->padding().top != 0);
+ if (!_skip_im2col)
+ {
+ // Run input reshaping
+ unsigned int hint_dim = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
+ unsigned int x_dim = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
+ unsigned int hint_dim_iterations = _im2col_kernel->window().num_iterations(hint_dim);
+ unsigned int x_dim_iterations = _im2col_kernel->window().num_iterations(x_dim);
+ if (hint_dim_iterations < NEScheduler::get().num_threads() && x_dim_iterations > hint_dim_iterations)
+ {
+ hint_dim = x_dim;
+ }
+ ITensorPack pack = {{TensorType::ACL_SRC, src}, {TensorType::ACL_DST, im2col_output.get()}};
+ NEScheduler::get().schedule_op(_im2col_kernel.get(), hint_dim, _im2col_kernel->window(), pack);
+ gemm_input_to_use = im2col_output.get();
+ }
+
+ // Handle the case where output has top/bottom padding
+ const ITensor *out_to_use = out_has_padding ? gemm_output.get() : dst;
+ Tensor gemm3d;
+ _gemm_output_3d.extend_padding(out_to_use->info()->padding());
+ gemm3d.allocator()->soft_init(_gemm_output_3d);
+ gemm3d.allocator()->import_memory(out_to_use->buffer());
+ auto gemm_output_to_use = gemm_output.get();
+
+ if (_skip_im2col)
+ {
+ gemm_output_to_use = &gemm3d;
+ }
+ if (_skip_col2im && !out_has_padding)
+ {
+ gemm_output_to_use = dst;
+ }
+
+ ITensorPack gemm_pack = tensors;
+ gemm_pack.add_const_tensor(TensorType::ACL_SRC_0, gemm_input_to_use);
+ gemm_pack.add_tensor(TensorType::ACL_DST, gemm_output_to_use);
+ // Allocate reshaped weights if required
+ auto weights = gemm_pack.get_const_tensor(TensorType::ACL_SRC_1);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(weights);
+ // Re-interpreted weights. Only tensor shape is changed. Only memory import, no allocation
+ const bool use_reinterpreted_wei = (_run_wt && _wt_method == WeightTransformMethod::ReinterpretThenTranspose);
+ CpuAuxTensorHandler reinterpreted_wei(
+ _weights_reshaped, *weights,
+ /* import only if we chose the ReinterpretThenTranspose path, because otherwise the weight may have been freed */
+ !use_reinterpreted_wei);
+
+ const bool use_reshaped_wei = (_run_wt && (_wt_method == WeightTransformMethod::ReshapeThenTranspose ||
+ _wt_method == WeightTransformMethod::FusedReshapeAndTranspose));
+ CpuAuxTensorHandler reshaped_wei(offset_int_vec(WeightsReshaped), _weights_reshaped, tensors,
+ false /* pack_inject */, !use_reshaped_wei /* bypass_alloc */,
+ !use_reshaped_wei /* bypass_import */
+ );
+ // Update the weights to use if it has been reshaped
+ if (use_reinterpreted_wei)
+ {
+ gemm_pack.add_const_tensor(TensorType::ACL_SRC_1, reinterpreted_wei.get());
+ }
+ else if (use_reshaped_wei)
+ {
+ gemm_pack.add_const_tensor(TensorType::ACL_SRC_1, reshaped_wei.get());
+ }
+
+ // Runs CpuGemm or CpuGemmLowpMatrixMultiplyCore functions
+ _is_quantized ? _mm_gemmlowp->run(gemm_pack) : _mm_gemm->run(gemm_pack);
+
+ // Reshape output matrix
+ if (!_skip_col2im)
+ {
+ if (_data_layout == DataLayout::NCHW)
+ {
+ ITensorPack pack = {{TensorType::ACL_SRC, gemm_output.get()}, {TensorType::ACL_DST, dst}};
+ NEScheduler::get().schedule_op(_col2im_kernel.get(), Window::DimY, _col2im_kernel->window(), pack);
+ }
+ else
+ {
+ ITensorPack pack = {{TensorType::ACL_SRC, gemm_output_to_use}, {TensorType::ACL_DST, dst}};
+ _reshape->run(pack);
+ }
+ }
+ else if (out_has_padding)
+ {
+ ITensorPack pack = {{TensorType::ACL_SRC, gemm_output_to_use}, {TensorType::ACL_DST, dst}};
+ _reshape->run(pack);
+ }
+}
+
+void CpuGemmConv2d::prepare(ITensorPack &tensors)
+{
+ if (!_is_prepared)
+ {
+ auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1);
+ // Determine which weights reshape path to take
+        // Note that this decision can only be made in prepare() instead of configure(), because it relies on the
+        // presence of any holes in the weight tensor, which may change after configure() (e.g. from extending padding)
+ if (_run_wt)
+ {
+ _wt_method = get_wt_method(*(weights->info()));
+ switch (_wt_method)
+ {
+ case (WeightTransformMethod::FusedReshapeAndTranspose):
+ {
+ ARM_COMPUTE_LOG_INFO_WITH_FUNCNAME_ACL("Perform weight transformation: FusedReshapeAndTranspose");
+ _weights_reshape_and_transpose_kernel = std::make_unique<kernels::CpuWeightsReshapeKernel>();
+ _weights_reshape_and_transpose_kernel->configure(weights->info(), nullptr, &_weights_reshaped);
+ break;
+ }
+ case (WeightTransformMethod::ReshapeThenTranspose):
+ {
+ ARM_COMPUTE_LOG_INFO_WITH_FUNCNAME_ACL("Perform weight transformation: ReshapeThenTranspose");
+ _weights_reshape = std::make_unique<CpuReshape>();
+ _weights_reshape->configure(weights->info(), &_weights_reshaped);
+ break;
+ }
+ case (WeightTransformMethod::ReinterpretThenTranspose):
+ {
+ ARM_COMPUTE_LOG_INFO_WITH_FUNCNAME_ACL("Perform weight transformation: ReinterpretThenTranspose");
+ // Nothing to configure
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("Unsupported weight transform method");
+ }
+ }
+ }
+ else
+ {
+ ARM_COMPUTE_LOG_INFO_WITH_FUNCNAME_ACL("No weight transformation is performed");
+ }
+ ITensorPack gemm_pack = tensors;
+ // Allocate reshaped weights if required
+ CpuAuxTensorHandler reinterpreted_wei(
+ _weights_reshaped,
+ *weights); // Re-interpreted weights. Only tensor shape is changed. No allocation
+ CpuAuxTensorHandler reshaped_wei(offset_int_vec(WeightsReshaped), _weights_reshaped, tensors);
+ // Run weights reshape if required
+ if (_run_wt)
+ {
+ switch (_wt_method)
+ {
+ case (WeightTransformMethod::FusedReshapeAndTranspose):
+ {
+ ITensorPack pack = {{TensorType::ACL_SRC, weights}, {TensorType::ACL_DST, reshaped_wei.get()}};
+ NEScheduler::get().schedule_op(_weights_reshape_and_transpose_kernel.get(), Window::DimW,
+ _weights_reshape_and_transpose_kernel->window(), pack);
+ weights->mark_as_unused();
+ gemm_pack.add_const_tensor(TensorType::ACL_SRC_1, reshaped_wei.get());
+ break;
+ }
+ case (WeightTransformMethod::ReshapeThenTranspose):
+ {
+ ITensorPack pack = {{TensorType::ACL_SRC, weights}, {TensorType::ACL_DST, reshaped_wei.get()}};
+ _weights_reshape->run(pack);
+ weights->mark_as_unused();
+ gemm_pack.add_const_tensor(TensorType::ACL_SRC_1, reshaped_wei.get());
+ break;
+ }
+ case (WeightTransformMethod::ReinterpretThenTranspose):
+ {
+ gemm_pack.add_const_tensor(TensorType::ACL_SRC_1, reinterpreted_wei.get());
+ // Nothing to run
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("Unsupported weight transform method");
+ }
+ }
+ }
+ _is_quantized ? _mm_gemmlowp->prepare(gemm_pack) : _mm_gemm->prepare(gemm_pack);
+
+ _is_prepared = true;
+ }
+}
+experimental::MemoryRequirements CpuGemmConv2d::workspace() const
+{
+ return _aux_mem;
+}
+bool CpuGemmConv2d::isVarWeightsKernel() const
+{
+ return _mm_gemm && _mm_gemm->isVarWeightsKernel();
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/operators/CpuGemmConv2d.h b/src/cpu/operators/CpuGemmConv2d.h
new file mode 100644
index 0000000000..48a0d11107
--- /dev/null
+++ b/src/cpu/operators/CpuGemmConv2d.h
@@ -0,0 +1,300 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_CPU_OPERATORS_CPUGEMMCONV2D_H
+#define ACL_SRC_CPU_OPERATORS_CPUGEMMCONV2D_H
+
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+
+#include "src/cpu/ICpuOperator.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace cpu
+{
+class CpuGemm;
+class CpuGemmLowpMatrixMultiplyCore;
+class CpuGemmLowpOutputStage;
+class CpuReshape;
+namespace kernels
+{
+class CpuIm2ColKernel;
+class CpuCol2ImKernel;
+class CpuWeightsReshapeKernel;
+} // namespace kernels
+
+/** Basic function to compute the convolution layer. @ref note_CpuGemmConv2d_weight_transformation */
+class CpuGemmConv2d : public ICpuOperator
+{
+public:
+ /** Constructor */
+ CpuGemmConv2d();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CpuGemmConv2d(const CpuGemmConv2d &) = delete;
+ /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+ CpuGemmConv2d(CpuGemmConv2d &&) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CpuGemmConv2d &operator=(const CpuGemmConv2d &) = delete;
+ /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+ CpuGemmConv2d &operator=(CpuGemmConv2d &&) = delete;
+ /** Destructor */
+ ~CpuGemmConv2d();
+ /** Set the input and output tensors.
+ *
+ * Valid data layouts:
+ * - NHWC
+ * - NCHW
+ *
+ * Valid data type configurations:
+ * |src0 |src1 |src2 |dst |
+ * |:--------------|:------------------|:--------|:--------------|
+ * |F16 |F16 |F16 |F16 |
+ * |F32 |F32 |F32 |F32 |
+ * |BFLOAT16 |BFLOAT16 |BFLOAT16 |BFLOAT16 |
+ * |QASYMM8 |QASYMM8 |S32 |QASYMM8 |
+ * |QASYMM8 |QSYMM8_PER_CHANNEL |S32 |QASYMM8 |
+ * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |QASYMM8_SIGNED |
+ * |QASYMM8_SIGNED |QSYMM8_PER_CHANNEL |S32 |QASYMM8_SIGNED |
+ *
+ * @param[in] src Source tensor info. 3 lower dimensions represent a single input [width, height, IFM],
+ * while every optional dimension from 4 and above represent a batch of inputs.
+ * Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32.
+ * @param[in] weights Weights tensor info. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
+ * Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/BFLOAT16/F16/F32.
+ * @param[in] biases Biases tensor info. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
+ * Data type supported: Should match @p input data type, except for input of QASYMM8/QASYMM8_SIGNED type where biases should be of S32 type.
+ * @param[out] dst Destination tensor info. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
+ * Data types supported: Same as @p input.
+ * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
+ * @param[in] weights_info Specifies if the weights tensor has been reshaped with CpuWeightsReshapeKernel. If this is not part of the fully connected layer the weights
+ * tensor has also been transposed with cpu::kernels::CpuGemmTranspose1xWKernel. Data type supported: Same as @p input.
+ * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
+ * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported.
+     * @param[in]  enable_fast_math (Optional) Enable fast math computation. If this flag is set, the function may dispatch the fastest implementation
+     *                              available, which can introduce a drop in accuracy. Default is false
+ * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is not supported
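+     *
+     * Example (illustrative sketch only; the shapes and surrounding runtime setup are assumptions,
+     * not taken from any particular test or network):
+     * @code{.cpp}
+     * // NHWC shapes in ACL dimension order (lowest dimension first):
+     * //   src [in_c, in_w, in_h, batch], weights [in_c, k_w, k_h, out_c], dst [out_c, out_w, out_h, batch]
+     * TensorInfo src(TensorShape(16U, 32U, 32U, 1U), 1, DataType::F32);
+     * TensorInfo weights(TensorShape(16U, 3U, 3U, 8U), 1, DataType::F32);
+     * TensorInfo dst(TensorShape(8U, 30U, 30U, 1U), 1, DataType::F32);
+     * src.set_data_layout(DataLayout::NHWC);
+     * weights.set_data_layout(DataLayout::NHWC);
+     * dst.set_data_layout(DataLayout::NHWC);
+     *
+     * CpuGemmConv2d conv;
+     * conv.configure(&src, &weights, nullptr, &dst, PadStrideInfo(1, 1, 0, 0));
+     * // The actual tensors are bound later through the ITensorPack passed to prepare()/run().
+     * @endcode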
+ */
+ void configure(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info = WeightsInfo(),
+ const Size2D &dilation = Size2D(1U, 1U),
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ bool enable_fast_math = false,
+ unsigned int num_groups = 1);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+     * Similar to CpuGemmConv2d::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info = WeightsInfo(),
+ const Size2D &dilation = Size2D(1U, 1U),
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ bool enable_fast_math = false,
+ unsigned int num_groups = 1);
+
+ /** Indicates whether or not there is an optimal assembly implementation that can be used to process the given parameters.
+ *
+ * The parameter list is the same as @ref NEGEMMConvolutionLayer::has_opt_impl
+ *
+ * @return a status.
+ */
+ static Status has_opt_impl(arm_compute::WeightFormat &expected_weight_format,
+ const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info = WeightsInfo(),
+ const Size2D &dilation = Size2D(1U, 1U),
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ const bool enable_fast_math = false);
+
+ // Inherited methods overridden:
+ void run(ITensorPack &tensors) override;
+ void prepare(ITensorPack &tensors) override;
+ experimental::MemoryRequirements workspace() const override;
+
+private:
+ /** Configures the appropriate matrix multiply routine
+ *
+ * @param[in] src Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32.
+ * @param[in] weights Weights tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/BFLOAT16/F16/F32.
+ * @param[in] biases Biases tensor info. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
+ * Data type supported: Should match @p input data type, except for input of QASYMM8/QASYMM8_SIGNED type where biases should be of S32 type.
+ * @param[out] dst Output tensor info. Data types supported: Same as @p input,
+ * except for input of QASYMM8/QASYMM8_SIGNED type where output should be of S32 type.
+ * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported.
+     * @param[in] enable_fast_math (Optional) Enable fast math computation. If this flag is set, the function may dispatch the fastest implementation
+     *                             available, which can introduce a drop in accuracy. Default is false
+ * @param[in] gemm_3d_depth (Optional) Depth of GEMM 3D (Defaults to 1)
+ * @param[in] fixed_format (Optional) Select GEMM execution with variable weights.
+ * @param[in] weight_format (Optional) The layout to be used for the weights tensor when running GEMM with variable weights.
+ */
+ void configure_mm(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ ITensorInfo *output,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ bool enable_fast_math = false,
+ int gemm_3d_depth = 1,
+ bool fixed_format = false,
+ arm_compute::WeightFormat weight_format = arm_compute::WeightFormat::UNSPECIFIED);
+ /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMConvolutionLayer matrix multiply routines
+ *
+ * @param[in] src Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32.
+ * @param[in] weights Weights tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/BFLOAT16/F16/F32.
+ * @param[in] biases Biases tensor info. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
+ * Data type supported: Should match @p input data type, except for input of QASYMM8/QASYMM8_SIGNED type where biases should be of S32 type.
+ * @param[in] dst Output tensor info. Data types supported: Same as @p input,
+ * except for input of QASYMM8/QASYMM8_SIGNED type where output should be of S32 type.
+ * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported.
+     * @param[in] enable_fast_math (Optional) Enable fast math computation. If this flag is set, the function may dispatch the fastest implementation
+     *                             available, which can introduce a drop in accuracy. Default is false
+ * @param[in] gemm_3d_depth (Optional) Depth of GEMM 3D (Defaults to 1)
+     * @param[in] skip_im2col      (Optional) Flag which specifies if im2col has to be skipped, i.e. 1x1 convolution with NHWC data layout. (Defaults to false)
+ * @param[in] fixed_format (Optional) Select GEMM execution with variable weights.
+ * @param[in] weight_format (Optional) The layout to be used for the weights tensor when running GEMM with variable weights.
+ *
+ * @return a status
+ */
+ static Status validate_mm(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ bool enable_fast_math = false,
+ int gemm_3d_depth = 1,
+ bool skip_im2col = false,
+ bool fixed_format = false,
+ arm_compute::WeightFormat weight_format = arm_compute::WeightFormat::UNSPECIFIED);
+    /** Static function to check if GEMM3D is supported in @ref NEGEMM or in @ref CpuGemmLowpMatrixMultiplyCore
+ *
+ * @param[in] src Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32.
+ * @param[in] weights Weights tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32.
+ * @param[in] act_info Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported.
+ * @param[in] gemm_3d_depth Depth of GEMM 3D
+ * @param[in] skip_im2col Flag which specifies if im2col has to be skipped. i.e. 1x1 convolution with NHWC data layout
+ *
+ * @return a status
+ */
+ static Status validate_gemm3d(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ActivationLayerInfo &act_info,
+ int gemm_3d_depth,
+ bool skip_im2col);
+
+ struct SkipInfo
+ {
+ bool skip_im2col;
+ bool skip_col2im;
+ };
+
+ /** Static function to provide skip_im2col and skip_col2im information.
+ *
+ * @param[in] src Input tensor info.
+ * @param[in] weights Weights tensor info.
+ * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
+ * @param[in] dilation Dilation, in elements, across x and y.
+ * @param[in] act_info Activation layer information in case of a fused activation.
+ *
+ * @return a SkipInfo instance.
+ */
+ static SkipInfo skip_im_col_info(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const PadStrideInfo &conv_info,
+ const Size2D &dilation,
+ const ActivationLayerInfo &act_info);
+
+ /** Indicates if the convolution executes in variable weights mode.
+ *
+ * Similar to @ref CpuGemm::isVarWeightsKernel
+ */
+ bool isVarWeightsKernel() const;
+ enum AuxTensorIdx
+ {
+ GemmAsmPretransposedRHS = 2, // CpuGemmAssemblyDispatch::Pretranspose
+ GemmTransposed1xWRHS = 5, // CpuGemm::Transposed1xWRHS
+ GemmLowpTransposed1xWRHS = 6, // CpuGemmLowpMatrixMultiplyCore::TmpB
+ /* Slots 0 - 9 reserved and shared by CpuGemmLowpMatrixMultiplyCore and CpuGemm */
+ Im2ColOutput = 10,
+ WeightsReshaped,
+ GemmOutput,
+ Count
+ };
+
+ /** Weight transformation method. See @ref note_CpuGemmConv2d_weight_transformation */
+ enum class WeightTransformMethod
+ {
+ ReinterpretThenTranspose,
+ ReshapeThenTranspose,
+ FusedReshapeAndTranspose,
+ };
+
+ /** Select weight transformation method
+ *
+ * @param[in] weights Input weights
+ *
+ * @return WeightTransformMethod
+ */
+ static WeightTransformMethod get_wt_method(const ITensorInfo &weights);
+
+ std::unique_ptr<CpuReshape> _weights_reshape;
+ std::unique_ptr<kernels::CpuWeightsReshapeKernel> _weights_reshape_and_transpose_kernel;
+ std::unique_ptr<kernels::CpuIm2ColKernel> _im2col_kernel;
+ std::unique_ptr<CpuGemm> _mm_gemm;
+ std::unique_ptr<CpuGemmLowpMatrixMultiplyCore> _mm_gemmlowp;
+ std::unique_ptr<kernels::CpuCol2ImKernel> _col2im_kernel;
+ std::unique_ptr<CpuReshape> _reshape;
+
+ TensorInfo _im2col_output;
+ TensorInfo _weights_reshaped;
+ TensorInfo _gemm_output;
+ TensorInfo _gemm_output_3d;
+
+ DataLayout _data_layout;
+
+ bool _skip_im2col;
+ bool _skip_col2im;
+ bool _is_quantized;
+ bool _is_prepared;
+ WeightTransformMethod _wt_method;
+ bool _run_wt;
+
+ experimental::MemoryRequirements _aux_mem{Count};
+};
+} // namespace cpu
+} // namespace arm_compute
+#endif // ACL_SRC_CPU_OPERATORS_CPUGEMMCONV2D_H
diff --git a/src/cpu/operators/CpuGemmDirectConv2d.cpp b/src/cpu/operators/CpuGemmDirectConv2d.cpp
new file mode 100644
index 0000000000..9187927541
--- /dev/null
+++ b/src/cpu/operators/CpuGemmDirectConv2d.cpp
@@ -0,0 +1,256 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/operators/CpuGemmDirectConv2d.h"
+
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/runtime/FunctionDescriptors.h"
+
+#include "src/common/utils/Log.h"
+#include "src/core/helpers/MemoryHelpers.h"
+#include "src/cpu/utils/CpuAuxTensorHandler.h"
+#include "support/Cast.h"
+
+#include <set>
+
+namespace arm_compute
+{
+namespace cpu
+{
+using namespace arm_compute::experimental;
+using namespace arm_compute::utils::cast;
+
+namespace
+{
+GEMMLowpOutputStageInfo calculate_output_stage_metadata(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *dst,
+ const ActivationLayerInfo &act)
+{
+ // Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
+ // Extract and negate input and weights offset
+ const QuantizationInfo iqinfo = src->quantization_info();
+ const QuantizationInfo wqinfo = weights->quantization_info();
+ const QuantizationInfo oqinfo = (dst->total_size() == 0) ? iqinfo : dst->quantization_info();
+ const UniformQuantizationInfo uoqinfo = oqinfo.uniform();
+ const DataType data_type = src->data_type();
+ // Merge activation with output stage
+ const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = {
+ ActivationLayerInfo::ActivationFunction::RELU, ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
+ ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU};
+ PixelValue type_min{};
+ PixelValue type_max{};
+ std::tie(type_min, type_max) = get_min_max(data_type);
+ int32_t min_activation = type_min.get<int32_t>();
+ int32_t max_activation = type_max.get<int32_t>();
+ if (supported_acts.count(act.activation()) != 0)
+ {
+ std::tie(min_activation, max_activation) = get_quantized_activation_min_max(act, data_type, uoqinfo);
+ }
+ GEMMLowpOutputStageInfo os_info;
+ os_info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
+ os_info.gemmlowp_offset = uoqinfo.offset;
+ os_info.gemmlowp_min_bound = min_activation;
+ os_info.gemmlowp_max_bound = max_activation;
+ os_info.is_quantized_per_channel = (weights->data_type() == DataType::QSYMM8_PER_CHANNEL);
+ quantization::calculate_quantized_multipliers(iqinfo, wqinfo, oqinfo, os_info);
+ return os_info;
+}
+cpu::AsmGemmInfo init_assembly_metadata(const Conv2dInfo &info, bool is_indirect)
+{
+ cpu::AsmGemmInfo asm_info;
+ asm_info.method = is_indirect ? cpu::AsmConvMethod::Indirect : cpu::AsmConvMethod::Conv;
+ asm_info.ps_info = info.conv_info;
+ asm_info.activation_info = info.act_info;
+ asm_info.depth_output_gemm3d = true;
+ asm_info.reinterpret_input_as_3d = true;
+ asm_info.padding_top = info.conv_info.pad_top();
+ asm_info.padding_left = info.conv_info.pad_left();
+ asm_info.padding_value = 0.f;
+ asm_info.negated_offsets = false;
+ asm_info.fast_mode = info.enable_fast_math;
+ asm_info.fixed_format = info.weights_info.weight_format() != WeightFormat::UNSPECIFIED;
+ asm_info.weight_format = info.weights_info.weight_format();
+ return asm_info;
+}
+} // namespace
+
+CpuGemmDirectConv2d::CpuGemmDirectConv2d()
+ : _gemm_asm_func(std::make_unique<CpuGemmAssemblyDispatch>()),
+ _activation_func(std::make_unique<CpuActivation>()),
+ _weights_permute_func(std::make_unique<CpuPermute>()),
+ _aux_mem(AuxTensorIdx::Count),
+ _perm_weights(),
+ _run_activation(false),
+ _is_prepared(false)
+{
+}
+
+CpuGemmDirectConv2d::~CpuGemmDirectConv2d() = default;
+
+void CpuGemmDirectConv2d::configure(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ ITensorInfo *dst,
+ const Conv2dInfo &info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
+ ARM_COMPUTE_ERROR_THROW_ON(CpuGemmDirectConv2d::validate(src, weights, biases, dst, info));
+ ARM_COMPUTE_LOG_PARAMS(src, weights, biases, dst, info);
+
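+ // Run a separate activation stage only when the assembly dispatch cannot fuse the requested activation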
+ _run_activation = info.act_info.enabled() && !_gemm_asm_func->is_activation_supported(info.act_info);
+ _is_prepared = false;
+
+ _weights_permute_func->configure(weights, &_perm_weights, PermutationVector{3, 0, 1, 2});
+
+ // Configure assembly dispatch
+ cpu::AsmGemmInfo asm_info = init_assembly_metadata(info, false);
+ if (is_data_type_quantized(src->data_type()))
+ {
+ asm_info.output_stage = calculate_output_stage_metadata(src, weights, dst, info.act_info);
+ }
+ _gemm_asm_func->configure(src, &_perm_weights, biases, dst, asm_info);
+
+ // Configure activation
+ if (_run_activation)
+ {
+ _activation_func->configure(dst, nullptr, info.act_info);
+ }
+
+ // Add auxiliary memory requirements of the assembly dispatch
+ const auto asm_mem_req = _gemm_asm_func->workspace();
+ for (unsigned int slot = 0; slot < asm_mem_req.size(); ++slot)
+ {
+ _aux_mem[slot] = asm_mem_req[slot];
+ }
+
+ if (_aux_mem[Pretranspose].size > 0)
+ {
+ // Release the permuted weights at the end of prepare() as they are further transposed by the assembly dispatch
+ _aux_mem[PermutedWeights] =
+ MemoryInfo(offset_int_vec(PermutedWeights), MemoryLifetime::Prepare, weights->total_size());
+ }
+ else
+ {
+ // We must permute weights if they are WeightFormat::UNSPECIFIED
+ if (info.weights_info.weight_format() == WeightFormat::UNSPECIFIED)
+ _aux_mem[PermutedWeights] =
+ MemoryInfo(offset_int_vec(PermutedWeights), MemoryLifetime::Persistent, weights->total_size());
+ }
+}
+Status CpuGemmDirectConv2d::validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const Conv2dInfo &info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::BFLOAT16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::QSYMM8_PER_CHANNEL, DataType::BFLOAT16,
+ DataType::F16, DataType::F32);
+ if (!is_fixed_format(info.weights_info.weight_format()))
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, weights);
+ }
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.num_groups > 1, "Grouping (num_groups != 1) is not supported on Neon");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_layout() != DataLayout::NHWC, "Data layout supported is NHWC");
+ const DataType data_type = src->data_type();
+ const TensorShape i_shape = src->tensor_shape();
+ const TensorShape w_shape = weights->tensor_shape();
+ ARM_COMPUTE_RETURN_ERROR_ON(w_shape[0] != i_shape[0]);
+ ARM_COMPUTE_RETURN_ERROR_ON(info.dilation != Size2D(1U, 1U));
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
+ // Validate biases
+ if (biases != nullptr)
+ {
+ if (is_data_type_quantized_asymmetric(data_type))
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32);
+ }
+ else if (data_type == DataType::BFLOAT16)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::F32);
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, biases);
+ }
+ ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(3));
+ ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
+ }
+
+ cpu::AsmGemmInfo asm_info = init_assembly_metadata(info, false);
+ ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuGemmAssemblyDispatch::validate(src, weights, biases, dst, asm_info));
+ return Status{};
+}
+void CpuGemmDirectConv2d::run(ITensorPack &tensors)
+{
+ prepare(tensors);
+
+ _gemm_asm_func->run(tensors);
+ if (_run_activation)
+ {
+ ITensor *io = tensors.get_tensor(ACL_DST);
+ ITensorPack pack{{ACL_SRC, io}, {ACL_DST, io}};
+ _activation_func->run(pack);
+ }
+}
+
+void CpuGemmDirectConv2d::prepare(ITensorPack &tensors)
+{
+ if (!_is_prepared)
+ {
+ // If we are using a fixed-format kernel the weights are already reshaped
+ if (_gemm_asm_func && _gemm_asm_func->isVarWeightsKernel())
+ {
+ _gemm_asm_func->prepare(tensors);
+ _is_prepared = true;
+ return;
+ }
+ const ITensor *weights = tensors.get_const_tensor(ACL_SRC_1);
+ ITensor *weights_aux =
+ utils::cast::polymorphic_cast<ITensor *>(tensors.get_tensor(offset_int_vec(PermutedWeights)));
+ ARM_COMPUTE_ERROR_ON_NULLPTR(weights, weights_aux);
+
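+ // Bind the auxiliary buffer to the permuted-weights tensor info and run the permutation into it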
+ CpuAuxTensorHandler permuted_weights(_perm_weights, *weights_aux);
+ ITensorPack permute_tensors{{ACL_SRC, weights}, {ACL_DST, permuted_weights.get()}};
+ _weights_permute_func->run(permute_tensors);
+
+ tensors.add_const_tensor(ACL_SRC_1, permuted_weights.get());
+ // Call prepare of assembly dispatch
+ _gemm_asm_func->prepare(tensors);
+
+ _is_prepared = true;
+ }
+}
+
+experimental::MemoryRequirements CpuGemmDirectConv2d::workspace() const
+{
+ return _aux_mem;
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/operators/CpuGemmDirectConv2d.h b/src/cpu/operators/CpuGemmDirectConv2d.h
new file mode 100644
index 0000000000..a7365615b9
--- /dev/null
+++ b/src/cpu/operators/CpuGemmDirectConv2d.h
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_CPU_OPERATORS_CPUGEMMDIRECTCONV2D_H
+#define ACL_SRC_CPU_OPERATORS_CPUGEMMDIRECTCONV2D_H
+
+#include "arm_compute/core/TensorInfo.h"
+
+#include "src/core/common/Macros.h"
+#include "src/cpu/ICpuOperator.h"
+#include "src/cpu/operators/CpuActivation.h"
+#include "src/cpu/operators/CpuPermute.h"
+#include "src/cpu/operators/internal/CpuGemmAssemblyDispatch.h"
+
+namespace arm_compute
+{
+// Forward declarations
+class ITensor;
+struct Conv2dInfo;
+namespace cpu
+{
+class CpuGemmDirectConv2d : public ICpuOperator
+{
+public:
+ CpuGemmDirectConv2d();
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuGemmDirectConv2d);
+ ~CpuGemmDirectConv2d();
+ /** Set the input and output tensors.
+ *
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src0 |src1 |src2 |dst |
+ * |:--------------|:--------------|:--------------|:--------------|
+ * |QASYMM8 |QASYMM8 |S32 |QASYMM8 |
+ * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |QASYMM8_SIGNED |
+ * |F16 |F16 |F16 |F16 |
+ * |F32 |F32 |F32 |F32 |
+ * |BFLOAT16 |BFLOAT16 |BFLOAT16 |BFLOAT16 |
+ *
+ * @param[in] src Source tensor info. 3 lower dimensions represent a single input [width, height, IFM],
+ * while every optional dimension from 4 and above represent a batch of inputs.
+ * Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32.
+ * @param[in] weights Weights tensor info. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
+ * Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/BFLOAT16/F16/F32.
+ * @param[in] biases Biases tensor info. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
+ * Data type supported: Should match @p input data type, except for input of QASYMM8/QASYMM8_SIGNED type where biases should be of S32 type.
+ * @param[in] dst Destination tensor info. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent a batch of outputs.
+ * Data types supported: Same as @p input.
+ * @param[in] info Contains padding and stride information described in @ref PadStrideInfo.
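+ *
+ * @note Typical usage (an illustrative sketch, not a normative contract): configure() the operator, allocate the
+ *       auxiliary tensors reported by workspace(), then call run(); prepare() is invoked on the first run to
+ *       permute the constant weights.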
+ */
+ void configure(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ ITensorInfo *dst,
+ const Conv2dInfo &info);
+ /** Static function to check if given info will lead to a valid configuration of @ref CpuGemmDirectConv2d
+ *
+ * Similar to CpuGemmDirectConv2d::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const Conv2dInfo &info);
+
+ // Inherited methods overridden:
+ void run(ITensorPack &tensors) override;
+ void prepare(ITensorPack &constants) override;
+ experimental::MemoryRequirements workspace() const override;
+
+private:
+ enum AuxTensorIdx
+ {
+ GemmTemp0 = 0,
+ GemmTemp1,
+ Pretranspose,
+ /* Slots above (0-2) are reserved for CpuGemmAssemblyDispatch */
+ PermutedWeights,
+ Count
+ };
+
+ std::unique_ptr<CpuGemmAssemblyDispatch> _gemm_asm_func;
+ std::unique_ptr<CpuActivation> _activation_func;
+ std::unique_ptr<CpuPermute> _weights_permute_func;
+ experimental::MemoryRequirements _aux_mem;
+ TensorInfo _perm_weights;
+ bool _run_activation;
+ bool _is_prepared;
+};
+} // namespace cpu
+} // namespace arm_compute
+
+#endif // ACL_SRC_CPU_OPERATORS_CPUGEMMDIRECTCONV2D_H
diff --git a/src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp b/src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp
new file mode 100644
index 0000000000..f3396fbb5c
--- /dev/null
+++ b/src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp
@@ -0,0 +1,779 @@
+/*
+ * Copyright (c) 2021-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/KernelDescriptors.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+
+#include "src/common/utils/Log.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/MemoryHelpers.h"
+#include "src/cpu/kernels/CpuConvertQuantizedSignednessKernel.h"
+#include "src/cpu/kernels/CpuGemmInterleave4x4Kernel.h"
+#include "src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.h"
+#include "src/cpu/kernels/CpuGemmLowpMatrixReductionKernel.h"
+#include "src/cpu/kernels/CpuGemmLowpOffsetContributionKernel.h"
+#include "src/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.h"
+#include "src/cpu/kernels/CpuGemmTranspose1xWKernel.h"
+#include "src/cpu/operators/CpuActivation.h"
+#include "src/cpu/operators/internal/CpuGemmAssemblyDispatch.h"
+#include "src/cpu/utils/CpuAuxTensorHandler.h"
+
+using namespace arm_compute::misc::shape_calculator;
+using namespace arm_compute::experimental;
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace
+{
+cpu::AsmGemmInfo init_assembly_metadata(const GEMMInfo &info)
+{
+ cpu::AsmGemmInfo asm_info;
+ asm_info.method = cpu::AsmConvMethod::Im2Col;
+ asm_info.reinterpret_input_as_3d = info.reinterpret_input_as_3d();
+ asm_info.depth_output_gemm3d = info.depth_output_gemm3d();
+ asm_info.activation_info = info.activation_info();
+ asm_info.output_stage = info.gemmlowp_output_stage();
+ asm_info.fast_mode = info.fast_math();
+ asm_info.accumulate = info.accumulate();
+
+ return asm_info;
+}
+} // namespace
+
+CpuGemmLowpMatrixMultiplyCore::CpuGemmLowpMatrixMultiplyCore()
+ : _asm_glue(std::make_unique<CpuGemmAssemblyDispatch>()),
+ _mm_kernel(),
+ _mtx_a_reshape_kernel(),
+ _mtx_b_reshape_kernel(),
+ _mtx_a_reduction_kernel(),
+ _mtx_b_reduction_kernel(),
+ _offset_contribution_kernel(),
+ _offset_contribution_output_stage_kernel(),
+ _activation_func(),
+ _convert_to_signed_asymm(),
+ _convert_from_signed_asymm(),
+ _vector_sum_col(),
+ _vector_sum_row(),
+ _tmp_a(),
+ _tmp_b(),
+ _mm_result_s32(),
+ _signed_a(),
+ _signed_output(),
+ _a_offset(0),
+ _b_offset(0),
+ _run_vector_matrix_multiplication(false),
+ _assembly_path(false),
+ _fused_assembly_path(false),
+ _reshape_b_only_on_first_run(false),
+ _is_prepared(false),
+ _fuse_output_stage(false),
+ _run_activation(false),
+ _flip_signedness(false),
+ _gemm_info(),
+ _aux_mem(Count)
+{
+}
+CpuGemmLowpMatrixMultiplyCore::~CpuGemmLowpMatrixMultiplyCore() = default;
+
+void CpuGemmLowpMatrixMultiplyCore::configure(
+ const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *dst, const GEMMInfo &gemm_info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, dst);
+ ARM_COMPUTE_ERROR_THROW_ON(CpuGemmLowpMatrixMultiplyCore::validate(a, b, c, dst, gemm_info));
+ ARM_COMPUTE_LOG_PARAMS(a, b, c, dst, gemm_info);
+
+ const ITensorInfo *matrix_a = a;
+ const ITensorInfo *matrix_b = b;
+ GEMMInfo info = gemm_info;
+
+ // Set internal variables
+ _a_offset = a->quantization_info().uniform().offset;
+ _b_offset = b->quantization_info().uniform().offset;
+ _run_vector_matrix_multiplication = a->dimension(1) < 2;
+ _reshape_b_only_on_first_run = b->are_values_constant();
+ _is_prepared = false;
+ _fused_assembly_path = false;
+ _flip_signedness = is_data_type_quantized_per_channel(b->data_type()) && (a->data_type() == DataType::QASYMM8) &&
+ _reshape_b_only_on_first_run;
+ _gemm_info = gemm_info;
+
+ // The offset kernel is needed if the offset is non-zero or may change (i.e. it is dynamic).
+ // It is not needed if the data type is symmetric, because then there is no offset.
+ bool a_offset_kernel_needed = _a_offset != 0 || a->quantization_info().is_dynamic();
+ bool b_offset_kernel_needed = _b_offset != 0 || b->quantization_info().is_dynamic();
+
+ _asm_glue = std::make_unique<cpu::CpuGemmAssemblyDispatch>();
+
+ const ITensorInfo *a_to_use = a;
+
+ // Convert to QASYMM8 -> QASYMM8_SIGNED and back
+ if (_flip_signedness)
+ {
+ const int32_t offset_correction = 128;
+ const DataType dt = DataType::QASYMM8_SIGNED;
+ const UniformQuantizationInfo iqinfo = a_to_use->quantization_info().uniform();
+
+ _signed_a = a_to_use->clone()->set_data_type(dt).set_quantization_info(
+ QuantizationInfo(iqinfo.scale, iqinfo.offset + offset_correction));
+ _convert_to_signed_asymm = std::make_unique<kernels::CpuConvertQuantizedSignednessKernel>();
+ _convert_to_signed_asymm->configure(a_to_use, &_signed_a);
+ a_to_use = &_signed_a;
+ _a_offset = _signed_a.quantization_info().uniform().offset;
+
+ const UniformQuantizationInfo oqinfo = dst->quantization_info().uniform();
+ _signed_output = dst->clone()->set_data_type(dt).set_quantization_info(
+ QuantizationInfo(oqinfo.scale, oqinfo.offset - offset_correction));
+
+ // Output stage correction
+ GEMMLowpOutputStageInfo output_stage_corr = info.gemmlowp_output_stage();
+ output_stage_corr.gemmlowp_offset = _signed_output.quantization_info().uniform().offset;
+ output_stage_corr.gemmlowp_min_bound -= offset_correction;
+ output_stage_corr.gemmlowp_max_bound -= offset_correction;
+ info.set_gemmlowp_output_stage(output_stage_corr);
+
+ // Update matrix a
+ matrix_a = &_signed_a;
+ }
+
+ // If GEMMLowpOutputStage != NONE, fuse the offset contribution with the output stage
+ if (info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE)
+ {
+ _fuse_output_stage = true;
+ _mm_result_s32 = TensorInfo(dst->tensor_shape(), 1, DataType::S32);
+ }
+
+ // Initialize assembly kernel meta-data
+ const cpu::AsmGemmInfo asm_info = init_assembly_metadata(gemm_info);
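+ // The optimized assembly GEMM path is only available on AArch64 builds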
+#ifdef __aarch64__
+ if (!(!b->are_values_constant() &&
+ b->tensor_shape().z() > 1)) // Disable batch matmul as the optimized GEMM handles batching differently.
+ {
+ switch (a->data_type())
+ {
+ case DataType::QASYMM8:
+ case DataType::QASYMM8_SIGNED:
+ case DataType::U8:
+ case DataType::S8:
+ {
+ if (is_data_type_quantized_asymmetric(a_to_use->data_type()) &&
+ info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
+ {
+ _asm_glue->configure(a_to_use, b, c, dst, asm_info);
+ _fused_assembly_path = _asm_glue->is_configured();
+ }
+ else
+ {
+ auto output_to_use = (_fuse_output_stage ? &_mm_result_s32 : dst);
+ _asm_glue->configure(a_to_use, b, nullptr, output_to_use, asm_info);
+ }
+ _assembly_path = _asm_glue->is_configured();
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("Datatype not supported");
+ break;
+ }
+ }
+ }
+#endif /* __aarch64__ */
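+ // Fall back to the reference path: interleave A and transpose B for CpuGemmLowpMatrixMultiplyKernel when no
+ // assembly kernel is configured and this is not a vector-matrix product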
+ if (!(_assembly_path || _run_vector_matrix_multiplication))
+ {
+ matrix_a = &_tmp_a;
+ matrix_b = &_tmp_b;
+
+ // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]
+ _tmp_a =
+ TensorInfo(compute_interleaved_shape(*a_to_use), 1, a_to_use->data_type(), a_to_use->quantization_info());
+ // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]
+ _tmp_b = TensorInfo(compute_transpose1xW_shape(*b), 1, b->data_type(), b->quantization_info());
+
+ // Configure interleave kernel
+ _mtx_a_reshape_kernel = std::make_unique<kernels::CpuGemmInterleave4x4Kernel>();
+ _mtx_a_reshape_kernel->configure(a_to_use, &_tmp_a);
+
+ // Configure transpose kernel
+ _mtx_b_reshape_kernel = std::make_unique<kernels::CpuGemmTranspose1xWKernel>();
+ _mtx_b_reshape_kernel->configure(b, &_tmp_b);
+ }
+
+ if (!_fused_assembly_path)
+ {
+ // Build reduction info
+ const GEMMLowpReductionKernelInfo reduction_info(a_to_use->dimension(0), false, 0, false);
+
+ if (a_offset_kernel_needed)
+ {
+ _vector_sum_col = TensorInfo(compute_reductionA_shape(*b), 1, DataType::S32);
+
+ // Configure Matrix B reduction kernel
+ _mtx_b_reduction_kernel = std::make_unique<kernels::CpuGemmLowpMatrixBReductionKernel>();
+ _mtx_b_reduction_kernel->configure(b, &_vector_sum_col, reduction_info);
+ }
+
+ if (b_offset_kernel_needed)
+ {
+ _vector_sum_row = TensorInfo(compute_reductionB_shape(*a_to_use), 1, DataType::S32);
+
+ // Configure matrix A reduction kernel
+ _mtx_a_reduction_kernel = std::make_unique<kernels::CpuGemmLowpMatrixAReductionKernel>();
+ _mtx_a_reduction_kernel->configure(a_to_use, &_vector_sum_row, reduction_info);
+ }
+
+ if (_fuse_output_stage)
+ {
+ // Configure matrix multiply kernel
+ if (!_assembly_path)
+ {
+ _mm_kernel = std::make_unique<kernels::CpuGemmLowpMatrixMultiplyKernel>();
+ _mm_kernel->configure(matrix_a, matrix_b, &_mm_result_s32);
+ }
+
+ _offset_contribution_output_stage_kernel =
+ std::make_unique<kernels::CpuGemmLowpOffsetContributionOutputStageKernel>();
+ _offset_contribution_output_stage_kernel->configure(
+ &_mm_result_s32, a_offset_kernel_needed ? &_vector_sum_col : nullptr,
+ b_offset_kernel_needed ? &_vector_sum_row : nullptr, c, _flip_signedness ? &_signed_output : dst,
+ a->dimension(0), _a_offset, _b_offset, info.gemmlowp_output_stage());
+
+ if (_flip_signedness)
+ {
+ _convert_from_signed_asymm = std::make_unique<kernels::CpuConvertQuantizedSignednessKernel>();
+ _convert_from_signed_asymm->configure(&_signed_output, dst);
+ }
+ }
+ else
+ {
+ // This scale is needed for the s8_f32 kernel where the multiplication output is dequantized to F32.
+ const float dequantize_scale =
+ (dst->data_type() == DataType::F32)
+ ? a->quantization_info().uniform().scale * b->quantization_info().uniform().scale
+ : 1.0f;
+ // Configure matrix multiply kernel
+ if (!_assembly_path)
+ {
+ _mm_kernel = std::make_unique<kernels::CpuGemmLowpMatrixMultiplyKernel>();
+ _mm_kernel->configure(matrix_a, matrix_b, dst);
+ }
+ // Configure offset contribution kernel
+ _offset_contribution_kernel = std::make_unique<kernels::CpuGemmLowpOffsetContributionKernel>();
+ _offset_contribution_kernel->configure(dst, a_offset_kernel_needed ? &_vector_sum_col : nullptr,
+ b_offset_kernel_needed ? &_vector_sum_row : nullptr,
+ a_to_use->dimension(0), _a_offset, _b_offset, dequantize_scale);
+ }
+ }
+ // Configure activation
+ const ActivationLayerInfo &activation = gemm_info.activation_info();
+ _run_activation =
+ activation.enabled() && (!_assembly_path || !cpu::CpuGemmAssemblyDispatch::is_activation_supported(activation));
+ if (_run_activation)
+ {
+ _activation_func = std::make_unique<CpuActivation>();
+ _activation_func->configure(dst, nullptr, activation);
+ }
+
+ if (_assembly_path)
+ {
+ const auto asm_mem_req = _asm_glue->workspace();
+ for (unsigned int slot = 0; slot < asm_mem_req.size(); ++slot)
+ {
+ _aux_mem[slot] = asm_mem_req[slot];
+ }
+ }
+
+ // Request memory for LHS and RHS reshape matrix
+ _aux_mem[VectorSumCol] = MemoryInfo(offset_int_vec(VectorSumCol),
+ !_fused_assembly_path && a_offset_kernel_needed && _reshape_b_only_on_first_run
+ ? MemoryLifetime::Persistent
+ : MemoryLifetime::Temporary,
+ _vector_sum_col.total_size());
+ _aux_mem[VectorSumRow] =
+ MemoryInfo(offset_int_vec(VectorSumRow), MemoryLifetime::Temporary, _vector_sum_row.total_size());
+ _aux_mem[TmpA] = MemoryInfo(offset_int_vec(TmpA), MemoryLifetime::Temporary, _tmp_a.total_size());
+ _aux_mem[TmpB] = MemoryInfo(offset_int_vec(TmpB),
+ _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary,
+ _tmp_b.total_size());
+ _aux_mem[MMResultS32] =
+ MemoryInfo(offset_int_vec(MMResultS32), MemoryLifetime::Temporary, _mm_result_s32.total_size());
+ _aux_mem[SignedA] = MemoryInfo(offset_int_vec(SignedA), MemoryLifetime::Temporary, _signed_a.total_size());
+ _aux_mem[SignedOutput] =
+ MemoryInfo(offset_int_vec(SignedOutput), MemoryLifetime::Temporary, _signed_output.total_size());
+}
+
+Status CpuGemmLowpMatrixMultiplyCore::validate(const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ const ITensorInfo *output,
+ const GEMMInfo &gemm_info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32, DataType::QASYMM8,
+ DataType::QASYMM8_SIGNED, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(c != nullptr && output->data_type() != DataType::F32 &&
+ gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::NONE,
+ "Bias addition not supported in NEGEMMLowpMatrixMultiplyCore for output S32");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ (a)->dimension(0) != (b)->dimension(1),
+ "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");
+
+ // When using accumulation (in-place summation), for now, the only supported output DataType is S32.
+ if (gemm_info.accumulate())
+ {
+#ifdef __arm__
+ ARM_COMPUTE_RETURN_ERROR_MSG("Accumulation is not supported for armv7");
+#endif /* __arm__ */
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE,
+ "Accumulation is not supported for output QASYMM8/QASYMM8_SIGNED");
+ }
+
+ GEMMInfo info = gemm_info;
+ const ITensorInfo *matrix_a_info = a;
+ const ITensorInfo *matrix_b_info = b;
+
+ const ITensorInfo *a_to_use = a;
+
+ TensorInfo tmp_a_info{};
+ TensorInfo tmp_b_info{};
+ TensorInfo mm_result_s32_info{};
+
+ int32_t a_offset = a->quantization_info().uniform().offset;
+ int32_t b_offset = b->quantization_info().uniform().offset;
+
+ // The offset kernel is needed if the offset is non-zero or may change (i.e. it is dynamic).
+ bool a_offset_kernel_needed = a_offset != 0 || a->quantization_info().is_dynamic();
+ bool b_offset_kernel_needed = b_offset != 0 || b->quantization_info().is_dynamic();
+
+ bool fuse_output_stage = info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE;
+ if (fuse_output_stage)
+ {
+ auto_init_if_empty(mm_result_s32_info,
+ a->clone()->set_tensor_shape(output->tensor_shape()).set_data_type(DataType::S32));
+ }
+
+ // Convert QASYMM8->QASYMM8_SIGNED
+ TensorInfo signed_a{};
+ TensorInfo signed_output{};
+ bool flip_signedness = is_data_type_quantized_per_channel(b->data_type()) &&
+ (a->data_type() == DataType::QASYMM8) && info.reshape_b_only_on_first_run();
+ if (flip_signedness)
+ {
+ const int32_t offset_correction = 128;
+ const DataType dt = DataType::QASYMM8_SIGNED;
+ const UniformQuantizationInfo iqinfo = a_to_use->quantization_info().uniform();
+
+ signed_a = a_to_use->clone()->set_data_type(dt).set_quantization_info(
+ QuantizationInfo(iqinfo.scale, iqinfo.offset + offset_correction));
+ ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuConvertQuantizedSignednessKernel::validate(a_to_use, &signed_a));
+ a_to_use = &signed_a;
+ a_offset = signed_a.quantization_info().uniform().offset;
+
+ const UniformQuantizationInfo oqinfo = output->quantization_info().uniform();
+ signed_output = output->clone()->set_data_type(dt).set_quantization_info(
+ QuantizationInfo(oqinfo.scale, oqinfo.offset - offset_correction));
+
+ // Output stage correction
+ GEMMLowpOutputStageInfo output_stage_corr = info.gemmlowp_output_stage();
+ output_stage_corr.gemmlowp_offset = signed_output.quantization_info().uniform().offset;
+ output_stage_corr.gemmlowp_min_bound -= offset_correction;
+ output_stage_corr.gemmlowp_max_bound -= offset_correction;
+ info.set_gemmlowp_output_stage(output_stage_corr);
+
+ // Update matrix a
+ matrix_a_info = &signed_a;
+ }
+
+ // Initialize assembly kernel meta-data
+ const AsmGemmInfo asm_info = init_assembly_metadata(info);
+
+ // Check if we need to run the optimized assembly kernel
+ bool run_optimised = false;
+ bool run_optimised_requantized = false;
+
+ if (!(!b->are_values_constant() &&
+ b->tensor_shape().z() > 1)) // Disable batch matmul as the optimized GEMM handles batching differently.
+ {
+ if (is_data_type_quantized_asymmetric(a_to_use->data_type()) &&
+ info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
+ {
+ run_optimised = bool(CpuGemmAssemblyDispatch::validate(a_to_use, b, c, output, asm_info));
+ run_optimised_requantized = run_optimised;
+ }
+ else
+ {
+ run_optimised = bool(CpuGemmAssemblyDispatch::validate(
+ a_to_use, b, nullptr, fuse_output_stage ? &mm_result_s32_info : output, asm_info));
+ }
+ }
+
+ if (run_optimised)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(b->dimension(0) != output->dimension(0));
+ if (info.depth_output_gemm3d() != 0)
+ {
+ if (info.reinterpret_input_as_3d())
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1));
+ ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(2) != output->dimension(2));
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1) * output->dimension(2));
+ }
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1));
+ }
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.reinterpret_input_as_3d(),
+ "NEGEMM cannot reinterpret the input tensor as 3D");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.depth_output_gemm3d() != 0,
+ "NEGEMM cannot reinterpret the output tensor as 3D");
+
+ const bool run_vector_matrix_multiplication = a->dimension(1) < 2;
+ if (!run_vector_matrix_multiplication)
+ {
+ matrix_a_info = &tmp_a_info;
+ matrix_b_info = &tmp_b_info;
+
+ // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]
+ TensorShape shape_tmp_a = a->tensor_shape();
+ shape_tmp_a.set(0, a->dimension(0) * 4);
+ shape_tmp_a.set(1, std::ceil(a->dimension(1) / 4.f));
+
+ // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]
+ TensorShape shape_tmp_b = b->tensor_shape();
+ shape_tmp_b.set(0, b->dimension(1) * 16);
+ shape_tmp_b.set(1, std::ceil(b->dimension(0) / 16.f));
+
+ // Validate interleave kernel
+ auto_init_if_empty(tmp_a_info, a_to_use->clone()->set_tensor_shape(shape_tmp_a));
+ auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(shape_tmp_b));
+
+ ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmInterleave4x4Kernel::validate(a_to_use, &tmp_a_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmTranspose1xWKernel::validate(b, &tmp_b_info));
+ }
+ }
+
+ if (!run_optimised_requantized)
+ {
+ TensorInfo info_vector_sum_col{};
+ TensorInfo info_vector_sum_row{};
+
+ const GEMMLowpReductionKernelInfo reduction_info(a_to_use->dimension(0), false, 0, false);
+
+ // Validate the matrix B reduction kernel only if the a_offset contribution is needed (non-zero or dynamic offset)
+ if (a_offset_kernel_needed)
+ {
+ info_vector_sum_col = TensorInfo(compute_reductionA_shape(*b), 1, DataType::S32);
+
+ // Validate the matrix B reduction kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ kernels::CpuGemmLowpMatrixBReductionKernel::validate(b, &info_vector_sum_col, reduction_info));
+ }
+
+ // Validate the matrix A reduction kernel only if the b_offset contribution is needed (non-zero or dynamic offset)
+ if (b_offset_kernel_needed)
+ {
+ info_vector_sum_row = TensorInfo(compute_reductionB_shape(*a), 1, DataType::S32);
+
+ // Validate the matrix A reduction kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ kernels::CpuGemmLowpMatrixAReductionKernel::validate(a_to_use, &info_vector_sum_row, reduction_info));
+ }
+
+ if (fuse_output_stage)
+ {
+ if (!run_optimised)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ info.reinterpret_input_as_3d(),
+ "CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the input tensor as 3D");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ info.depth_output_gemm3d() != 0,
+ "CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the output tensor as 3D");
+
+ ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpMatrixMultiplyKernel::validate(
+ matrix_a_info, matrix_b_info, &mm_result_s32_info));
+ }
+
+ // Validate offset contribution kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpOffsetContributionOutputStageKernel::validate(
+ &mm_result_s32_info, a_offset_kernel_needed ? &info_vector_sum_col : nullptr,
+ b_offset_kernel_needed ? &info_vector_sum_row : nullptr, c, flip_signedness ? &signed_output : output,
+ a_offset, b_offset, info.gemmlowp_output_stage()));
+ }
+ else
+ {
+ if (!run_optimised)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ info.reinterpret_input_as_3d(),
+ "CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the input tensor as 3D");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ info.depth_output_gemm3d() != 0,
+ "CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the output tensor as 3D");
+
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ kernels::CpuGemmLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, output));
+ }
+ // Validate offset contribution kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpOffsetContributionKernel::validate(
+ output, a_offset_kernel_needed ? &info_vector_sum_col : nullptr,
+ b_offset_kernel_needed ? &info_vector_sum_row : nullptr, a_offset, b_offset));
+ }
+ }
+
+ // Validate activation
+ const ActivationLayerInfo &activation = gemm_info.activation_info();
+ if (activation.enabled())
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(CpuActivation::validate(output, nullptr, activation));
+ }
+
+ return Status{};
+}
+
+void CpuGemmLowpMatrixMultiplyCore::run(ITensorPack &tensors)
+{
+ prepare(tensors);
+
+ auto a = tensors.get_const_tensor(TensorType::ACL_SRC_0);
+ auto b = tensors.get_const_tensor(TensorType::ACL_SRC_1);
+ auto c = tensors.get_const_tensor(TensorType::ACL_SRC_2);
+ auto dst = tensors.get_tensor(TensorType::ACL_DST);
+ auto a_to_use = a;
+ auto matrix_a = a;
+ auto matrix_b = b;
+
+ CpuAuxTensorHandler vector_sum_col(offset_int_vec(VectorSumCol), _vector_sum_col, tensors, false);
+ CpuAuxTensorHandler vector_sum_row(offset_int_vec(VectorSumRow), _vector_sum_row, tensors, false);
+ CpuAuxTensorHandler tmp_a(offset_int_vec(TmpA), _tmp_a, tensors, false);
+ CpuAuxTensorHandler tmp_b(offset_int_vec(TmpB), _tmp_b, tensors, true);
+ CpuAuxTensorHandler mm_result_s32(offset_int_vec(MMResultS32), _mm_result_s32, tensors, false);
+ CpuAuxTensorHandler signed_a(offset_int_vec(SignedA), _signed_a, tensors, false);
+ CpuAuxTensorHandler signed_output(offset_int_vec(SignedOutput), _signed_output, tensors, false);
+
+ const QuantizationInfo a_qinfo = a->info()->quantization_info();
+ const QuantizationInfo b_qinfo = b->info()->quantization_info();
+
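+ // Inputs with dynamic quantization info may change their offsets between runs, so refresh them here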
+ if (a_qinfo.is_dynamic())
+ _a_offset = a_qinfo.uniform().offset;
+ if (b_qinfo.is_dynamic())
+ _b_offset = b_qinfo.uniform().offset;
+
+ // Convert QASYMM8->QASYMM8_SIGNED
+ if (_flip_signedness)
+ {
+ ITensorPack pack = {{TensorType::ACL_SRC, a}, {TensorType::ACL_DST, signed_a.get()}};
+ NEScheduler::get().schedule_op(_convert_to_signed_asymm.get(), Window::DimY, _convert_to_signed_asymm->window(),
+ pack);
+ a_to_use = signed_a.get();
+ matrix_a = signed_a.get();
+ }
+
+ // Run GEMM
+ if (_asm_glue->is_configured())
+ {
+ ITensorPack asm_glue_tensors = tensors;
+ auto output_to_use = (_fuse_output_stage ? mm_result_s32.get() : dst);
+ if (is_data_type_quantized_asymmetric(a_to_use->info()->data_type()) &&
+ _gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
+ {
+ asm_glue_tensors.add_const_tensor(TensorType::ACL_SRC_0, a_to_use);
+ asm_glue_tensors.add_const_tensor(TensorType::ACL_SRC_1, b);
+ asm_glue_tensors.add_const_tensor(TensorType::ACL_SRC_2, c);
+ asm_glue_tensors.add_tensor(TensorType::ACL_DST, dst);
+ }
+ else
+ {
+ asm_glue_tensors.add_const_tensor(TensorType::ACL_SRC_0, a_to_use);
+ asm_glue_tensors.add_const_tensor(TensorType::ACL_SRC_1, b);
+ asm_glue_tensors.add_tensor(TensorType::ACL_DST, output_to_use);
+ }
+ _asm_glue->run(asm_glue_tensors);
+ }
+ else
+ {
+ if (!_run_vector_matrix_multiplication)
+ {
+ matrix_a = tmp_a.get();
+ matrix_b = tmp_b.get();
+ // Run interleave kernel
+ ITensorPack pack_a = {{TensorType::ACL_SRC, a_to_use}, {TensorType::ACL_DST, tmp_a.get()}};
+ NEScheduler::get().schedule_op(_mtx_a_reshape_kernel.get(), Window::DimY, _mtx_a_reshape_kernel->window(),
+ pack_a);
+
+ if (!_reshape_b_only_on_first_run)
+ {
+ ITensorPack pack_b = {{TensorType::ACL_SRC, b}, {TensorType::ACL_DST, tmp_b.get()}};
+ // Run transpose kernel
+ NEScheduler::get().schedule_op(_mtx_b_reshape_kernel.get(), Window::DimY,
+ _mtx_b_reshape_kernel->window(), pack_b);
+ }
+ }
+ ITensorPack pack_mm = {{TensorType::ACL_SRC_0, matrix_a}, {TensorType::ACL_SRC_1, matrix_b}};
+ if (_fuse_output_stage)
+ {
+ pack_mm.add_tensor(TensorType::ACL_DST, mm_result_s32.get());
+ }
+ else
+ {
+ pack_mm.add_tensor(TensorType::ACL_DST, dst);
+ }
+ NEScheduler::get().schedule_op(_mm_kernel.get(), Window::DimY, _mm_kernel->window(), pack_mm);
+ }
+
+ if (!_fused_assembly_path)
+ {
+ // Run matrix A reduction kernel only if _b_offset is not equal to 0
+ if (_b_offset != 0)
+ {
+ ITensorPack pack = {{TensorType::ACL_SRC, a_to_use}, {TensorType::ACL_DST, vector_sum_row.get()}};
+ NEScheduler::get().schedule_op(_mtx_a_reduction_kernel.get(), Window::DimX,
+ _mtx_a_reduction_kernel->window(), pack);
+ }
+
+ // Run matrix B reduction kernel only if _a_offset is not equal to 0
+ if (_a_offset != 0 && !_reshape_b_only_on_first_run)
+ {
+ ITensorPack pack = {{TensorType::ACL_SRC, b}, {TensorType::ACL_DST, vector_sum_col.get()}};
+ NEScheduler::get().schedule_op(_mtx_b_reduction_kernel.get(), Window::DimX,
+ _mtx_b_reduction_kernel->window(), pack);
+ }
+
+ if (_fuse_output_stage)
+ {
+ if (a_qinfo.is_dynamic())
+ _offset_contribution_output_stage_kernel->set_a_offset(_a_offset);
+ if (b_qinfo.is_dynamic())
+ _offset_contribution_output_stage_kernel->set_b_offset(_b_offset);
+
+ ITensorPack pack;
+ pack.add_tensor(TensorType::ACL_SRC_0, mm_result_s32.get());
+ pack.add_tensor(TensorType::ACL_SRC_1, _a_offset == 0 ? nullptr : vector_sum_col.get());
+ pack.add_tensor(TensorType::ACL_SRC_2, _b_offset == 0 ? nullptr : vector_sum_row.get());
+ pack.add_tensor(TensorType::ACL_SRC_3, c);
+ pack.add_tensor(TensorType::ACL_DST, _flip_signedness ? signed_output.get() : dst);
+
+ // Run offset contribution kernel
+ NEScheduler::get().schedule_op(_offset_contribution_output_stage_kernel.get(), Window::DimY,
+ _offset_contribution_output_stage_kernel->window(), pack);
+ }
+ else
+ {
+ if (a_qinfo.is_dynamic())
+ _offset_contribution_kernel->set_a_offset(_a_offset);
+ if (b_qinfo.is_dynamic())
+ _offset_contribution_kernel->set_b_offset(_b_offset);
+ if (a_qinfo.is_dynamic() || b_qinfo.is_dynamic())
+ {
+ const float dequantize_scale = a_qinfo.uniform().scale * b_qinfo.uniform().scale;
+ _offset_contribution_kernel->set_scale(dequantize_scale);
+ }
+
+ ITensorPack pack;
+ pack.add_tensor(TensorType::ACL_SRC_0, _a_offset == 0 ? nullptr : vector_sum_col.get());
+ pack.add_tensor(TensorType::ACL_SRC_1, _b_offset == 0 ? nullptr : vector_sum_row.get());
+ pack.add_tensor(TensorType::ACL_DST, dst);
+
+ // Run offset contribution kernel
+ NEScheduler::get().schedule_op(_offset_contribution_kernel.get(), Window::DimY,
+ _offset_contribution_kernel->window(), pack);
+ }
+ }
+
+ // Convert QASYMM8_SIGNED->QASYMM8
+ if (!_fused_assembly_path && _fuse_output_stage && _flip_signedness)
+ {
+ ITensorPack pack = {{TensorType::ACL_SRC, signed_output.get()}, {TensorType::ACL_DST, dst}};
+ NEScheduler::get().schedule_op(_convert_from_signed_asymm.get(), Window::DimY,
+ _convert_from_signed_asymm->window(), pack);
+ }
+
+ // Run fused activation unless already run in the fused assembly
+ if (_run_activation)
+ {
+ ITensorPack pack = {{TensorType::ACL_SRC, dst}, {TensorType::ACL_DST, dst}};
+ _activation_func->run(pack);
+ }
+}
+
+void CpuGemmLowpMatrixMultiplyCore::prepare(ITensorPack &tensors)
+{
+ if (!_is_prepared)
+ {
+ auto original_b = tensors.get_const_tensor(TensorType::ACL_SRC_1);
+ // Run assembly reshape
+ if (_asm_glue->is_configured())
+ {
+ _asm_glue->prepare(tensors);
+ }
+ // Run non-assembly reshape
+ else if (_reshape_b_only_on_first_run && !_run_vector_matrix_multiplication && !_asm_glue->is_configured())
+ {
+ // Run reshape kernel and mark original weights tensor as unused
+ ITensor *tmp_b_p = utils::cast::polymorphic_downcast<ITensor *>(tensors.get_tensor(offset_int_vec(TmpB)));
+ CpuAuxTensorHandler tmp_b(_tmp_b, *tmp_b_p);
+ ITensorPack pack = {{TensorType::ACL_SRC, original_b}, {TensorType::ACL_DST, tmp_b.get()}};
+ NEScheduler::get().schedule_op(_mtx_b_reshape_kernel.get(), Window::DimY, _mtx_b_reshape_kernel->window(),
+ pack);
+ }
+
+ // Run matrix B reduction kernel only if _a_offset is not equal to 0
+ if (!_fused_assembly_path && _a_offset != 0 && _reshape_b_only_on_first_run)
+ {
+ ITensor *vector_sum_col_p =
+ utils::cast::polymorphic_downcast<ITensor *>(tensors.get_tensor(offset_int_vec(VectorSumCol)));
+ CpuAuxTensorHandler vector_sum_col(_vector_sum_col, *vector_sum_col_p);
+ ITensorPack pack = {{TensorType::ACL_SRC, original_b}, {TensorType::ACL_DST, vector_sum_col.get()}};
+ NEScheduler::get().schedule_op(_mtx_b_reduction_kernel.get(), Window::DimX,
+ _mtx_b_reduction_kernel->window(), pack);
+ }
+ _is_prepared = true;
+ }
+}
+experimental::MemoryRequirements CpuGemmLowpMatrixMultiplyCore::workspace() const
+{
+ return _aux_mem;
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h b/src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h
new file mode 100644
index 0000000000..38121c9bb4
--- /dev/null
+++ b/src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h
@@ -0,0 +1,184 @@
+/*
+ * Copyright (c) 2021, 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_CPU_OPERATORS_CPUGEMMLOWPMATRIXMULTIPLYCORE_H
+#define ACL_SRC_CPU_OPERATORS_CPUGEMMLOWPMATRIXMULTIPLYCORE_H
+
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/function_info/GEMMInfo.h"
+
+#include "src/core/common/Macros.h"
+#include "src/cpu/ICpuOperator.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+class CpuGemmInterleave4x4Kernel;
+class CpuGemmLowpMatrixMultiplyKernel;
+class CpuGemmLowpOffsetContributionKernel;
+class CpuGemmLowpOffsetContributionOutputStageKernel;
+class CpuGemmLowpMatrixAReductionKernel;
+class CpuGemmLowpMatrixBReductionKernel;
+class CpuGemmTranspose1xWKernel;
+class CpuConvertQuantizedSignednessKernel;
+} // namespace kernels
+class CpuGemmAssemblyDispatch;
+class CpuActivation;
+
+/** Basic function to execute GEMMLowpMatrixMultiplyCore. This function calls the following kernels if the DOT product instruction is not available:
+ *
+ * -# @ref kernels::CpuGemmInterleave4x4Kernel
+ * -# @ref kernels::CpuGemmTranspose1xWKernel
+ * -# @ref kernels::CpuGemmLowpMatrixMultiplyKernel
+ * -# @ref kernels::CpuGemmLowpOffsetContributionKernel
+ * -# @ref CpuActivation
+ *
+ * otherwise if the DOT product instruction is available:
+ *
+ * -# @ref kernels::CpuGemmLowpOffsetContributionKernel
+ *
+*/
+class CpuGemmLowpMatrixMultiplyCore : public ICpuOperator
+{
+public:
+ /** Constructor */
+ CpuGemmLowpMatrixMultiplyCore();
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuGemmLowpMatrixMultiplyCore);
+ /** Destructor */
+ ~CpuGemmLowpMatrixMultiplyCore();
+ /** Initialise the kernel's inputs and output.
+ *
+ * Valid data layouts:
+ * - NHWC
+ * - NCHW
+ *
+ * Valid data type configurations:
+ * |src0 |src1 |src2 |dst |
+ * |:--------------|:------------------|:--------|:--------------|
+ * |QASYMM8 |QASYMM8 |S32 |QASYMM8 |
+ * |QASYMM8 |QSYMM8_PER_CHANNEL |S32 |QASYMM8 |
+ * |QASYMM8 |QSYMM8 |S32 |QASYMM8 |
+ * |QASYMM8 |QASYMM8 |S32 |S32 |
+ * |QASYMM8 |QSYMM8_PER_CHANNEL |S32 |S32 |
+ * |QASYMM8 |QSYMM8 |S32 |S32 |
+ * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |QASYMM8_SIGNED |
+ * |QASYMM8_SIGNED |QSYMM8_PER_CHANNEL |S32 |QASYMM8_SIGNED |
+ * |QASYMM8_SIGNED |QSYMM8 |S32 |QASYMM8_SIGNED |
+ * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |S32 |
+ * |QASYMM8_SIGNED |QSYMM8_PER_CHANNEL |S32 |S32 |
+ * |QASYMM8_SIGNED |QSYMM8 |S32 |S32 |
+ * |QASYMM8_SIGNED |QASYMM8_SIGNED |F32 |F32 |
+ *
+ * @note GEMM_LOWP: low precision GEMM kernel
+ * This kernel performs the following computations:
+ *
+ * -# Convert a values from QASYMM8 to int32 and add a_offset to each of them.
+ * -# Convert b values from QASYMM8 to int32 and add b_offset to each of them.
+ * -# Compute the matrix product of the resulting a * b in int32.
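+ *
+ * As an illustrative sketch of the accumulation above (not additional API behaviour), each output element is
+ * conceptually dst[m][n] = sum_k (a[m][k] + a_offset) * (b[k][n] + b_offset), computed in int32.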
+ *
+ * @note The @p output type is S32 if @p gemm_info.type == GEMMLowpOutputStageType::NONE. It is QASYMM8/QASYMM8_SIGNED/F32 otherwise
+ *
+ * @param[in] a First input tensor info (Matrix A). Data type supported: QASYMM8/QASYMM8_SIGNED.
+ * @param[in] b Second input tensor info (Matrix B). Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL.
+ * @param[in] c Third input tensor info (Matrix C). It can be a nullptr. Data type supported: S32/F32
+ * @param[out] dst Output tensor info. Data type supported: S32/QASYMM8/QASYMM8_SIGNED/F32
+ * @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and
+ * if the reshape of matrix B should be executed only for the first run
+ */
+ void configure(const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ ITensorInfo *dst,
+ const GEMMInfo &gemm_info = GEMMInfo());
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to CpuGemmLowpMatrixMultiplyCore::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ const ITensorInfo *dst,
+ const GEMMInfo &gemm_info = GEMMInfo());
+
+ // Inherited methods overridden:
+ void run(ITensorPack &tensors) override;
+ void prepare(ITensorPack &tensors) override;
+ experimental::MemoryRequirements workspace() const override;
+
+private:
+ enum AuxTensorIdx
+ {
+ /* Slots 0 - 2 reserved for CpuGemmAssemblyDispatch */
+ VectorSumCol = 3,
+ VectorSumRow,
+ TmpA,
+ TmpB,
+ MMResultS32,
+ SignedA,
+ SignedOutput,
+ Count
+ };
+
+ std::unique_ptr<CpuGemmAssemblyDispatch> _asm_glue;
+ std::unique_ptr<kernels::CpuGemmLowpMatrixMultiplyKernel> _mm_kernel;
+ std::unique_ptr<kernels::CpuGemmInterleave4x4Kernel> _mtx_a_reshape_kernel;
+ std::unique_ptr<kernels::CpuGemmTranspose1xWKernel> _mtx_b_reshape_kernel;
+ std::unique_ptr<kernels::CpuGemmLowpMatrixAReductionKernel> _mtx_a_reduction_kernel;
+ std::unique_ptr<kernels::CpuGemmLowpMatrixBReductionKernel> _mtx_b_reduction_kernel;
+ std::unique_ptr<kernels::CpuGemmLowpOffsetContributionKernel> _offset_contribution_kernel;
+ std::unique_ptr<kernels::CpuGemmLowpOffsetContributionOutputStageKernel> _offset_contribution_output_stage_kernel;
+ std::unique_ptr<CpuActivation> _activation_func;
+ std::unique_ptr<kernels::CpuConvertQuantizedSignednessKernel> _convert_to_signed_asymm;
+ std::unique_ptr<kernels::CpuConvertQuantizedSignednessKernel> _convert_from_signed_asymm;
+
+ TensorInfo _vector_sum_col;
+ TensorInfo _vector_sum_row;
+ TensorInfo _tmp_a;
+ TensorInfo _tmp_b;
+ TensorInfo _mm_result_s32;
+ TensorInfo _signed_a;
+ TensorInfo _signed_output;
+ int32_t _a_offset;
+ int32_t _b_offset;
+
+ bool _run_vector_matrix_multiplication;
+ bool _assembly_path;
+ bool _fused_assembly_path;
+ bool _reshape_b_only_on_first_run;
+ bool _is_prepared;
+ bool _fuse_output_stage;
+ bool _run_activation;
+ bool _flip_signedness;
+ GEMMInfo _gemm_info;
+ experimental::MemoryRequirements _aux_mem{};
+};
+} // namespace cpu
+} // namespace arm_compute
+#endif // ACL_SRC_CPU_OPERATORS_CPUGEMMLOWPMATRIXMULTIPLYCORE_H
diff --git a/src/cpu/operators/CpuGemmLowpOutputStage.cpp b/src/cpu/operators/CpuGemmLowpOutputStage.cpp
new file mode 100644
index 0000000000..4215eed199
--- /dev/null
+++ b/src/cpu/operators/CpuGemmLowpOutputStage.cpp
@@ -0,0 +1,165 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/operators/CpuGemmLowpOutputStage.h"
+
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+#include "src/common/utils/Log.h"
+#include "src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.h"
+#include "src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h"
+#include "src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h"
+#include "src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void CpuGemmLowpOutputStage::configure(ITensorInfo *src,
+ ITensorInfo *bias,
+ ITensorInfo *dst,
+ const GEMMLowpOutputStageInfo &info)
+{
+ // Perform validate step
+ ARM_COMPUTE_ERROR_THROW_ON(CpuGemmLowpOutputStage::validate(src, bias, dst, info));
+ ARM_COMPUTE_LOG_PARAMS(src, bias, dst, info);
+
+ switch (info.type)
+ {
+ case GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT:
+ {
+ switch (info.output_data_type)
+ {
+ case DataType::QASYMM8:
+ {
+ auto k = std::make_unique<kernels::CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel>();
+ k->configure(src, bias, dst, info.gemmlowp_multiplier, info.gemmlowp_shift, info.gemmlowp_offset,
+ info.gemmlowp_min_bound, info.gemmlowp_max_bound);
+ _kernel = std::move(k);
+ break;
+ }
+ case DataType::QASYMM8_SIGNED:
+ {
+ auto k = std::make_unique<kernels::CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel>();
+ k->configure(src, bias, dst, info.gemmlowp_multiplier, info.gemmlowp_shift, info.gemmlowp_offset,
+ info.gemmlowp_min_bound, info.gemmlowp_max_bound);
+ _kernel = std::move(k);
+ break;
+ }
+ case DataType::QSYMM16:
+ {
+ auto k = std::make_unique<kernels::CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel>();
+ k->configure(src, bias, dst, info.gemmlowp_multiplier, info.gemmlowp_shift, info.gemmlowp_min_bound,
+ info.gemmlowp_max_bound);
+ _kernel = std::move(k);
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("Unsupported output data type.");
+ break;
+ }
+ }
+ break;
+ }
+ case GEMMLowpOutputStageType::QUANTIZE_DOWN:
+ {
+ switch (info.output_data_type)
+ {
+ case DataType::QASYMM8:
+ case DataType::QASYMM8_SIGNED:
+ {
+ auto k = std::make_unique<kernels::CpuGemmLowpQuantizeDownInt32ScaleKernel>();
+ k->configure(src, bias, dst, &info);
+ _kernel = std::move(k);
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("Unsupported output data type.");
+ break;
+ }
+ }
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Unsupported GEMMLowpOutputStage type.");
+ }
+}
+
+Status CpuGemmLowpOutputStage::validate(const ITensorInfo *src,
+ const ITensorInfo *bias,
+ const ITensorInfo *dst,
+ const GEMMLowpOutputStageInfo &info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() == DataType::UNKNOWN,
+ "CpuGemmLowpOutputStage cannot be used with UNKNOWN output data type.");
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::QSYMM16);
+ ARM_COMPUTE_RETURN_ERROR_ON((info.type != GEMMLowpOutputStageType::QUANTIZE_DOWN) &&
+ (info.type != GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT));
+
+ switch (info.type)
+ {
+ case GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT:
+ {
+ switch (dst->data_type())
+ {
+ case DataType::QASYMM8:
+ return kernels::CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::validate(
+ src, bias, dst, info.gemmlowp_min_bound, info.gemmlowp_max_bound);
+ case DataType::QASYMM8_SIGNED:
+ return kernels::CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::validate(
+ src, bias, dst, info.gemmlowp_min_bound, info.gemmlowp_max_bound);
+ case DataType::QSYMM16:
+ return kernels::CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::validate(
+ src, bias, dst, info.gemmlowp_min_bound, info.gemmlowp_max_bound);
+ default:
+ return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Unsupported output data type.");
+ }
+ }
+ case GEMMLowpOutputStageType::QUANTIZE_DOWN:
+ {
+ switch (dst->data_type())
+ {
+ case DataType::QASYMM8:
+ case DataType::QASYMM8_SIGNED:
+ return kernels::CpuGemmLowpQuantizeDownInt32ScaleKernel::validate(src, bias, dst, &info);
+ default:
+ return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Unsupported output data type.");
+ }
+ }
+ default:
+ return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Unsupported GEMMLowpOutputStage type.");
+ }
+}
+
+void CpuGemmLowpOutputStage::run(ITensorPack &tensors)
+{
+ NEScheduler::get().schedule_op(_kernel.get(), Window::DimY, _kernel->window(), tensors);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/operators/CpuGemmLowpOutputStage.h b/src/cpu/operators/CpuGemmLowpOutputStage.h
new file mode 100644
index 0000000000..e5e2f41fa9
--- /dev/null
+++ b/src/cpu/operators/CpuGemmLowpOutputStage.h
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_GEMMLOWP_OUTPUT_STAGE_H
+#define ARM_COMPUTE_CPU_GEMMLOWP_OUTPUT_STAGE_H
+
+#include "arm_compute/core/Types.h"
+
+#include "src/cpu/ICpuOperator.h"
+
+/** This file contains all available output stages for GEMMLowp.
+ *
+ * In gemmlowp, the "output stage" is the process that takes a final int32 accumulator value (the output of @ref NEGEMMLowpMatrixMultiplyCore),
+ * and processes it to obtain the final ASYMM8 value.
+ *
+ * More information about the GEMMLowp output stage can be found at https://github.com/google/gemmlowp/blob/master/doc/output.md
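+ *
+ * As a rough per-element sketch (ignoring the exact rounding and saturation behaviour, which is handled
+ * inside the kernels), the QUANTIZE_DOWN_FIXEDPOINT stage computes:
+ *
+ *   dst = clamp(gemmlowp_offset + (acc + bias) * gemmlowp_multiplier * 2^(-gemmlowp_shift),
+ *               gemmlowp_min_bound, gemmlowp_max_bound)
+ *
+ * where acc is the int32 accumulator and the gemmlowp_* parameters come from @ref GEMMLowpOutputStageInfo.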
+ */
+
+namespace arm_compute
+{
+namespace cpu
+{
+/** Basic function to execute GEMMLowpQuantizeDown kernels.
+ *
+ * This function calls the following kernels:
+ *
+ * -# @ref kernels::CpuGemmLowpQuantizeDownInt32ScaleKernel
+ * -# @ref kernels::CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel
+ * -# @ref kernels::CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel
+ * -# @ref kernels::CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel
+*/
+class CpuGemmLowpOutputStage : public ICpuOperator
+{
+public:
+ /** Initialise the kernel's inputs, output
+ *
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src0 |src1 |dst |
+ * |:--------------|:-------------|:-------------|
+ * |S32 |S32 |QASYMM8 |
+ * |S32 |S32 |QASYMM8_SIGNED|
+ * |S32 |S32 |QSYMM16 |
+ *
+ * @param[in] src Input tensor info. Data type supported: S32
+ * @param[in] bias Biases tensor info. Only shared biases are supported; it can be nullptr if the bias addition is not required.
+ * Biases are a 1D tensor with dimensions [OFM]. Data type supported: same as @p src.
+ * @param[out] dst Output tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM16
+ * @param[in] info GEMMLowp output stage metadata.
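+ *
+ * A minimal configuration sketch (all values are illustrative; in practice the multiplier/shift are obtained
+ * from quantization::calculate_quantized_multiplier() and the shapes come from the GEMM output):
+ * @code
+ * TensorInfo src(TensorShape(32U, 16U), 1, DataType::S32);
+ * TensorInfo bias(TensorShape(32U), 1, DataType::S32);
+ * TensorInfo dst(TensorShape(32U, 16U), 1, DataType::QASYMM8);
+ *
+ * GEMMLowpOutputStageInfo info{};
+ * info.type                = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
+ * info.gemmlowp_multiplier = 1073741824; // example Q0.31 fixed-point multiplier
+ * info.gemmlowp_shift      = 1;
+ * info.gemmlowp_offset     = 10;
+ * info.gemmlowp_min_bound  = 0;
+ * info.gemmlowp_max_bound  = 255;
+ *
+ * CpuGemmLowpOutputStage output_stage;
+ * output_stage.configure(&src, &bias, &dst, info);
+ * // At run time, bind the corresponding ITensor objects through an ITensorPack and call run().
+ * @endcode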
+ */
+ void configure(ITensorInfo *src, ITensorInfo *bias, ITensorInfo *dst, const GEMMLowpOutputStageInfo &info);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to CpuGemmLowpOutputStage::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *bias,
+ const ITensorInfo *dst,
+ const GEMMLowpOutputStageInfo &info);
+
+ // Inherited methods overridden:
+ void run(ITensorPack &tensors) override;
+};
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_GEMMLOWP_OUTPUT_STAGE_H */
diff --git a/src/cpu/operators/CpuMatMul.cpp b/src/cpu/operators/CpuMatMul.cpp
new file mode 100644
index 0000000000..f68ae9883f
--- /dev/null
+++ b/src/cpu/operators/CpuMatMul.cpp
@@ -0,0 +1,331 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/cpu/operators/CpuMatMul.h"
+
+#include "arm_compute/core/experimental/Types.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/function_info/MatMulInfo.h"
+#include "arm_compute/runtime/NEON/functions/NEMatMul.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+#include "src/common/utils/Log.h"
+#include "src/core/CPP/Validate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/MemoryHelpers.h"
+#include "src/core/utils/quantization/AsymmHelpers.h"
+#include "src/cpu/utils/CpuAuxTensorHandler.h"
+
+using namespace arm_compute::experimental;
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace
+{
+Status get_gemmlowp_output_stage_info(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *dst,
+ const ActivationLayerInfo &act,
+ GEMMLowpOutputStageInfo &gemmlowp_output_stage_info)
+{
+ const auto data_type = src->data_type();
+ const QuantizationInfo oq_info = dst->quantization_info();
+ const UniformQuantizationInfo iq_unif = src->quantization_info().uniform();
+ const UniformQuantizationInfo wq_unif = weights->quantization_info().uniform();
+ const UniformQuantizationInfo oq_unif = oq_info.uniform();
+
+ float multiplier = (iq_unif.scale * wq_unif.scale) / oq_unif.scale;
+ int32_t output_multiplier;
+ int32_t output_shift;
+
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift));
+
+ int32_t type_min = 0;
+ int32_t type_max = 0;
+ std::tie(type_min, type_max) = quantization::get_quantized_asymmetric_output_min_max(oq_info, act, data_type);
+
+ gemmlowp_output_stage_info.gemmlowp_multiplier = output_multiplier;
+ gemmlowp_output_stage_info.gemmlowp_shift = output_shift;
+ gemmlowp_output_stage_info.gemmlowp_offset = oq_unif.offset;
+ gemmlowp_output_stage_info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
+ gemmlowp_output_stage_info.gemmlowp_min_bound = type_min;
+ gemmlowp_output_stage_info.gemmlowp_max_bound = type_max;
+
+ return Status{};
+}
+} // namespace
+
+CpuMatMul::CpuMatMul()
+ : _transpose_kernel_lhs(),
+ _transpose_kernel_rhs(),
+ _asm_glue(),
+ _lhs_transposed(),
+ _rhs_transposed(),
+ _original_lhs_shape(),
+ _original_rhs_shape(),
+ _original_dst_shape()
+{
+}
+
+Status CpuMatMul::validate(const ITensorInfo *lhs,
+ const ITensorInfo *rhs,
+ const ITensorInfo *dst,
+ const MatMulInfo &info,
+ const CpuMatMulSettings &settings,
+ const ActivationLayerInfo &act_info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, rhs, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F32, DataType::F16, DataType::BFLOAT16,
+ DataType::QASYMM8, DataType::QASYMM8_SIGNED);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs->are_values_constant(), "LHS Tensor must be dynamic.");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs->are_values_constant(), "RHS Tensor must be dynamic.");
+ ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(lhs);
+ ARM_COMPUTE_RETURN_ERROR_ON_CPU_BF16_UNSUPPORTED(lhs);
+
+ const auto adj_lhs = info.adj_lhs();
+ const auto adj_rhs = info.adj_rhs();
+
+ const ITensorInfo *lhs_to_use = lhs;
+ const ITensorInfo *rhs_to_use = rhs;
+ TensorInfo lhs_transposed{};
+ TensorInfo rhs_transposed{};
+
+ auto gemm_info = AsmGemmInfo();
+ gemm_info.activation_info = act_info;
+ gemm_info.fast_mode = settings.fast_math();
+ gemm_info.fixed_format = settings.fixed_format();
+
+ // Validate and then permute a/b
+ if (adj_lhs)
+ {
+ auto_init_if_empty(lhs_transposed,
+ lhs->clone()->set_tensor_shape(misc::shape_calculator::compute_transposed_shape(*lhs)));
+ ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuTransposeKernel::validate(lhs_to_use, &lhs_transposed));
+ // Assign lhs_to_use pointer to use transposed TensorInfo
+ lhs_to_use = &lhs_transposed;
+ }
+ if (adj_rhs)
+ {
+ auto_init_if_empty(rhs_transposed,
+ rhs->clone()->set_tensor_shape(misc::shape_calculator::compute_transposed_shape(*rhs)));
+ ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuTransposeKernel::validate(rhs_to_use, &rhs_transposed));
+ // Assign rhs_to_use pointer to use transposed TensorInfo
+ rhs_to_use = &rhs_transposed;
+ }
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs_to_use->dimension(0) != rhs_to_use->dimension(1),
+ "The product AB is defined only if the number of columns in A is equal to the "
+ "number of rows in B (after transpose)");
+
+ // Iterate over dimensions to be collapsed in operator - check dimensions are equivalent between tensors
+ for (unsigned int i = 2; i < Coordinates::num_max_dimensions; i++)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs_to_use->dimension(i) != rhs_to_use->dimension(i),
+ "Broadcasting in Batch dimension is unsupported by this operator.");
+ }
+
+ // Quantized-specific configuration
+ if (is_data_type_quantized(lhs->data_type()))
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(get_gemmlowp_output_stage_info(lhs_to_use, rhs_to_use, dst,
+ gemm_info.activation_info, gemm_info.output_stage));
+ }
+
+ if (gemm_info.fixed_format)
+ {
+ gemm_info.weight_format = WeightFormat::ANY;
+ arm_compute::WeightFormat expected_weight_format = WeightFormat::ANY;
+ ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuGemmAssemblyDispatch::has_opt_impl(expected_weight_format, lhs_to_use,
+ rhs_to_use, nullptr, dst, gemm_info));
+ }
+
+ cpu::CpuGemmAssemblyDispatch::validate(lhs_to_use, rhs_to_use, nullptr, dst, gemm_info);
+
+ return Status{};
+}
+
+void CpuMatMul::configure(ITensorInfo *lhs,
+ ITensorInfo *rhs,
+ ITensorInfo *dst,
+ const MatMulInfo &info,
+ const CpuMatMulSettings &settings,
+ const ActivationLayerInfo &act_info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst);
+ ARM_COMPUTE_LOG_PARAMS(lhs, rhs, dst, info, settings);
+ ARM_COMPUTE_ERROR_THROW_ON(CpuMatMul::validate(lhs, rhs, dst, info, settings));
+
+ _adj_lhs = info.adj_lhs();
+ _adj_rhs = info.adj_rhs();
+ _fast_math = settings.fast_math();
+
+ // 1. Create and reshape tensors
+ // ------------------------------------------------------
+ // a. Clone TensorInfo to prevent changing original tensor values during setup
+ // b. Change shape of lhs/dst to [x, y, 1, collapsed(z)] to match assembly kernel configuration
+ // c. For rhs collapse all dimensions larger than 3 to z dimension
+ TensorInfo lhs_to_use = *lhs->clone();
+ TensorInfo dst_to_use = *dst->clone();
+ TensorInfo rhs_to_use = *rhs->clone();
+
+ // Save starting shape of tensors
+ _original_lhs_shape = lhs_to_use.tensor_shape();
+ _original_dst_shape = dst_to_use.tensor_shape();
+ _original_rhs_shape = rhs_to_use.tensor_shape();
+
+ // Reshape lhs for use with assembly kernels.
+ lhs_to_use.set_tensor_shape(
+ TensorShape(_original_lhs_shape.x(), _original_lhs_shape.y(), 1, _original_lhs_shape.collapsed_from(2).z()));
+ dst_to_use.set_tensor_shape(
+ TensorShape(_original_dst_shape.x(), _original_dst_shape.y(), 1, _original_dst_shape.collapsed_from(2).z()));
+ rhs_to_use.set_tensor_shape(_original_rhs_shape.collapsed_from(2));
+
+ // 2. Configuration for transpose of lhs/rhs
+ // ------------------------------------------------------
+ // Initialise transposed TensorInfo objects for the aux tensors (intermediate tensors)
+ if (_adj_lhs)
+ {
+ // Setup transpose LHS
+ _transpose_kernel_lhs = std::make_unique<cpu::kernels::CpuTransposeKernel>();
+ _transpose_kernel_lhs->configure(&lhs_to_use, &_lhs_transposed);
+ }
+
+ if (_adj_rhs)
+ {
+ // Setup transpose RHS
+ _transpose_kernel_rhs = std::make_unique<cpu::kernels::CpuTransposeKernel>();
+ _transpose_kernel_rhs->configure(&rhs_to_use, &_rhs_transposed);
+ }
+
+ // 3. Configure assembly kernel using transposed tensors.
+ // -----------------------------------------------------
+ // Use transposed tensors if the corresponding transpose flags are set
+ // Fill AsmGemmInfo class object before configuration
+ _gemm_info.activation_info = act_info;
+ _gemm_info.fast_mode = settings.fast_math();
+ _gemm_info.fixed_format = settings.fixed_format();
+ _gemm_info.negated_offsets = false;
+
+ lhs_to_use = (_adj_lhs) ? _lhs_transposed : lhs_to_use;
+ rhs_to_use = (_adj_rhs) ? _rhs_transposed : rhs_to_use;
+
+ // Quantized-specific configuration
+ if (is_data_type_quantized(lhs->data_type()))
+ {
+ get_gemmlowp_output_stage_info(&lhs_to_use, &rhs_to_use, &dst_to_use, _gemm_info.activation_info,
+ _gemm_info.output_stage);
+ }
+
+ if (_gemm_info.fixed_format)
+ {
+ _gemm_info.weight_format = WeightFormat::ANY;
+ arm_compute::WeightFormat expected_weight_format = WeightFormat::ANY;
+ ARM_COMPUTE_ERROR_THROW_ON(cpu::CpuGemmAssemblyDispatch::has_opt_impl(expected_weight_format, &lhs_to_use,
+ &rhs_to_use, nullptr, dst, _gemm_info));
+ // Set gemm weights info to the one returned by has_opt_impl
+ _gemm_info.weight_format = expected_weight_format;
+ // has_opt_impl may return a non fast math kernel, even if we requested one
+ _gemm_info.fast_mode = arm_compute::is_fixed_format_fast_math(expected_weight_format);
+ }
+
+ // Configure Asm Kernel
+ _asm_glue = std::make_unique<cpu::CpuGemmAssemblyDispatch>();
+ _asm_glue->configure(&lhs_to_use, &rhs_to_use, nullptr, &dst_to_use,
+ _gemm_info); // c is nullptr as bias not supported in MatMul
+
+ // Specify memory requirements for intermediate tensors
+ auto asm_mem_req = _asm_glue->workspace();
+ // Specify memory required by gemm kernel
+ int idx = 0;
+ for (const auto &aux : asm_mem_req)
+ {
+ _aux_mem[idx] = aux;
+ idx++;
+ }
+ // Memory requirements for transposed tensors
+ _aux_mem[TransposeLHS] = MemoryInfo(offset_int_vec(TransposeLHS), MemoryLifetime::Temporary, lhs->total_size());
+ _aux_mem[TransposeRHS] = MemoryInfo(offset_int_vec(TransposeRHS), MemoryLifetime::Temporary, rhs->total_size());
+}
+
+void CpuMatMul::run(ITensorPack &tensors)
+{
+ // Retrieve tensors from tensor pack
+ auto lhs = tensors.get_tensor(ACL_SRC_0);
+ auto rhs = tensors.get_const_tensor(ACL_SRC_1);
+ auto dst = tensors.get_tensor(ACL_DST);
+
+ // Reshape LHS and DST to ensure compatibility with the GEMM asm kernel (the batch dimension is the 4th dimension for lhs and dst within asm)
+ // Collapse RHS (necessary to support dimensions larger than 3 in gemm assembly)
+ lhs->info()->set_tensor_shape(
+ TensorShape(_original_lhs_shape.x(), _original_lhs_shape.y(), 1,
+ _original_lhs_shape.collapsed_from(2).z())); // Collapsed 3+ dimensions into z
+ dst->info()->set_tensor_shape(
+ TensorShape(_original_dst_shape.x(), _original_dst_shape.y(), 1,
+ _original_dst_shape.collapsed_from(2).z())); // Collapsed 3+ dimensions into z
+ rhs->info()->set_tensor_shape(_original_rhs_shape.collapsed_from(2));
+
+ // Initialise objects that handle the transposed tensors stored in auxiliary memory
+ CpuAuxTensorHandler lhs_transposed(offset_int_vec(TransposeLHS), _lhs_transposed, tensors, true);
+ CpuAuxTensorHandler rhs_transposed(offset_int_vec(TransposeRHS), _rhs_transposed, tensors, true);
+
+ // Create tensor pack for asm kernel
+ ITensorPack asm_tensors(tensors);
+
+ // Run transpose lhs if necessary
+ if (_adj_lhs)
+ {
+ ITensorPack lhs_transpose_pack = {{TensorType::ACL_SRC, lhs}, {TensorType::ACL_DST, lhs_transposed.get()}};
+ NEScheduler::get().schedule_op(_transpose_kernel_lhs.get(), Window::DimY, _transpose_kernel_lhs->window(),
+ lhs_transpose_pack);
+ asm_tensors.add_const_tensor(TensorType::ACL_SRC_0, lhs_transposed.get());
+ }
+ // Run transpose rhs if necessary
+ if (_adj_rhs)
+ {
+ ITensorPack rhs_transpose_pack = {{TensorType::ACL_SRC, rhs}, {TensorType::ACL_DST, rhs_transposed.get()}};
+ NEScheduler::get().schedule_op(_transpose_kernel_rhs.get(), Window::DimY, _transpose_kernel_rhs->window(),
+ rhs_transpose_pack);
+ asm_tensors.add_const_tensor(TensorType::ACL_SRC_1, rhs_transposed.get());
+ }
+ // Run asm kernel
+ _asm_glue->run(asm_tensors);
+
+ // Undo reshape of tensors
+ dst->info()->set_tensor_shape(_original_dst_shape);
+ lhs->info()->set_tensor_shape(_original_lhs_shape);
+ rhs->info()->set_tensor_shape(_original_rhs_shape);
+}
+
+experimental::MemoryRequirements CpuMatMul::workspace() const
+{
+ return _aux_mem;
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/operators/CpuMatMul.h b/src/cpu/operators/CpuMatMul.h
new file mode 100644
index 0000000000..2b1b4cf0ff
--- /dev/null
+++ b/src/cpu/operators/CpuMatMul.h
@@ -0,0 +1,126 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_CPU_OPERATORS_CPUMATMUL_H
+#define ACL_SRC_CPU_OPERATORS_CPUMATMUL_H
+
+#include "arm_compute/core/TensorInfo.h"
+
+#include "src/core/common/Macros.h"
+#include "src/cpu/ICpuOperator.h"
+#include "src/cpu/kernels/CpuTransposeKernel.h"
+#include "src/cpu/operators/internal/CpuGemmAssemblyDispatch.h"
+
+namespace arm_compute
+{
+// Forward Declarations
+class MatMulInfo;
+class CpuMatMulSettings;
+
+namespace cpu
+{
+/** Function to execute MatMul Operation. This function calls the following functions/kernels:
+ *
+ * If the adjoint/adj flag is enabled for either input lhs or rhs (or both):
+ * -# @ref cpu::kernels::CpuTransposeKernel
+ * Then:
+ * -# @ref cpu::CpuGemmAssemblyDispatch
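+ *
+ * A minimal configuration sketch for the float path (shapes and settings are illustrative only; at run time
+ * the tensors are bound through an ITensorPack and the workspace() requirements must be honoured):
+ * @code
+ * // lhs: [K, M] = [8, 4], rhs: [N, K] = [2, 8], dst: [N, M] = [2, 4]
+ * TensorInfo lhs(TensorShape(8U, 4U), 1, DataType::F32);
+ * TensorInfo rhs(TensorShape(2U, 8U), 1, DataType::F32);
+ * TensorInfo dst(TensorShape(2U, 4U), 1, DataType::F32);
+ * lhs.set_are_values_constant(false); // MatMul expects dynamic (non-constant) inputs
+ * rhs.set_are_values_constant(false);
+ *
+ * MatMulInfo        mm_info{};  // adj_lhs/adj_rhs default to false
+ * CpuMatMulSettings settings{}; // optionally enable fast math / fixed format here
+ * CpuMatMul         mm;
+ * ARM_COMPUTE_ERROR_THROW_ON(CpuMatMul::validate(&lhs, &rhs, &dst, mm_info, settings));
+ * mm.configure(&lhs, &rhs, &dst, mm_info, settings);
+ * @endcode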
+ */
+class CpuMatMul : public ICpuOperator
+{
+public:
+ /** Constructor */
+ CpuMatMul();
+ /** Destructor */
+ ~CpuMatMul() = default;
+
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuMatMul);
+ /** Configure operator for a given list of arguments
+ *
+ * Note: Check documentation of @ref NEMatMul for a list of supported datatypes and layouts
+ *
+ *
+ * @param[in] lhs Left-hand side tensor info.
+ * @param[in] rhs Right-hand side tensor info.
+ * @param[out] dst Output tensor to store the result of the batched matrix multiplication. Data types supported: same as @p lhs / @p rhs.
+ * @param[in] info Contains MatMul operation information described in @ref MatMulInfo.
+ * @param[in] settings The settings for the MatMul operation (i.e. fast math)
+ * @param[in] act_info Class containing information about fused activation function.
+ */
+ void configure(ITensorInfo *lhs,
+ ITensorInfo *rhs,
+ ITensorInfo *dst,
+ const MatMulInfo &info,
+ const CpuMatMulSettings &settings,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to CpuMatMul::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *lhs,
+ const ITensorInfo *rhs,
+ const ITensorInfo *dst,
+ const MatMulInfo &info,
+ const CpuMatMulSettings &settings,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+ // Inherited methods overridden:
+ void run(ITensorPack &tensors) override;
+ experimental::MemoryRequirements workspace() const override;
+
+private:
+ enum InternalTensorIdx
+ {
+ /* Slots 0 - 2 reserved for CpuGemmAssemblyDispatch */
+ TransposeLHS = 3,
+ TransposeRHS,
+ Count
+ };
+
+ // Define unique pointers to kernels/operators used by matmul
+ std::unique_ptr<kernels::CpuTransposeKernel> _transpose_kernel_lhs{nullptr};
+ std::unique_ptr<kernels::CpuTransposeKernel> _transpose_kernel_rhs{nullptr};
+ std::unique_ptr<CpuGemmAssemblyDispatch> _asm_glue{nullptr};
+
+ // TensorInfo for tensors stored in auxiliary memory
+ TensorInfo _lhs_transposed{};
+ TensorInfo _rhs_transposed{};
+
+ // Original tensor shapes prior to reshaping tensors and collapsing dimensions
+ TensorShape _original_lhs_shape{};
+ TensorShape _original_rhs_shape{};
+ TensorShape _original_dst_shape{};
+
+ // Note: adj_lhs is equivalent to transposing lhs
+ bool _adj_lhs{false};
+ bool _adj_rhs{false};
+ bool _fast_math{false};
+ AsmGemmInfo _gemm_info{};
+ experimental::MemoryRequirements _aux_mem{Count};
+};
+} // namespace cpu
+} // namespace arm_compute
+
+#endif // ACL_SRC_CPU_OPERATORS_CPUMATMUL_H
diff --git a/src/cpu/operators/CpuMaxUnpooling.cpp b/src/cpu/operators/CpuMaxUnpooling.cpp
new file mode 100644
index 0000000000..697fc40ab3
--- /dev/null
+++ b/src/cpu/operators/CpuMaxUnpooling.cpp
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2018-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/operators/CpuMaxUnpooling.h"
+
+#include "src/common/utils/Log.h"
+#include "src/cpu/kernels/CpuMaxUnpoolingLayerKernel.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void CpuMaxUnpooling::configure(const ITensorInfo *src,
+ const ITensorInfo *indices,
+ ITensorInfo *dst,
+ const PoolingLayerInfo &pool_info)
+{
+ ARM_COMPUTE_LOG_PARAMS(src, indices, dst, pool_info);
+ auto k = std::make_unique<kernels::CpuMaxUnpoolingLayerKernel>();
+ k->configure(src, indices, dst, pool_info);
+ _kernel = std::move(k);
+}
+
+Status CpuMaxUnpooling::validate(const ITensorInfo *src,
+ const ITensorInfo *indices,
+ const ITensorInfo *dst,
+ const PoolingLayerInfo &pool_info)
+{
+ return kernels::CpuMaxUnpoolingLayerKernel::validate(src, indices, dst, pool_info);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/operators/CpuMaxUnpooling.h b/src/cpu/operators/CpuMaxUnpooling.h
new file mode 100644
index 0000000000..5dc00bce9e
--- /dev/null
+++ b/src/cpu/operators/CpuMaxUnpooling.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2021-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_MAXUNPOOLING_H
+#define ARM_COMPUTE_CPU_MAXUNPOOLING_H
+
+#include "src/cpu/ICpuOperator.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+/** Basic function to run @ref kernels::CpuMaxUnpoolingLayerKernel */
+class CpuMaxUnpooling : public ICpuOperator
+{
+public:
+ /** Configure operator for a given list of arguments
+ *
+ * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+ * @param[in] indices Tensor containing the offset to store the src elements in the dst tensor.
+ * A max pooling operation that outputs indices (e.g. @ref CpuPool2d) should precede this function in order to
+ * properly reconstruct the output tensor.
+ * The tensor shape of this tensor has to be equal to the src tensor shape. Data type supported: U32.
+ * @param[out] dst Destination tensor. Data types supported: Same as @p src
+ * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
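+ *
+ * A minimal configuration sketch (shapes assume a 2x2, stride-2 max pooling of an 8x8x8 input and are
+ * illustrative only):
+ * @code
+ * TensorInfo src(TensorShape(4U, 4U, 8U), 1, DataType::F32);     // pooled values
+ * TensorInfo indices(TensorShape(4U, 4U, 8U), 1, DataType::U32); // indices produced by the pooling step
+ * TensorInfo dst(TensorShape(8U, 8U, 8U), 1, DataType::F32);     // unpooled output
+ *
+ * PoolingLayerInfo pool_info{};
+ * pool_info.pool_type       = PoolingType::MAX;
+ * pool_info.pool_size       = Size2D(2, 2);
+ * pool_info.pad_stride_info = PadStrideInfo(2, 2, 0, 0);
+ *
+ * CpuMaxUnpooling unpool;
+ * ARM_COMPUTE_ERROR_THROW_ON(CpuMaxUnpooling::validate(&src, &indices, &dst, pool_info));
+ * unpool.configure(&src, &indices, &dst, pool_info);
+ * @endcode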
+ */
+ void
+ configure(const ITensorInfo *src, const ITensorInfo *indices, ITensorInfo *dst, const PoolingLayerInfo &pool_info);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref CpuMaxUnpooling::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *indices,
+ const ITensorInfo *dst,
+ const PoolingLayerInfo &pool_info);
+};
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_MAXUNPOOLING_H */
diff --git a/src/cpu/operators/CpuMul.cpp b/src/cpu/operators/CpuMul.cpp
new file mode 100644
index 0000000000..ac9847111d
--- /dev/null
+++ b/src/cpu/operators/CpuMul.cpp
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2016-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/operators/CpuMul.h"
+
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+#include "src/common/utils/Log.h"
+#include "src/cpu/kernels/CpuMulKernel.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+Status CpuMul::validate(const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ float scale,
+ ConvertPolicy overflow_policy,
+ RoundingPolicy rounding_policy,
+ const ActivationLayerInfo &act_info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled());
+ return kernels::CpuMulKernel::validate(src1, src2, dst, scale, overflow_policy, rounding_policy);
+}
+
+void CpuMul::configure(ITensorInfo *src1,
+ ITensorInfo *src2,
+ ITensorInfo *dst,
+ float scale,
+ ConvertPolicy overflow_policy,
+ RoundingPolicy rounding_policy,
+ const ActivationLayerInfo &act_info)
+{
+ ARM_COMPUTE_UNUSED(act_info);
+ ARM_COMPUTE_LOG_PARAMS(src1, src2, dst, scale, overflow_policy, rounding_policy, act_info);
+
+ auto k = std::make_unique<kernels::CpuMulKernel>();
+ k->configure(src1, src2, dst, scale, overflow_policy, rounding_policy);
+ _kernel = std::move(k);
+}
+
+void CpuMul::run(ITensorPack &tensors)
+{
+ ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided");
+ auto split_dimension = static_cast<kernels::CpuMulKernel *>(_kernel.get())->get_split_dimension_hint();
+ NEScheduler::get().schedule_op(_kernel.get(), split_dimension, _kernel->window(), tensors);
+}
+
+Status CpuComplexMul::validate(const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ const ActivationLayerInfo &act_info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled());
+ return kernels::CpuComplexMulKernel::validate(src1, src2, dst);
+}
+
+void CpuComplexMul::configure(ITensorInfo *src1,
+ ITensorInfo *src2,
+ ITensorInfo *dst,
+ const ActivationLayerInfo &act_info)
+{
+ ARM_COMPUTE_UNUSED(act_info);
+ ARM_COMPUTE_LOG_PARAMS(src1, src2, dst, act_info);
+
+ auto k = std::make_unique<kernels::CpuComplexMulKernel>();
+ k->configure(src1, src2, dst);
+ _kernel = std::move(k);
+}
+
+void CpuComplexMul::run(ITensorPack &tensors)
+{
+ ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided");
+ NEScheduler::get().schedule_op(_kernel.get(), Window::DimY, _kernel->window(), tensors);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/operators/CpuMul.h b/src/cpu/operators/CpuMul.h
new file mode 100644
index 0000000000..82b309830b
--- /dev/null
+++ b/src/cpu/operators/CpuMul.h
@@ -0,0 +1,123 @@
+/*
+ * Copyright (c) 2016-2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_MUL_H
+#define ARM_COMPUTE_CPU_MUL_H
+
+#include "arm_compute/core/ITensorInfo.h"
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+
+#include "src/cpu/ICpuOperator.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+/** Basic function to run @ref kernels::CpuMulKernel */
+class CpuMul : public ICpuOperator
+{
+public:
+ /** Initialise the kernel's inputs, dst and conversion policy.
+ *
+ * @note For @p scale equal to 1/255 only round to nearest even (implemented as round half up) is supported.
+ * For all other scale values only round to zero (implemented as round towards minus infinity) is supported.
+ *
+ * @param[in, out] src1 First input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/QSYMM16/F16/F32
+ * This input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
+ * @param[in, out] src2 Second input tensor info. Data types supported: U8, QASYMM8 (only if @p src1 is QASYMM8), QASYMM8_SIGNED (only if @p src1 is QASYMM8_SIGNED), S16, S32, QSYMM16 (only if @p src1 is QSYMM16), F16 (only if @p src1 is F16), F32 (only if @p src1 is F32).
+ * This input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
+ * @param[out] dst Output tensor info. Data types supported:
+ * - U8, only if both inputs are U8.
+ * - QASYMM8, only if both inputs are QASYMM8.
+ * - QASYMM8_SIGNED, only if @p src1 is QASYMM8_SIGNED.
+ * - S16.
+ * - QSYMM16, only if both inputs are QSYMM16.
+ * - S32, only if both inputs are S32 or both are QSYMM16.
+ * - F16, only if @p src1 is F16.
+ * - F32, only if both inputs are F32.
+ * @param[in] scale Scale to apply after multiplication.
+ * Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15.
+ * If both @p src1, @p src2 and @p dst are of datatype S32, scale cannot be 1/255
+ * @param[in] overflow_policy Overflow policy. ConvertPolicy cannot be WRAP if any of the inputs is of quantized datatype
+ * @param[in] rounding_policy Rounding policy.
+ * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported.
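+ *
+ * A minimal configuration sketch (shapes, scale and policies are illustrative; at run time the tensors are
+ * bound through an ITensorPack):
+ * @code
+ * TensorInfo src1(TensorShape(16U, 16U), 1, DataType::F32);
+ * TensorInfo src2(TensorShape(16U, 16U), 1, DataType::F32);
+ * TensorInfo dst(TensorShape(16U, 16U), 1, DataType::F32);
+ *
+ * CpuMul mul;
+ * ARM_COMPUTE_ERROR_THROW_ON(CpuMul::validate(&src1, &src2, &dst, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+ * mul.configure(&src1, &src2, &dst, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+ * @endcode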
+ */
+ void configure(ITensorInfo *src1,
+ ITensorInfo *src2,
+ ITensorInfo *dst,
+ float scale,
+ ConvertPolicy overflow_policy,
+ RoundingPolicy rounding_policy,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref CpuMul::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ float scale,
+ ConvertPolicy overflow_policy,
+ RoundingPolicy rounding_policy,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+ // Inherited methods overridden:
+ void run(ITensorPack &tensors) override;
+};
+
+/** Basic function to run @ref kernels::CpuComplexMulKernel */
+class CpuComplexMul : public ICpuOperator
+{
+public:
+ /** Initialise the kernel's inputs and dst.
+ *
+ * @param[in, out] src1 First input tensor. Data types supported: F32. Number of channels supported: 2 (complex tensor).
+ * The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
+ * @param[in, out] src2 Second input tensor. Data types supported: same as @p src1. Number of channels supported: same as @p src1.
+ * The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
+ * @param[out] dst The dst tensor. Data types supported: same as @p src1. Number of channels: same as @p src1.
+ * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported.
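+ *
+ * A minimal configuration sketch (shapes are illustrative; each element is a complex value stored as two F32 channels):
+ * @code
+ * TensorInfo src1(TensorShape(16U, 16U), 2, DataType::F32);
+ * TensorInfo src2(TensorShape(16U, 16U), 2, DataType::F32);
+ * TensorInfo dst(TensorShape(16U, 16U), 2, DataType::F32);
+ *
+ * CpuComplexMul cmul;
+ * ARM_COMPUTE_ERROR_THROW_ON(CpuComplexMul::validate(&src1, &src2, &dst));
+ * cmul.configure(&src1, &src2, &dst);
+ * @endcode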
+ */
+ void configure(ITensorInfo *src1,
+ ITensorInfo *src2,
+ ITensorInfo *dst,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref CpuComplexMul::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+ // Inherited methods overridden:
+ void run(ITensorPack &tensors) override;
+};
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_MUL_H */
diff --git a/src/cpu/operators/CpuPRelu.h b/src/cpu/operators/CpuPRelu.h
new file mode 100644
index 0000000000..084474e2ba
--- /dev/null
+++ b/src/cpu/operators/CpuPRelu.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_PRELU_H
+#define ARM_COMPUTE_CPU_PRELU_H
+
+#include "src/cpu/operators/CpuElementwise.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+/** Class to run @ref cpu::kernels::CpuArithmeticKernel for the PRELU operation */
+using CpuPRelu = CpuElementwiseArithmetic<ArithmeticOperation::PRELU>;
+} // namespace cpu
+} // namespace arm_compute
+
+#endif /* ARM_COMPUTE_CPU_PRELU_H */ \ No newline at end of file
diff --git a/src/cpu/operators/CpuPermute.cpp b/src/cpu/operators/CpuPermute.cpp
new file mode 100644
index 0000000000..25acc92d00
--- /dev/null
+++ b/src/cpu/operators/CpuPermute.cpp
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/operators/CpuPermute.h"
+
+#include "src/common/utils/Log.h"
+#include "src/cpu/kernels/CpuPermuteKernel.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void CpuPermute::configure(const ITensorInfo *src, ITensorInfo *dst, const PermutationVector &perm)
+{
+ ARM_COMPUTE_LOG_PARAMS(src, dst, perm);
+ auto k = std::make_unique<kernels::CpuPermuteKernel>();
+ k->configure(src, dst, perm);
+ _kernel = std::move(k);
+}
+
+Status CpuPermute::validate(const ITensorInfo *src, const ITensorInfo *dst, const PermutationVector &perm)
+{
+ return kernels::CpuPermuteKernel::validate(src, dst, perm);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/operators/CpuPermute.h b/src/cpu/operators/CpuPermute.h
new file mode 100644
index 0000000000..0e0f3ae8db
--- /dev/null
+++ b/src/cpu/operators/CpuPermute.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_PERMUTE_H
+#define ARM_COMPUTE_CPU_PERMUTE_H
+
+#include "src/cpu/ICpuOperator.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+/** Basic function to run @ref kernels::CpuPermuteKernel */
+class CpuPermute : public ICpuOperator
+{
+public:
+ /** Configure operator for a given list of arguments
+ *
+ * @note Arbitrary permutation vectors are supported with rank not greater than 4
+ *
+ * @param[in] src Source tensor to permute. Data types supported: All
+ * @param[out] dst Destination tensor. Data types supported: Same as @p src
+ * @param[in] perm Permutation vector
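+ *
+ * A minimal configuration sketch (shapes are illustrative; PermutationVector(2U, 0U, 1U) is the vector
+ * conventionally used in the library to rearrange NCHW data into NHWC):
+ * @code
+ * // src is [W, H, C] = [224, 224, 3] (NCHW); dst becomes [C, W, H] = [3, 224, 224] (NHWC)
+ * TensorInfo src(TensorShape(224U, 224U, 3U), 1, DataType::F32);
+ * TensorInfo dst(TensorShape(3U, 224U, 224U), 1, DataType::F32);
+ *
+ * CpuPermute permute;
+ * ARM_COMPUTE_ERROR_THROW_ON(CpuPermute::validate(&src, &dst, PermutationVector(2U, 0U, 1U)));
+ * permute.configure(&src, &dst, PermutationVector(2U, 0U, 1U));
+ * @endcode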
+ */
+ void configure(const ITensorInfo *src, ITensorInfo *dst, const PermutationVector &perm);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref CpuPermute::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PermutationVector &perm);
+};
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_PERMUTE_H */
diff --git a/src/cpu/operators/CpuPool2d.cpp b/src/cpu/operators/CpuPool2d.cpp
new file mode 100644
index 0000000000..b72bde6978
--- /dev/null
+++ b/src/cpu/operators/CpuPool2d.cpp
@@ -0,0 +1,145 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/operators/CpuPool2d.h"
+
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+#include "src/common/utils/Log.h"
+#include "src/cpu/kernels/CpuPool2dKernel.h"
+#include "src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.h"
+
+using namespace arm_compute::experimental;
+
+namespace arm_compute
+{
+namespace cpu
+{
+CpuPool2d::CpuPool2d()
+ : _pooling_layer_kernel(),
+ _asm_glue(),
+ _is_global_pooling_layer(false),
+ _use_kernel_indices(false),
+ _data_layout(DataLayout::NCHW),
+ _aux_mem(1)
+{
+}
+
+CpuPool2d::~CpuPool2d() = default;
+
+void CpuPool2d::configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices)
+{
+ ARM_COMPUTE_LOG_PARAMS(src, dst, pool_info, indices);
+
+ // Check if we can run assembly kernels. Currently, indices are not supported by those kernels
+ const bool run_optimised =
+ bool(kernels::CpuPool2dAssemblyWrapperKernel::validate(src, dst, pool_info)) && (indices == nullptr);
+
+ // Get data layout
+ _data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout;
+
+ // Check if we have Global Pooling Layer
+ const unsigned int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
+ const unsigned int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
+ _is_global_pooling_layer = (src->dimension(idx_width) == pool_info.pool_size.width) &&
+ (src->dimension(idx_height) == pool_info.pool_size.height);
+ _use_kernel_indices = pool_info.use_kernel_indices;
+
+ if (run_optimised)
+ {
+ const CPUInfo &ci = NEScheduler::get().cpu_info();
+ const unsigned int num_threads = NEScheduler::get().num_threads();
+
+ auto pooling_wrapper = std::make_unique<kernels::CpuPool2dAssemblyWrapperKernel>();
+ ARM_COMPUTE_ERROR_ON(pooling_wrapper == nullptr);
+ pooling_wrapper->configure(src, dst, pool_info, ci);
+
+ // Get kernel's memory requirements
+ constexpr size_t alignment = 4096;
+ const size_t workspace_size = pooling_wrapper->get_working_size(num_threads);
+ _aux_mem[0] = MemoryInfo(TensorType::ACL_INT_0, MemoryLifetime::Temporary, workspace_size, alignment);
+
+ _asm_glue = std::move(pooling_wrapper);
+ }
+ else
+ {
+ // Configure pooling kernel
+ auto k = std::make_unique<kernels::CpuPool2dKernel>();
+ k->configure(src, dst, pool_info, indices);
+ _pooling_layer_kernel = std::move(k);
+ }
+}
+
+Status CpuPool2d::validate(const ITensorInfo *src,
+ const ITensorInfo *dst,
+ const PoolingLayerInfo &pool_info,
+ const ITensorInfo *indices)
+{
+ const bool run_optimised =
+ bool(kernels::CpuPool2dAssemblyWrapperKernel::validate(src, dst, pool_info)) && (indices == nullptr);
+
+ if (run_optimised)
+ {
+ return Status{};
+ }
+
+ return kernels::CpuPool2dKernel::validate(src, dst, pool_info, indices);
+}
+
+void CpuPool2d::run(ITensorPack &tensors)
+{
+ ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No tensors provided");
+
+ if (_asm_glue)
+ {
+ const auto hints = (_is_global_pooling_layer) ? Window::DimX : Window::DimY;
+ NEScheduler::get().schedule_op(_asm_glue.get(), hints, _asm_glue->window(), tensors);
+ }
+ else
+ {
+ switch (_data_layout)
+ {
+ case DataLayout::NCHW:
+ NEScheduler::get().schedule_op(_pooling_layer_kernel.get(),
+ _is_global_pooling_layer ? Window::DimZ : Window::DimY,
+ _pooling_layer_kernel->window(), tensors);
+ break;
+ case DataLayout::NHWC:
+ NEScheduler::get().schedule_op(_pooling_layer_kernel.get(),
+ (_use_kernel_indices ? Window::DimY : Window::DimX),
+ _pooling_layer_kernel->window(), tensors);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Data layout not supported");
+ }
+ }
+}
+
+experimental::MemoryRequirements CpuPool2d::workspace() const
+{
+ return _aux_mem;
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/operators/CpuPool2d.h b/src/cpu/operators/CpuPool2d.h
new file mode 100644
index 0000000000..ea73e3f335
--- /dev/null
+++ b/src/cpu/operators/CpuPool2d.h
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_POOL2D_H
+#define ARM_COMPUTE_CPU_POOL2D_H
+
+#include "arm_compute/core/experimental/Types.h"
+
+#include "src/core/common/Macros.h"
+#include "src/cpu/ICpuOperator.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+// Forward Declarations
+struct PoolingLayerInfo;
+
+namespace cpu
+{
+/** Basic function to simulate a pooling layer with the specified pooling operation. This function calls the following kernels:
+ *
+ * -# @ref NEFillBorderKernel (executed if padding size is different from zero)
+ * -# @ref kernels::CpuPool2dKernel
+ * -# @ref kernels::CpuPool2dAssemblyWrapperKernel
+ */
+class CpuPool2d : public ICpuOperator
+{
+public:
+ CpuPool2d();
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuPool2d);
+ ~CpuPool2d();
+ /** Set the src and dst tensors.
+ *
+ * @note F16 is supported for pool sizes 2 and 3 only
+ *
+ * @param[in, out] src Source tensor info. (Written to only when padding != 0) Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+ * @param[out] dst Destination tensor info. Data types supported: same as @p src.
+ * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
+ * @param[out] indices (Optional) The indices of the maximal values. Data type supported: U32.
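+ *
+ * A minimal configuration sketch (shapes and pooling parameters are illustrative):
+ * @code
+ * // 2x2 max pooling with stride 2 on an NHWC tensor of shape [C, W, H] = [16, 32, 32]
+ * TensorInfo src(TensorShape(16U, 32U, 32U), 1, DataType::F32);
+ * TensorInfo dst(TensorShape(16U, 16U, 16U), 1, DataType::F32);
+ * src.set_data_layout(DataLayout::NHWC);
+ * dst.set_data_layout(DataLayout::NHWC);
+ *
+ * PoolingLayerInfo pool_info{};
+ * pool_info.pool_type       = PoolingType::MAX;
+ * pool_info.pool_size       = Size2D(2, 2);
+ * pool_info.pad_stride_info = PadStrideInfo(2, 2, 0, 0);
+ * pool_info.data_layout     = DataLayout::NHWC;
+ *
+ * CpuPool2d pool;
+ * ARM_COMPUTE_ERROR_THROW_ON(CpuPool2d::validate(&src, &dst, pool_info));
+ * pool.configure(&src, &dst, pool_info);
+ * // Temporary memory needed by the assembly path, if any, is reported by pool.workspace().
+ * @endcode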
+ */
+ void
+ configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices = nullptr);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to CpuPool2d::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *dst,
+ const PoolingLayerInfo &pool_info,
+ const ITensorInfo *indices = nullptr);
+
+ // Inherited methods overridden:
+ void run(ITensorPack &tensors) override;
+ experimental::MemoryRequirements workspace() const override;
+
+private:
+ std::unique_ptr<INEKernel> _pooling_layer_kernel;
+ std::unique_ptr<INEKernel> _asm_glue;
+
+ bool _is_global_pooling_layer;
+ bool _use_kernel_indices;
+ DataLayout _data_layout;
+ experimental::MemoryRequirements _aux_mem{};
+};
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_POOL2D_H */
diff --git a/src/cpu/operators/CpuPool3d.cpp b/src/cpu/operators/CpuPool3d.cpp
new file mode 100644
index 0000000000..7fa78c1f80
--- /dev/null
+++ b/src/cpu/operators/CpuPool3d.cpp
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/operators/CpuPool3d.h"
+
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/runtime/Scheduler.h"
+
+#include "src/common/utils/Log.h"
+#include "src/cpu/kernels/CpuPool3dKernel.h"
+
+using namespace arm_compute::experimental;
+
+namespace arm_compute
+{
+namespace cpu
+{
+CpuPool3d::CpuPool3d() : _aux_mem(1)
+{
+}
+
+CpuPool3d::~CpuPool3d() = default;
+
+void CpuPool3d::configure(const ITensorInfo *src, ITensorInfo *dst, const Pooling3dLayerInfo &pool_info)
+{
+ ARM_COMPUTE_LOG_PARAMS(src, dst, pool_info);
+
+ // Configure pooling kernel
+ auto k = std::make_unique<kernels::CpuPool3dKernel>();
+ k->configure(src, dst, pool_info);
+ _kernel = std::move(k);
+}
+
+Status CpuPool3d::validate(const ITensorInfo *src, const ITensorInfo *dst, const Pooling3dLayerInfo &pool_info)
+{
+ return kernels::CpuPool3dKernel::validate(src, dst, pool_info);
+}
+
+void CpuPool3d::run(ITensorPack &tensors)
+{
+ ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No tensors provided");
+
+ Scheduler::get().schedule_op(_kernel.get(), Window::DimY, _kernel->window(), tensors);
+}
+
+experimental::MemoryRequirements CpuPool3d::workspace() const
+{
+ return _aux_mem;
+}
+
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/operators/CpuPool3d.h b/src/cpu/operators/CpuPool3d.h
new file mode 100644
index 0000000000..235d798095
--- /dev/null
+++ b/src/cpu/operators/CpuPool3d.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_POOL3D_H
+#define ARM_COMPUTE_CPU_POOL3D_H
+
+#include "arm_compute/core/experimental/Types.h"
+
+#include "src/core/common/Macros.h"
+#include "src/cpu/ICpuOperator.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace cpu
+{
+/** Basic function to simulate a pooling layer with the specified pooling operation. This function calls the following kernels:
+ *
+ * -# @ref kernels::CpuPool3dKernel
+ */
+class CpuPool3d : public ICpuOperator
+{
+public:
+ CpuPool3d();
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuPool3d);
+ ~CpuPool3d();
+ /** Set the src and dst tensors.
+ *
+ *
+ * @param[in] src Source tensor info. Data types supported: F16/F32/QASYMM8/QASYMM8_SIGNED.
+ * @param[out] dst Destination tensor info. Data types supported: same as @p src.
+ * @param[in] pool_info Contains pooling operation information described in @ref Pooling3dLayerInfo.
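+ *
+ * A minimal configuration sketch (shapes and pooling parameters are illustrative; the 3D pooling kernel
+ * operates on NDHWC data):
+ * @code
+ * // 2x2x2 average pooling with stride 2 on an NDHWC tensor of shape [C, W, H, D] = [16, 8, 8, 8]
+ * TensorInfo src(TensorShape(16U, 8U, 8U, 8U), 1, DataType::F32);
+ * TensorInfo dst(TensorShape(16U, 4U, 4U, 4U), 1, DataType::F32);
+ * src.set_data_layout(DataLayout::NDHWC);
+ * dst.set_data_layout(DataLayout::NDHWC);
+ *
+ * Pooling3dLayerInfo pool_info{};
+ * pool_info.pool_type = PoolingType::AVG;
+ * pool_info.pool_size = Size3D(2, 2, 2);
+ * pool_info.stride    = Size3D(2, 2, 2);
+ *
+ * CpuPool3d pool;
+ * ARM_COMPUTE_ERROR_THROW_ON(CpuPool3d::validate(&src, &dst, pool_info));
+ * pool.configure(&src, &dst, pool_info);
+ * @endcode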
+ */
+ void configure(const ITensorInfo *src, ITensorInfo *dst, const Pooling3dLayerInfo &pool_info);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to CpuPool3d::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const Pooling3dLayerInfo &pool_info);
+
+ // Inherited methods overridden:
+ void run(ITensorPack &tensors) override;
+ experimental::MemoryRequirements workspace() const override;
+
+private:
+ experimental::MemoryRequirements _aux_mem{};
+};
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_POOL3D_H */
diff --git a/src/cpu/operators/CpuQuantize.cpp b/src/cpu/operators/CpuQuantize.cpp
new file mode 100644
index 0000000000..4a3f1827c7
--- /dev/null
+++ b/src/cpu/operators/CpuQuantize.cpp
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2021, 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/cpu/operators/CpuQuantize.h"
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+#include "src/common/utils/Log.h"
+#include "src/cpu/kernels/CpuQuantizeKernel.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+Status CpuQuantize::validate(const ITensorInfo *src, const ITensorInfo *dst)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuQuantizeKernel::validate(src, dst));
+ return Status{};
+}
+
+void CpuQuantize::configure(const ITensorInfo *src, ITensorInfo *dst)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+ ARM_COMPUTE_LOG_PARAMS(src, dst);
+
+ // Configure quantize kernel
+ auto k = std::make_unique<kernels::CpuQuantizeKernel>();
+ k->configure(src, dst);
+ _kernel = std::move(k);
+}
+
+void CpuQuantize::run(ITensorPack &tensors)
+{
+ ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided");
+ auto split_dimension = static_cast<kernels::CpuQuantizeKernel *>(_kernel.get())->get_split_dimension_hint();
+ NEScheduler::get().schedule_op(_kernel.get(), split_dimension, _kernel->window(), tensors);
+}
+} // namespace cpu
+} // namespace arm_compute
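
Note: run() above asks the kernel for a split-dimension hint and hands it to NEScheduler. The following is a hedged end-to-end sketch of driving the operator, assuming the internal header plus the public arm_compute::Tensor runtime class; quantize_example() and the shapes/quantization values are illustrative only.

    #include "arm_compute/core/Error.h"
    #include "arm_compute/core/ITensorPack.h"
    #include "arm_compute/core/QuantizationInfo.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/Tensor.h"

    #include "src/cpu/operators/CpuQuantize.h"

    using namespace arm_compute;

    void quantize_example()
    {
        // FP32 source quantized to QASYMM8 with an arbitrary scale/offset.
        TensorInfo src_info(TensorShape(32U, 32U), 1, DataType::F32);
        TensorInfo dst_info(TensorShape(32U, 32U), 1, DataType::QASYMM8);
        dst_info.set_quantization_info(QuantizationInfo(0.05f, 10));

        ARM_COMPUTE_ERROR_THROW_ON(cpu::CpuQuantize::validate(&src_info, &dst_info));

        cpu::CpuQuantize op;
        op.configure(&src_info, &dst_info);

        // Back the infos with real tensors; the operator itself holds no tensor memory.
        Tensor src, dst;
        src.allocator()->init(src_info);
        dst.allocator()->init(dst_info);
        src.allocator()->allocate();
        dst.allocator()->allocate();

        ITensorPack pack{{TensorType::ACL_SRC, &src}, {TensorType::ACL_DST, &dst}};
        op.run(pack); // schedules CpuQuantizeKernel along its preferred split dimension
    }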
diff --git a/src/cpu/operators/CpuQuantize.h b/src/cpu/operators/CpuQuantize.h
new file mode 100644
index 0000000000..ec1134fee4
--- /dev/null
+++ b/src/cpu/operators/CpuQuantize.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_QUANTIZE_H
+#define ARM_COMPUTE_CPU_QUANTIZE_H
+
+#include "src/cpu/ICpuOperator.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+/** Basic function to run @ref kernels::CpuQuantizeKernel that quantizes an input tensor */
+class CpuQuantize : public ICpuOperator
+{
+public:
+ /** Set the input and output tensors.
+ *
+ * @param[in] src Source tensor info. The dimensions over the third will be interpreted as batches. Data types supported: QASYMM8/QASYMM8_SIGNED/F32/F16.
+ * @param[out] dst  Destination tensor info with the same dimensions as @p src. Data types supported: QASYMM8/QASYMM8_SIGNED/QASYMM16
+ */
+ void configure(const ITensorInfo *src, ITensorInfo *dst);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref CpuQuantize::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
+
+ // Inherited methods overridden:
+ void run(ITensorPack &tensors) override;
+};
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_QUANTIZE_H */
diff --git a/src/cpu/operators/CpuReshape.cpp b/src/cpu/operators/CpuReshape.cpp
new file mode 100644
index 0000000000..a423abb49a
--- /dev/null
+++ b/src/cpu/operators/CpuReshape.cpp
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/operators/CpuReshape.h"
+
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+#include "src/common/utils/Log.h"
+#include "src/cpu/kernels/CpuReshapeKernel.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void CpuReshape::configure(const ITensorInfo *src, ITensorInfo *dst)
+{
+ ARM_COMPUTE_LOG_PARAMS(src, dst);
+ auto k = std::make_unique<kernels::CpuReshapeKernel>();
+ k->configure(src, dst);
+ _kernel = std::move(k);
+}
+
+Status CpuReshape::validate(const ITensorInfo *src, const ITensorInfo *dst)
+{
+ return kernels::CpuReshapeKernel::validate(src, dst);
+}
+
+void CpuReshape::run(ITensorPack &tensors)
+{
+ ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided");
+ if (!_is_prepared)
+ {
+ static_cast<kernels::CpuReshapeKernel *>(_kernel.get())->prepare(tensors);
+ _is_prepared = true;
+ }
+ const auto split_dimension = static_cast<kernels::CpuReshapeKernel *>(_kernel.get())->get_split_dimension();
+ NEScheduler::get().schedule_op(_kernel.get(), split_dimension, _kernel->window(), tensors);
+}
+} // namespace cpu
+} // namespace arm_compute
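
Note: run() above prepares the kernel on its first invocation and reuses the resolved split dimension afterwards. A hedged sketch of that prepare-once behaviour, with illustrative names and shapes, under the same assumptions as the earlier sketches:

    #include "arm_compute/core/Error.h"
    #include "arm_compute/core/ITensorPack.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/Tensor.h"

    #include "src/cpu/operators/CpuReshape.h"

    using namespace arm_compute;

    void reshape_example()
    {
        // Collapse a 4x4 F32 tensor into a flat 16-element vector.
        TensorInfo src_info(TensorShape(4U, 4U), 1, DataType::F32);
        TensorInfo dst_info(TensorShape(16U), 1, DataType::F32);

        ARM_COMPUTE_ERROR_THROW_ON(cpu::CpuReshape::validate(&src_info, &dst_info));

        cpu::CpuReshape op;
        op.configure(&src_info, &dst_info);

        Tensor src, dst;
        src.allocator()->init(src_info);
        dst.allocator()->init(dst_info);
        src.allocator()->allocate();
        dst.allocator()->allocate();

        ITensorPack pack{{TensorType::ACL_SRC, &src}, {TensorType::ACL_DST, &dst}};
        op.run(pack); // first call prepares the kernel and resolves the split dimension
        op.run(pack); // later calls reuse the prepared state
    }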
diff --git a/src/cpu/operators/CpuReshape.h b/src/cpu/operators/CpuReshape.h
new file mode 100644
index 0000000000..33da792319
--- /dev/null
+++ b/src/cpu/operators/CpuReshape.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_RESHAPE_H
+#define ARM_COMPUTE_CPU_RESHAPE_H
+
+#include "arm_compute/core/Window.h"
+
+#include "src/cpu/ICpuOperator.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+/** Basic function to run @ref kernels::CpuReshapeKernel */
+class CpuReshape : public ICpuOperator
+{
+public:
+ /** Configure operator for a given list of arguments
+ *
+ * @param[in] src Source tensor info. Data type supported: All
+ * @param[out] dst Destination info. Data type supported: Same as @p src
+ */
+ void configure(const ITensorInfo *src, ITensorInfo *dst);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref CpuReshape::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
+
+ // Inherited methods overridden:
+ void run(ITensorPack &tensors) override;
+
+private:
+ bool _is_prepared{false};
+};
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_RESHAPE_H */
diff --git a/src/cpu/operators/CpuScale.cpp b/src/cpu/operators/CpuScale.cpp
new file mode 100644
index 0000000000..7df9296931
--- /dev/null
+++ b/src/cpu/operators/CpuScale.cpp
@@ -0,0 +1,289 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/operators/CpuScale.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+#include "src/common/utils/Log.h"
+#include "src/core/utils/ScaleUtils.h"
+#include "src/cpu/kernels/CpuScaleKernel.h"
+#include "support/Rounding.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace
+{
+void precompute_dx_dy_offsets(
+ ITensor *dx, ITensor *dy, ITensor *offsets, float wr, float hr, SamplingPolicy sampling_policy, bool align_corners)
+{
+ ARM_COMPUTE_ERROR_ON(offsets == nullptr);
+ float sampling_offset = 0.0f;
+ if (sampling_policy == SamplingPolicy::CENTER)
+ {
+ sampling_offset = 0.5f;
+ }
+
+ Window win;
+ win.set(Window::DimX, Window::Dimension(0, offsets->info()->dimension(0), 1));
+ win.set(Window::DimY, Window::Dimension(0, offsets->info()->dimension(1), 1));
+
+ if (dx != nullptr && dy != nullptr)
+ {
+ // Pre-compute the offset and pixel's distance for BILINEAR interpolation
+ Iterator offsets_it(offsets, win);
+ Iterator dx_it(dx, win);
+ Iterator dy_it(dy, win);
+
+ execute_window_loop(
+ win,
+ [&](const Coordinates &id)
+ {
+ const float in_x = (id.x() + sampling_offset) * wr - sampling_offset;
+ const float in_y = (id.y() + sampling_offset) * hr - sampling_offset;
+ const int in_xi = std::floor(in_x);
+ const int in_yi = std::floor(in_y);
+
+ *reinterpret_cast<int32_t *>(offsets_it.ptr()) = in_xi;
+ *reinterpret_cast<float *>(dx_it.ptr()) = in_x - in_xi;
+ *reinterpret_cast<float *>(dy_it.ptr()) = in_y - in_yi;
+ },
+ offsets_it, dx_it, dy_it);
+ }
+ else
+ {
+ // Pre-compute the offset for NEAREST interpolation
+ Iterator offsets_it(offsets, win);
+
+ execute_window_loop(
+ win,
+ [&](const Coordinates &id)
+ {
+ const float float_in_xi = (id.x() + sampling_offset) * wr;
+ const auto in_xi = static_cast<size_t>(
+ align_corners ? arm_compute::utils::rounding::round_half_away_from_zero(float_in_xi)
+ : std::floor(float_in_xi));
+ *reinterpret_cast<int32_t *>(offsets_it.ptr()) = in_xi;
+ },
+ offsets_it);
+ }
+}
+} // namespace
+
+void CpuScale::configure(ITensorInfo *src, ITensorInfo *dst, const ScaleKernelInfo &info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+ ARM_COMPUTE_ERROR_THROW_ON(CpuScale::validate(src, dst, info));
+ ARM_COMPUTE_LOG_PARAMS(src, dst, info);
+
+ _scale_info = info;
+ _is_prepared = false;
+
+ // Get data layout and width/height indices
+ _data_layout = _scale_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : _scale_info.data_layout;
+ const int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
+ const int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
+
+ // Compute the ratio between source width/height and destination width/height
+ const bool is_align_corners_used =
+ _scale_info.align_corners &&
+ arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(_scale_info.sampling_policy);
+ const auto wr = arm_compute::scale_utils::calculate_resize_ratio(src->dimension(idx_width),
+ dst->dimension(idx_width), is_align_corners_used);
+ const auto hr = arm_compute::scale_utils::calculate_resize_ratio(src->dimension(idx_height),
+ dst->dimension(idx_height), is_align_corners_used);
+
+ // Area interpolation behaves as Nearest Neighbour in case of up-sampling
+ InterpolationPolicy policy_to_use =
+ (_scale_info.interpolation_policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f)
+ ? InterpolationPolicy::NEAREST_NEIGHBOR
+ : _scale_info.interpolation_policy;
+
+ // Get the tensor shape
+ TensorShape shape(dst->dimension(idx_width));
+ shape.set(1, dst->dimension(idx_height), false);
+
+ TensorInfo tensor_info_offsets(shape, Format::S32);
+ TensorInfo tensor_info_dxdy(shape, Format::F32);
+
+ auto dx = std::make_unique<TensorInfo>(tensor_info_dxdy);
+ auto dy = std::make_unique<TensorInfo>(tensor_info_dxdy);
+ auto offsets = std::make_unique<TensorInfo>(tensor_info_offsets);
+ auto scale_kernel = std::make_unique<kernels::CpuScaleKernel>();
+ switch (policy_to_use)
+ {
+ case InterpolationPolicy::NEAREST_NEIGHBOR:
+ {
+ scale_kernel->configure(src, nullptr, nullptr, offsets.get(), dst, info);
+ break;
+ }
+ case InterpolationPolicy::BILINEAR:
+ {
+ scale_kernel->configure(src, dx.get(), dy.get(), offsets.get(), dst, info);
+ break;
+ }
+ case InterpolationPolicy::AREA:
+ {
+ scale_kernel->configure(src, nullptr, nullptr, nullptr, dst, info);
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Unsupported interpolation mode");
+ }
+ _kernel = std::move(scale_kernel);
+}
+
+Status CpuScale::validate(const ITensorInfo *src, const ITensorInfo *dst, const ScaleKernelInfo &info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON(info.sampling_policy != SamplingPolicy::CENTER &&
+ info.sampling_policy != SamplingPolicy::TOP_LEFT);
+
+ ITensorInfo *offsets = nullptr;
+ ITensorInfo *dx = nullptr;
+ ITensorInfo *dy = nullptr;
+
+ // Get data layout and width/height indices
+ const DataLayout data_layout = info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : info.data_layout;
+ const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+
+ // Compute the ratio between source width/height and destination width/height
+ const bool is_align_corners_used =
+ info.align_corners && arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(info.sampling_policy);
+ const auto wr = arm_compute::scale_utils::calculate_resize_ratio(src->dimension(idx_width),
+ dst->dimension(idx_width), is_align_corners_used);
+ const auto hr = arm_compute::scale_utils::calculate_resize_ratio(src->dimension(idx_height),
+ dst->dimension(idx_height), is_align_corners_used);
+
+ // Area interpolation behaves as Nearest Neighbour in case of up-sampling
+ InterpolationPolicy policy_to_use =
+ (info.interpolation_policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f)
+ ? InterpolationPolicy::NEAREST_NEIGHBOR
+ : info.interpolation_policy;
+
+    // Get the tensor shape of auxiliary buffers
+ const TensorShape shape(dst->dimension(idx_width), dst->dimension(idx_height));
+ TensorInfo tensor_info_offsets(shape, Format::S32);
+ TensorInfo tensor_info_dx(shape, Format::F32);
+ TensorInfo tensor_info_dy(shape, Format::F32);
+ switch (policy_to_use)
+ {
+ case InterpolationPolicy::NEAREST_NEIGHBOR:
+ offsets = &tensor_info_offsets;
+ break;
+ case InterpolationPolicy::BILINEAR:
+ offsets = &tensor_info_offsets;
+ dx = &tensor_info_dx;
+ dy = &tensor_info_dy;
+ break;
+ default:
+ break;
+ }
+
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ kernels::CpuScaleKernel::validate(src->clone().get(), dx, dy, offsets, dst->clone().get(), info));
+ return Status{};
+}
+
+void CpuScale::prepare(ITensorPack &tensors)
+{
+ if (!_is_prepared)
+ {
+ _is_prepared = true;
+ const auto src = tensors.get_const_tensor(TensorType::ACL_SRC);
+ auto dst = tensors.get_tensor(TensorType::ACL_DST);
+ auto dx = tensors.get_tensor(TensorType::ACL_INT_0);
+ auto dy = tensors.get_tensor(TensorType::ACL_INT_1);
+ auto offsets = tensors.get_tensor(TensorType::ACL_INT_2);
+
+ // Get data layout and width/height indices
+ const int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
+ const int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
+
+ // Compute the ratio between source width/height and destination width/height
+ const bool is_align_corners_used =
+ _scale_info.align_corners &&
+ arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(_scale_info.sampling_policy);
+ const auto wr = arm_compute::scale_utils::calculate_resize_ratio(
+ src->info()->dimension(idx_width), dst->info()->dimension(idx_width), is_align_corners_used);
+ const auto hr = arm_compute::scale_utils::calculate_resize_ratio(
+ src->info()->dimension(idx_height), dst->info()->dimension(idx_height), is_align_corners_used);
+
+ // Area interpolation behaves as Nearest Neighbour in case of up-sampling
+ InterpolationPolicy policy_to_use =
+ (_scale_info.interpolation_policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f)
+ ? InterpolationPolicy::NEAREST_NEIGHBOR
+ : _scale_info.interpolation_policy;
+ const SamplingPolicy sampling_policy = _scale_info.sampling_policy;
+
+ bool precompute_indices_weights = arm_compute::scale_utils::is_precomputation_required(
+ _data_layout, src->info()->data_type(), policy_to_use, _scale_info.border_mode);
+
+ if (precompute_indices_weights)
+ {
+ switch (policy_to_use)
+ {
+ case InterpolationPolicy::NEAREST_NEIGHBOR:
+ {
+ // Pre-compute offsets for nearest interpolation
+ precompute_dx_dy_offsets(nullptr, nullptr, offsets, wr, hr, sampling_policy, is_align_corners_used);
+ break;
+ }
+ case InterpolationPolicy::BILINEAR:
+ {
+ // Pre-compute dx, dy and offsets for bilinear interpolation
+ precompute_dx_dy_offsets(dx, dy, offsets, wr, hr, sampling_policy, is_align_corners_used);
+ break;
+ }
+ case InterpolationPolicy::AREA:
+ {
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Unsupported interpolation mode");
+ }
+ }
+ else
+ {
+ if (policy_to_use != InterpolationPolicy::NEAREST_NEIGHBOR &&
+ policy_to_use != InterpolationPolicy::BILINEAR && policy_to_use != InterpolationPolicy::AREA)
+ {
+ ARM_COMPUTE_ERROR("Unsupported interpolation mode");
+ }
+ }
+ }
+}
+
+void CpuScale::run(ITensorPack &tensors)
+{
+ ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided");
+ prepare(tensors);
+ NEScheduler::get().schedule_op(_kernel.get(), Window::DimY, _kernel->window(), tensors);
+}
+} // namespace cpu
+} // namespace arm_compute
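
Note: the bilinear branch of precompute_dx_dy_offsets() maps each destination column x to the source coordinate (x + s) * wr - s, storing the integer part in 'offsets' and the fractional part in 'dx' (and likewise for rows/'dy'). The standalone sketch below reproduces that arithmetic with illustrative values only.

    #include <cmath>
    #include <cstdio>

    // Per-column mapping used by the bilinear branch of precompute_dx_dy_offsets()
    // for wr = 2 (downscale by 2) and SamplingPolicy::CENTER.
    int main()
    {
        const float wr              = 2.0f;
        const float sampling_offset = 0.5f; // CENTER sampling

        for (int x = 0; x < 4; ++x)
        {
            const float in_x  = (x + sampling_offset) * wr - sampling_offset; // source coordinate
            const int   in_xi = static_cast<int>(std::floor(in_x));           // stored in 'offsets'
            const float dx    = in_x - in_xi;                                 // stored in 'dx'
            std::printf("dst x=%d -> in_x=%.2f offset=%d dx=%.2f\n", x, in_x, in_xi, dx);
        }
        return 0;
    }

With a 2x downscale and centre sampling every destination pixel lands halfway between two source pixels, so dx is 0.5 for every column, which is the expected result.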
diff --git a/src/cpu/operators/CpuScale.h b/src/cpu/operators/CpuScale.h
new file mode 100644
index 0000000000..c12a8e733a
--- /dev/null
+++ b/src/cpu/operators/CpuScale.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2021-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_SCALE_H
+#define ARM_COMPUTE_CPU_SCALE_H
+
+#include "arm_compute/core/experimental/Types.h"
+#include "arm_compute/core/ITensorInfo.h"
+#include "arm_compute/core/KernelDescriptors.h"
+
+#include "src/cpu/ICpuKernel.h"
+#include "src/cpu/ICpuOperator.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace cpu
+{
+/** Basic function to compute Scale */
+class CpuScale : public ICpuOperator
+{
+public:
+ /** Initialize the function's source, destination, interpolation type and border_mode.
+ *
+ * @param[in, out] src Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/U8/S16/F16/F32. (Written to only for @p border_mode != UNDEFINED)
+ * @param[out] dst Destination tensor info. Data type supported: Same as @p src. All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
+ * @param[in] info @ref ScaleKernelInfo to be used for configuration
+ *
+ * @note Using S8 data type only supports NHWC, @p border_mode Replicate, and @p policy Bilinear
+ */
+ void configure(ITensorInfo *src, ITensorInfo *dst, const ScaleKernelInfo &info);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref CpuScale::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const ScaleKernelInfo &info);
+
+ // Inherited methods overridden:
+ void prepare(ITensorPack &tensors) override;
+ void run(ITensorPack &tensors) override;
+
+private:
+ ScaleKernelInfo _scale_info{InterpolationPolicy::NEAREST_NEIGHBOR, BorderMode::UNDEFINED};
+ DataLayout _data_layout{DataLayout::UNKNOWN};
+ bool _is_prepared{false};
+};
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_SCALE_H */
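
Note: unlike the simpler operators in this patch, CpuScale expects the caller to own the dx/dy/offsets buffers and pass them as ACL_INT_0..2 (cf. prepare() in CpuScale.cpp). A hedged sketch with illustrative shapes; scale_example() is not part of this patch.

    #include "arm_compute/core/Error.h"
    #include "arm_compute/core/ITensorPack.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/Tensor.h"

    #include "src/cpu/operators/CpuScale.h"

    using namespace arm_compute;

    void scale_example()
    {
        // Upscale an 8x8 F32 image to 16x16 with bilinear interpolation.
        TensorInfo            src_info(TensorShape(8U, 8U), 1, DataType::F32);
        TensorInfo            dst_info(TensorShape(16U, 16U), 1, DataType::F32);
        const ScaleKernelInfo info{InterpolationPolicy::BILINEAR, BorderMode::REPLICATE};

        ARM_COMPUTE_ERROR_THROW_ON(cpu::CpuScale::validate(&src_info, &dst_info, info));

        cpu::CpuScale op;
        op.configure(&src_info, &dst_info, info);

        // Auxiliary buffers for the precomputed indices/weights, one element per dst pixel.
        const TensorShape aux_shape(dst_info.dimension(0), dst_info.dimension(1));
        Tensor            src, dst, dx, dy, offsets;
        src.allocator()->init(src_info);
        dst.allocator()->init(dst_info);
        dx.allocator()->init(TensorInfo(aux_shape, Format::F32));
        dy.allocator()->init(TensorInfo(aux_shape, Format::F32));
        offsets.allocator()->init(TensorInfo(aux_shape, Format::S32));
        src.allocator()->allocate();
        dst.allocator()->allocate();
        dx.allocator()->allocate();
        dy.allocator()->allocate();
        offsets.allocator()->allocate();

        ITensorPack pack{{TensorType::ACL_SRC, &src},  {TensorType::ACL_DST, &dst},
                         {TensorType::ACL_INT_0, &dx}, {TensorType::ACL_INT_1, &dy},
                         {TensorType::ACL_INT_2, &offsets}};
        op.run(pack); // run() calls prepare() once to fill dx/dy/offsets, then schedules the kernel
    }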
diff --git a/src/cpu/operators/CpuSoftmax.cpp b/src/cpu/operators/CpuSoftmax.cpp
new file mode 100644
index 0000000000..fecee7d765
--- /dev/null
+++ b/src/cpu/operators/CpuSoftmax.cpp
@@ -0,0 +1,143 @@
+/*
+ * Copyright (c) 2021, 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/operators/CpuSoftmax.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+#include "src/common/utils/Log.h"
+#include "src/core/helpers/MemoryHelpers.h"
+#include "src/core/helpers/SoftmaxHelpers.h"
+#include "src/cpu/kernels/CpuSoftmaxKernel.h"
+#include "src/cpu/utils/CpuAuxTensorHandler.h"
+
+using namespace arm_compute::experimental;
+
+namespace arm_compute
+{
+namespace cpu
+{
+CpuSoftmaxGeneric::CpuSoftmaxGeneric() : _softmax_kernel(), _tmp(), _aux_mem(InternalTensorIdx::COUNT)
+{
+}
+
+void CpuSoftmaxGeneric::configure(const ITensorInfo *src, ITensorInfo *dst, float beta, int32_t axis, bool is_log)
+{
+ // Perform validation step
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+ ARM_COMPUTE_ERROR_THROW_ON(CpuSoftmaxGeneric::validate(src, dst, beta, axis));
+ ARM_COMPUTE_LOG_PARAMS(src, dst, beta, axis);
+
+ const unsigned int actual_axis =
+ static_cast<unsigned int>(wrap_around(axis, static_cast<int32_t>(src->num_dimensions())));
+
+ _axis = actual_axis;
+
+ const ITensorInfo *tmp_input = src;
+
+ TensorInfo tensor_info_tmp;
+ if (is_data_type_quantized_asymmetric(src->data_type()))
+ {
+ // Create intermediate tensors shapes
+ const TensorInfo input_info = tmp_input->clone()->reset_padding().set_is_resizable(true);
+ tensor_info_tmp = input_info.clone()->set_data_type(DataType::F32);
+ }
+
+ // Init intermediate tensors
+ _tmp = TensorInfo(tensor_info_tmp);
+
+ // Configure kernels
+ auto sm = std::make_unique<kernels::CpuSoftmaxKernel>();
+
+ // Softmax 2D case
+ sm->configure(tmp_input, dst, beta, is_log, actual_axis, &_tmp);
+
+ _softmax_kernel = std::move(sm);
+
+ if (_tmp.total_size() > 0)
+ {
+ _aux_mem[InternalTensorIdx::TMP] =
+ MemoryInfo(offset_int_vec(InternalTensorIdx::TMP), MemoryLifetime::Temporary, _tmp.total_size());
+ }
+}
+
+Status
+CpuSoftmaxGeneric::validate(const ITensorInfo *src, const ITensorInfo *dst, float beta, int32_t axis, bool is_log)
+{
+ // Perform validation step
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->num_dimensions() > 4, "Only up to 4 dimensions are supported");
+ ARM_COMPUTE_UNUSED(beta);
+ ARM_COMPUTE_RETURN_ERROR_ON(axis < static_cast<int32_t>(-src->num_dimensions()) ||
+ static_cast<int32_t>(src->num_dimensions()) <= axis);
+
+ // Create intermediate tensor info
+ TensorInfo tensor_info_tmp;
+
+ if (is_data_type_quantized_asymmetric(src->data_type()))
+ {
+ tensor_info_tmp = src->clone()->set_data_type(DataType::F32).set_is_resizable(true);
+ }
+ const unsigned int actual_axis =
+ static_cast<unsigned int>(wrap_around(axis, static_cast<int32_t>(src->num_dimensions())));
+
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ kernels::CpuSoftmaxKernel::validate(src, dst, beta, actual_axis, is_log, &tensor_info_tmp));
+
+ return Status{};
+}
+
+void CpuSoftmaxGeneric::run(ITensorPack &tensors)
+{
+ ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided");
+
+ auto src = tensors.get_const_tensor(TensorType::ACL_SRC);
+ auto dst = tensors.get_tensor(TensorType::ACL_DST);
+
+ CpuAuxTensorHandler tmp(offset_int_vec(InternalTensorIdx::TMP), _tmp, tensors, true);
+
+ ITensorPack softmax_pack;
+
+ softmax_pack = {{TensorType::ACL_SRC_0, src}, {TensorType::ACL_DST_0, dst}, {TensorType::ACL_DST_1, tmp.get()}};
+
+ if (_axis == 0)
+ {
+ NEScheduler::get().schedule_op(_softmax_kernel.get(), Window::DimY, _softmax_kernel->window(), softmax_pack);
+ }
+ else
+ {
+ NEScheduler::get().schedule_op(_softmax_kernel.get(), Window::DimX, _softmax_kernel->window(), softmax_pack);
+ }
+}
+
+experimental::MemoryRequirements CpuSoftmaxGeneric::workspace() const
+{
+ return _aux_mem;
+}
+
+} // namespace cpu
+} // namespace arm_compute
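
Note: for float inputs the intermediate F32 buffer is never requested, so the operator can be driven with only src/dst; for quantized inputs the caller must additionally bind the TMP buffer described by workspace(). A hedged sketch with illustrative shapes follows.

    #include "arm_compute/core/Error.h"
    #include "arm_compute/core/ITensorPack.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/Tensor.h"

    #include "src/cpu/operators/CpuSoftmax.h"

    using namespace arm_compute;

    void softmax_example()
    {
        // Softmax over axis 0 of a 5x4 F32 tensor, i.e. over each row of 5 values.
        TensorInfo src_info(TensorShape(5U, 4U), 1, DataType::F32);
        TensorInfo dst_info(TensorShape(5U, 4U), 1, DataType::F32);

        ARM_COMPUTE_ERROR_THROW_ON(cpu::CpuSoftmaxGeneric::validate(&src_info, &dst_info, /*beta=*/1.0f, /*axis=*/0));

        cpu::CpuSoftmaxGeneric op;
        op.configure(&src_info, &dst_info, 1.0f, 0, /*is_log=*/false);

        Tensor src, dst;
        src.allocator()->init(src_info);
        dst.allocator()->init(dst_info);
        src.allocator()->allocate();
        dst.allocator()->allocate();

        // With F32 data op.workspace() reports no intermediate tensor, so src/dst suffice.
        ITensorPack pack{{TensorType::ACL_SRC, &src}, {TensorType::ACL_DST, &dst}};
        op.run(pack);
    }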
diff --git a/src/cpu/operators/CpuSoftmax.h b/src/cpu/operators/CpuSoftmax.h
new file mode 100644
index 0000000000..6ba3476eff
--- /dev/null
+++ b/src/cpu/operators/CpuSoftmax.h
@@ -0,0 +1,103 @@
+/*
+ * Copyright (c) 2021-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_CPU_OPERATORS_CPUSOFTMAX_H
+#define ACL_SRC_CPU_OPERATORS_CPUSOFTMAX_H
+
+#include "arm_compute/core/experimental/Types.h"
+#include "arm_compute/core/TensorInfo.h"
+
+#include "src/cpu/ICpuKernel.h"
+#include "src/cpu/ICpuOperator.h"
+#include "src/cpu/operators/CpuPermute.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace cpu
+{
+class CpuSoftmaxKernel;
+
+/** Basic function to compute a SoftmaxLayer and a Log SoftmaxLayer.
+ *
+ * Softmax is calculated by :
+ * @f[ out = exp((x - max(x)) * beta) / sum(exp((x - max(x)) * beta)) @f]
+ *
+ * Log Softmax is calculated by :
+ * @f[ out = (x - max(x)) * beta - log(\sum{e^{(x - max(x)) * beta}}) @f]
+ *
+ * This function runs the following kernel:
+ * -# @ref kernels::CpuSoftmaxKernel (the kernel handles any axis directly, so no separate permute is run)
+ */
+class CpuSoftmaxGeneric : public ICpuOperator
+{
+public:
+ CpuSoftmaxGeneric();
+ /** Set the input and output tensors.
+ *
+ * @param[in,out] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+ * @param[out]    dst    Destination tensor info. Data types supported: same as @p src.
+ * @param[in] beta (Optional) A scaling factor for the exponent.
+ * @param[in] axis (Optional) The dimension in which to apply the function. E.g. for input of shape 4x5x6 and
+ * axis=1, softmax will be applied to 4x6=24 vectors of size 5. Defaults to 0
+ * @param[in] is_log True if the operation is log-softmax
+ */
+ void configure(const ITensorInfo *src, ITensorInfo *dst, float beta = 1.0f, int32_t axis = 0, bool is_log = false);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref CpuSoftmaxGeneric::configure()
+ *
+ * @return a status
+ */
+ static Status
+ validate(const ITensorInfo *src, const ITensorInfo *dst, float beta = 1.0f, int32_t axis = 0, bool is_log = false);
+
+ // Inherited methods overridden:
+ void run(ITensorPack &tensors) override;
+ experimental::MemoryRequirements workspace() const override;
+
+private:
+ enum InternalTensorIdx
+ {
+ TMP = 0,
+ PERMUTED_SRC,
+ PERMUTED_DST,
+ COUNT
+ };
+
+ std::unique_ptr<ICPPKernel> _softmax_kernel;
+
+ TensorInfo _tmp;
+
+ experimental::MemoryRequirements _aux_mem{};
+
+ unsigned int _axis = 0;
+};
+
+} // namespace cpu
+} // namespace arm_compute
+#endif // ACL_SRC_CPU_OPERATORS_CPUSOFTMAX_H
diff --git a/src/cpu/operators/CpuSub.cpp b/src/cpu/operators/CpuSub.cpp
new file mode 100644
index 0000000000..7d27efbc96
--- /dev/null
+++ b/src/cpu/operators/CpuSub.cpp
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2021-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/operators/CpuSub.h"
+
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+#include "src/common/utils/Log.h"
+#include "src/cpu/kernels/CpuSubKernel.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void CpuSub::configure(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ ITensorInfo *dst,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info)
+{
+ ARM_COMPUTE_UNUSED(act_info);
+ ARM_COMPUTE_LOG_PARAMS(src0, src1, dst, policy);
+ auto k = std::make_unique<kernels::CpuSubKernel>();
+ k->configure(src0, src1, dst, policy);
+ _kernel = std::move(k);
+}
+
+Status CpuSub::validate(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *dst,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled());
+ return kernels::CpuSubKernel::validate(src0, src1, dst, policy);
+}
+
+void CpuSub::run(ITensorPack &tensors)
+{
+ const auto split_dimension = static_cast<kernels::CpuSubKernel *>(_kernel.get())->get_split_dimension();
+
+ NEScheduler::get().schedule_op(_kernel.get(), split_dimension, _kernel->window(), tensors);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/operators/CpuSub.h b/src/cpu/operators/CpuSub.h
new file mode 100644
index 0000000000..d1782a1d3c
--- /dev/null
+++ b/src/cpu/operators/CpuSub.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_SUB_H
+#define ARM_COMPUTE_CPU_SUB_H
+
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+
+#include "src/cpu/ICpuOperator.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+/** Basic function to run @ref kernels::CpuSubKernel */
+class CpuSub : public ICpuOperator
+{
+public:
+ /** Initialise the kernel's inputs, dst and conversion policy.
+ *
+ * Valid configurations (src0,src1) -> dst :
+ *
+ * - (U8,U8) -> U8
+ * - (QASYMM8, QASYMM8) -> QASYMM8
+ * - (QASYMM8_SIGNED, QASYMM8_SIGNED) -> QASYMM8_SIGNED
+ * - (S16,S16) -> S16
+ * - (S32,S32) -> S32
+ * - (F16,F16) -> F16
+ * - (F32,F32) -> F32
+ *
+ * @param[in] src0 First tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/S32/F16/F32
+ * @param[in] src1 Second tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/S32/F16/F32
+ * @param[out] dst Output tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/S32/F16/F32
+ * @param[in] policy Policy to use to handle overflow. Convert policy cannot be WRAP if datatype is quantized.
+ * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported.
+ */
+ void configure(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ ITensorInfo *dst,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref CpuSub::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *dst,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+ // Inherited methods overridden:
+ void run(ITensorPack &tensors) override;
+};
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_SUB_H */
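
Note: a hedged sketch of one of the valid configurations listed above, (F32,F32) -> F32 with a SATURATE policy; sub_example() and the shapes are illustrative only.

    #include "arm_compute/core/Error.h"
    #include "arm_compute/core/ITensorPack.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/Tensor.h"

    #include "src/cpu/operators/CpuSub.h"

    using namespace arm_compute;

    void sub_example()
    {
        TensorInfo a_info(TensorShape(16U), 1, DataType::F32);
        TensorInfo b_info(TensorShape(16U), 1, DataType::F32);
        TensorInfo out_info(TensorShape(16U), 1, DataType::F32);

        ARM_COMPUTE_ERROR_THROW_ON(cpu::CpuSub::validate(&a_info, &b_info, &out_info, ConvertPolicy::SATURATE));

        cpu::CpuSub op;
        op.configure(&a_info, &b_info, &out_info, ConvertPolicy::SATURATE);

        Tensor a, b, out;
        a.allocator()->init(a_info);
        b.allocator()->init(b_info);
        out.allocator()->init(out_info);
        a.allocator()->allocate();
        b.allocator()->allocate();
        out.allocator()->allocate();

        ITensorPack pack{{TensorType::ACL_SRC_0, &a}, {TensorType::ACL_SRC_1, &b}, {TensorType::ACL_DST, &out}};
        op.run(pack); // out = a - b, scheduled along the kernel's preferred split dimension
    }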
diff --git a/src/cpu/operators/CpuTranspose.cpp b/src/cpu/operators/CpuTranspose.cpp
new file mode 100644
index 0000000000..ea548e0511
--- /dev/null
+++ b/src/cpu/operators/CpuTranspose.cpp
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/operators/CpuTranspose.h"
+
+#include "src/common/utils/Log.h"
+#include "src/cpu/kernels/CpuTransposeKernel.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void CpuTranspose::configure(const ITensorInfo *src, ITensorInfo *dst)
+{
+ ARM_COMPUTE_LOG_PARAMS(src, dst);
+ auto k = std::make_unique<kernels::CpuTransposeKernel>();
+ k->configure(src, dst);
+ _kernel = std::move(k);
+}
+
+Status CpuTranspose::validate(const ITensorInfo *src, const ITensorInfo *dst)
+{
+ return kernels::CpuTransposeKernel::validate(src, dst);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/operators/CpuTranspose.h b/src/cpu/operators/CpuTranspose.h
new file mode 100644
index 0000000000..8934481ef6
--- /dev/null
+++ b/src/cpu/operators/CpuTranspose.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_TRANSPOSE_H
+#define ARM_COMPUTE_CPU_TRANSPOSE_H
+
+#include "src/cpu/ICpuOperator.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+/** Basic function to run @ref kernels::CpuTransposeKernel */
+class CpuTranspose : public ICpuOperator
+{
+public:
+ /** Configure operator for a given list of arguments
+ *
+ * @param[in] src Source tensor to permute. Data types supported: All
+ * @param[out] dst  Destination tensor. Data types supported: Same as @p src
+ */
+ void configure(const ITensorInfo *src, ITensorInfo *dst);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref CpuTranspose::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
+};
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_TRANSPOSE_H */
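
Note: this class declares no run() of its own and relies on the single-kernel run() inherited from the base operator. A hedged sketch transposing a small F32 matrix, under the same assumptions as the other sketches in this patch:

    #include "arm_compute/core/Error.h"
    #include "arm_compute/core/ITensorPack.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/Tensor.h"

    #include "src/cpu/operators/CpuTranspose.h"

    using namespace arm_compute;

    void transpose_example()
    {
        // Transpose a 3x5 matrix into a 5x3 one.
        TensorInfo src_info(TensorShape(3U, 5U), 1, DataType::F32);
        TensorInfo dst_info(TensorShape(5U, 3U), 1, DataType::F32);

        ARM_COMPUTE_ERROR_THROW_ON(cpu::CpuTranspose::validate(&src_info, &dst_info));

        cpu::CpuTranspose op;
        op.configure(&src_info, &dst_info);

        Tensor src, dst;
        src.allocator()->init(src_info);
        dst.allocator()->init(dst_info);
        src.allocator()->allocate();
        dst.allocator()->allocate();

        ITensorPack pack{{TensorType::ACL_SRC, &src}, {TensorType::ACL_DST, &dst}};
        op.run(pack); // inherited run() schedules the single CpuTransposeKernel
    }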
diff --git a/src/cpu/operators/CpuWinogradConv2d.cpp b/src/cpu/operators/CpuWinogradConv2d.cpp
new file mode 100644
index 0000000000..7d81aee0e9
--- /dev/null
+++ b/src/cpu/operators/CpuWinogradConv2d.cpp
@@ -0,0 +1,478 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/operators/CpuWinogradConv2d.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/FunctionDescriptors.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+#include "src/common/utils/Log.h"
+#include "src/core/CPP/Validate.h"
+#include "src/core/helpers/MemoryHelpers.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/kernels/assembly/winograd.hpp"
+#include "src/core/NEON/kernels/convolution/common/tensor.hpp"
+#include "src/core/NEON/kernels/convolution/common/utils.hpp"
+#include "src/core/utils/AssemblyUtils.h"
+#include "src/cpu/kernels/assembly/arm_gemm.hpp"
+#include "src/cpu/kernels/CpuWinogradConv2dKernel.h"
+#include "src/cpu/operators/CpuActivation.h"
+#include "src/cpu/operators/CpuPermute.h"
+#include "src/cpu/utils/CpuAuxTensorHandler.h"
+#include "support/Cast.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+using namespace arm_compute::experimental;
+using namespace arm_compute::utils::cast;
+
+namespace
+{
+inline Tensor4DShape internal_get_shape(const ITensorInfo *in)
+{
+ const DataLayout data_layout = in->data_layout();
+ const int in_width = in->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH));
+ const int in_height = in->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT));
+ const int in_channels = in->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL));
+ const int in_batches = in->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES));
+
+ return Tensor4DShape{in_batches, in_height, in_width, in_channels};
+}
+
+Status validate_arguments(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const PadStrideInfo &conv_info)
+{
+ ARM_COMPUTE_UNUSED(dst, weights);
+ ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.stride().first != 1 || conv_info.stride().second != 1,
+ "Winograd layer only supports unit strides.");
+ if (biases != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, biases);
+ ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
+ }
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights);
+ return Status{};
+}
+
+bool get_winograd_kernel_implementation(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info,
+ bool enable_fast_math,
+ arm_conv::winograd::WinogradImpl *winograd_impl,
+ std::unique_ptr<arm_conv::ConvolutionArgs> &conv_args)
+{
+ arm_conv::winograd::WinogradConfig winograd_cfg;
+ arm_gemm::GemmConfig cfg;
+
+ const DataType data_type = src->data_type();
+ Tensor4DShape in_shape{internal_get_shape(src)};
+ Tensor4DShape out_shape{internal_get_shape(dst)};
+ Tensor4DShape kernel_shape{internal_get_shape(weights)};
+ uint32_t nthreads = NEScheduler::get().num_threads();
+ // Get configuration arguments for Winograd
+ winograd_cfg.output_rows = 0;
+ winograd_cfg.output_cols = 0;
+ conv_args = std::make_unique<arm_conv::ConvolutionArgs>(
+ in_shape.n_batches,
+ arm_conv::Shape2D{static_cast<uint32_t>(in_shape.n_rows), static_cast<uint32_t>(in_shape.n_cols)},
+ in_shape.n_channels, conv_info.pad_top(), conv_info.pad_left(),
+ arm_conv::Shape2D{static_cast<uint32_t>(out_shape.n_rows), static_cast<uint32_t>(out_shape.n_cols)},
+ out_shape.n_channels,
+ arm_conv::Shape2D{static_cast<uint32_t>(kernel_shape.n_rows), static_cast<uint32_t>(kernel_shape.n_cols)},
+ assembly_utils::map_to_arm_gemm_activation(act_info));
+
+ bool success = false;
+ if (data_type == DataType::F32)
+ {
+ success = arm_conv::winograd::get_implementation<float>(*winograd_impl, &CPUInfo::get(), *conv_args, nthreads,
+ enable_fast_math, &winograd_cfg, nullptr);
+ }
+#if defined(__aarch64__) && defined(ENABLE_FP16_KERNELS)
+ else if (data_type == DataType::F16)
+ {
+ success = arm_conv::winograd::get_implementation<__fp16>(*winograd_impl, &CPUInfo::get(), *conv_args, nthreads,
+ enable_fast_math, &winograd_cfg, nullptr);
+ }
+#endif // defined(__aarch64__) && defined(ENABLE_FP16_KERNELS)
+ else
+ {
+ success = false;
+ }
+ return success;
+}
+inline bool fuse_function_supported(const ActivationLayerInfo &act_info)
+{
+ return act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU ||
+ act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU;
+}
+} // namespace
+
+CpuWinogradConv2d::CpuWinogradConv2d()
+
+ : _gemm_function(std::make_unique<CpuGemm>()),
+ _activation_func(std::make_unique<CpuActivation>()),
+ _transform_input_kernel(nullptr),
+ _transform_output_kernel(nullptr),
+ _permute_input(std::make_unique<CpuPermute>()),
+ _permute_output(std::make_unique<CpuPermute>()),
+ _permute_weights(std::make_unique<CpuPermute>()),
+ _aux_mem(AuxTensorIdx::Count),
+ _conv_args{nullptr},
+ _winograd_impl{},
+ _data_layout(),
+ _winograd_transformed_input{},
+ _winograd_transformed_output{},
+ _winograd_transformed_weights{},
+ _input_workspace(),
+ _output_workspace(),
+ _weights_hwio(),
+ _input_nhwc(),
+ _output_nhwc(),
+ _is_prepared{false},
+ _run_activation{false}
+{
+}
+
+CpuWinogradConv2d::~CpuWinogradConv2d() = default;
+
+void CpuWinogradConv2d::configure(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info,
+ bool enable_fast_math)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
+ ARM_COMPUTE_ERROR_THROW_ON(validate(src, weights, biases, dst, conv_info, act_info, enable_fast_math));
+ ARM_COMPUTE_LOG_PARAMS(src, weights, biases, dst, conv_info, act_info, enable_fast_math);
+ ARM_COMPUTE_UNUSED(biases);
+ const DataType data_type = src->data_type();
+ uint32_t nthreads = NEScheduler::get().num_threads();
+ _data_layout = src->data_layout();
+ const Tensor4DShape kernel_shape{internal_get_shape(weights)};
+
+ bool success = get_winograd_kernel_implementation(src, weights, dst, conv_info, act_info, enable_fast_math,
+ &_winograd_impl, _conv_args);
+
+ ARM_COMPUTE_EXIT_ON_MSG_VAR(!success, "Unsupported kernel size: %d x %d.\n", kernel_shape.n_rows,
+ kernel_shape.n_cols);
+ ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "Using input transform: %s\n",
+ _winograd_impl.input_transform->get_name().c_str());
+ ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "Using weight transform: %s\n",
+ _winograd_impl.input_transform->get_name().c_str());
+ ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "Using output transform: %s\n",
+ _winograd_impl.input_transform->get_name().c_str());
+
+ const bool has_impl = ((_winograd_impl.input_transform != nullptr) &&
+ (_winograd_impl.output_transform != nullptr) && (_winograd_impl.gemm_args != nullptr));
+ if (has_impl)
+ {
+ // Determine how much working space is required, allocate it.
+ const size_t input_workspace_size =
+ _winograd_impl.input_transform->get_working_space_size(*_conv_args, nthreads);
+ const size_t output_workspace_size =
+ _winograd_impl.output_transform->get_working_space_size(*_conv_args, nthreads);
+
+ TensorInfo input_workspace_info(TensorShape(input_workspace_size), 1, DataType::U8);
+ TensorInfo output_workspace_info(TensorShape(output_workspace_size), 1, DataType::U8);
+ _input_workspace = input_workspace_info;
+ _output_workspace = output_workspace_info;
+
+ const auto &wds = _winograd_impl.winograd_spec;
+
+ // Preparing winograd transformed input tensor
+ const size_t data_type_size = src->element_size();
+ const uint32_t m = _winograd_impl.gemm_args->_Msize; // Total number of tiles
+ const uint32_t k = _winograd_impl.gemm_args->_Ksize; // Input channels
+ const uint32_t n = _winograd_impl.gemm_args->_Nsize; // Output channels
+ const uint32_t n_gemms = _winograd_impl.gemm_args->_nmulti;
+ const uint32_t n_batches = _winograd_impl.gemm_args->_nbatches;
+ constexpr size_t storage_alignment = 64;
+
+ const TensorShape a_shape(k, m, n_batches, n_gemms);
+ Strides a_strides(data_type_size);
+ a_strides.set(1, data_type_size * _winograd_impl.winograd_spec.input_ld_row);
+ a_strides.set(2, data_type_size * _winograd_impl.winograd_spec.input_ld_batch);
+ a_strides.set(3, data_type_size * _winograd_impl.winograd_spec.input_ld_matrix);
+
+ const TensorShape b_shape(n, k, n_gemms);
+ Strides b_strides(data_type_size);
+ b_strides.set(1, data_type_size * _winograd_impl.winograd_spec.weight_ld_row);
+ b_strides.set(2, data_type_size * _winograd_impl.winograd_spec.weight_ld_matrix);
+
+ const TensorShape d_shape(n, m, n_batches, n_gemms);
+ Strides d_strides(data_type_size);
+ d_strides.set(1, data_type_size * _winograd_impl.winograd_spec.output_ld_row);
+ d_strides.set(2, data_type_size * _winograd_impl.winograd_spec.output_ld_batch);
+ d_strides.set(3, data_type_size * _winograd_impl.winograd_spec.output_ld_matrix);
+
+ TensorInfo a_info{};
+ TensorInfo b_info{};
+ TensorInfo d_info{};
+ a_info.init(a_shape, 1, data_type, a_strides, 0, wds.input_matrix_size_bytes);
+ b_info.init(b_shape, 1, data_type, b_strides, 0, wds.weight_matrix_size_bytes);
+ d_info.init(d_shape, 1, data_type, d_strides, 0, wds.output_matrix_size_bytes);
+
+ _winograd_transformed_input = a_info;
+ _winograd_transformed_weights = b_info;
+ _winograd_transformed_output = d_info;
+
+ PermutationVector weights_permutation_vector(3U, 0U, 1U, 2U);
+
+ // Configure the kernel to transform the input tensor from NCHW -> NHWC
+ if (_data_layout == DataLayout::NCHW)
+ {
+ _permute_input->configure(src, &_input_nhwc, PermutationVector(2U, 0U, 1U));
+ weights_permutation_vector = PermutationVector(3U, 2U, 0U, 1U);
+ }
+
+ // Re-order a weight tensor from [Output feature map x Input feature map x Height x Width] to [Height x Width x Input feature map x Output feature map]
+ _permute_weights->configure(weights, &_weights_hwio, weights_permutation_vector);
+
+ // Reorder the convoluted output to ACL's ordering NCHW
+ if (_data_layout == DataLayout::NCHW)
+ {
+ // configure and allocate dst tensor to be used to convert from winograd domain to spatial domain when calling to reshape_output()
+ TensorInfo info(TensorShape(dst->dimension(2), dst->dimension(0), dst->dimension(1), dst->dimension(3)), 1,
+ dst->data_type());
+ _output_nhwc = info;
+ _permute_output->configure(&_output_nhwc, dst, PermutationVector(1U, 2U, 0U));
+ }
+
+ // Configure input transform kernel
+ _transform_input_kernel =
+ std::make_unique<CpuWinogradConv2dTransformInputKernel>(_winograd_impl, *_conv_args, nthreads);
+
+ // Configure GEMM function
+ _gemm_function->configure(&_winograd_transformed_input, &_winograd_transformed_weights, nullptr,
+ &_winograd_transformed_output, 1.0f, 0.f);
+
+ // Configure output transform kernel
+ _transform_output_kernel =
+ std::make_unique<CpuWinogradConv2dTransformOutputKernel>(_winograd_impl, *_conv_args, nthreads);
+
+ //Configure Activation Layer
+ _run_activation = act_info.enabled() && !fuse_function_supported(act_info);
+ if (_run_activation)
+ {
+ _activation_func->configure(dst, nullptr, act_info);
+ }
+
+ const auto mm_mem_req = _gemm_function->workspace();
+ for (unsigned int slot = 0; slot < mm_mem_req.size(); ++slot)
+ {
+ _aux_mem[slot] = mm_mem_req[slot];
+ }
+
+ // Request temporary memory. Overlap memory needed for Input/Output transformations as they run on different non-overlapping time-steps.
+ _aux_mem[TransformedInput] = MemoryInfo(offset_int_vec(TransformedInput), MemoryLifetime::Temporary,
+ wds.input_matrix_size_bytes, storage_alignment);
+ _aux_mem[TransformedOutput] = MemoryInfo(offset_int_vec(TransformedOutput), MemoryLifetime::Temporary,
+ wds.output_matrix_size_bytes, storage_alignment);
+ _aux_mem[WorkspaceIO] = MemoryInfo(offset_int_vec(WorkspaceIO), MemoryLifetime::Temporary,
+ std::max(input_workspace_size, output_workspace_size));
+ _aux_mem[PermutedWeights] =
+ MemoryInfo(offset_int_vec(PermutedWeights), MemoryLifetime::Prepare, _weights_hwio.total_size());
+ _aux_mem[TransformedWeights] = MemoryInfo(offset_int_vec(TransformedWeights), MemoryLifetime::Persistent,
+ wds.weight_matrix_size_bytes, storage_alignment);
+ if (_data_layout == DataLayout::NCHW)
+ {
+ _aux_mem[PermutedInput].merge(offset_int_vec(PermutedInput), src->total_size());
+ _aux_mem[PermutedOutput].merge(offset_int_vec(PermutedOutput), dst->total_size());
+ }
+ }
+}
+Status CpuWinogradConv2d::validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info,
+ bool enable_fast_math)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, weights, biases, dst, conv_info));
+
+ // Disable winograd for fp16 if fast math is false.
+ if (!enable_fast_math)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F32);
+ }
+
+ const Tensor4DShape kernel_shape{internal_get_shape(weights)};
+ arm_conv::winograd::WinogradImpl winograd_impl{};
+
+ std::unique_ptr<arm_conv::ConvolutionArgs> conv_args;
+ const bool success = get_winograd_kernel_implementation(src, weights, dst, conv_info, act_info, enable_fast_math,
+ &winograd_impl, conv_args);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG_VAR(success == false, "Unsupported kernel size: %d x %d.\n", kernel_shape.n_rows,
+ kernel_shape.n_cols);
+ ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "Using input transform: %s\n",
+ winograd_impl.input_transform->get_name().c_str());
+ ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "Using weight transform: %s\n",
+ winograd_impl.weight_transform->get_name().c_str());
+ ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "Using output transform: %s\n",
+ winograd_impl.output_transform->get_name().c_str());
+ return Status{};
+}
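+
+// Illustrative usage sketch (src/weights/bias/dst and conv_info are assumed caller-side objects, not part
+// of this patch): the operator follows the usual validate() -> configure() -> run() flow, with prepare()
+// invoked internally on the first run.
+//
+//   CpuWinogradConv2d conv;
+//   ARM_COMPUTE_ERROR_THROW_ON(
+//       CpuWinogradConv2d::validate(src.info(), weights.info(), bias.info(), dst.info(), conv_info));
+//   conv.configure(src.info(), weights.info(), bias.info(), dst.info(), conv_info);
+//
+//   ITensorPack pack{{ACL_SRC_0, &src}, {ACL_SRC_1, &weights}, {ACL_SRC_2, &bias}, {ACL_DST, &dst}};
+//   conv.run(pack); // auxiliary tensors reported by workspace() must also be added to the pack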
+
+void CpuWinogradConv2d::run(ITensorPack &tensors)
+{
+ prepare(tensors);
+ auto src = tensors.get_const_tensor(ACL_SRC_0);
+ auto biases = tensors.get_const_tensor(ACL_SRC_2);
+ auto output = tensors.get_tensor(ACL_DST);
+ Window win;
+
+ const uint32_t nthreads = NEScheduler::get().num_threads();
+
+ // The Winograd transform implementation does fine-grained threading inside the transforms. Just pass thread_id and nthreads.
+ win.set(Window::DimX, Window::Dimension(0, nthreads, 1));
+
+ // Wrap the winograd-domain tensorInfos created in configuration in tensors and allocate the required memory.
+ CpuAuxTensorHandler input_nhwc(offset_int_vec(PermutedInput), _input_nhwc, tensors, true);
+ CpuAuxTensorHandler winograd_input_transformed(offset_int_vec(TransformedInput), _winograd_transformed_input,
+ tensors, true);
+ CpuAuxTensorHandler input_workspace(offset_int_vec(WorkspaceIO), _input_workspace, tensors, true);
+ const bool is_nchw = _data_layout == DataLayout::NCHW;
+ if (is_nchw)
+ {
+ // Bring channels to the front as the Winograd code expects the tensor to be in NHWC format
+ ITensorPack pack{{ACL_SRC, src}, {ACL_DST, input_nhwc.get()}};
+ _permute_input->run(pack);
+ }
+
+ CpuAuxTensorHandler winograd_output_transformed(offset_int_vec(TransformedOutput), _winograd_transformed_output,
+ tensors, true);
+ CpuAuxTensorHandler output_workspace(offset_int_vec(WorkspaceIO), _output_workspace, tensors, true);
+ CpuAuxTensorHandler output_nhwc(offset_int_vec(PermutedOutput), _output_nhwc, tensors, true);
+
+ ITensorPack transform_input_pack{{ACL_SRC, is_nchw ? input_nhwc.get() : src},
+ {ACL_DST, winograd_input_transformed.get()},
+ {ACL_INT, input_workspace.get()}};
+ NEScheduler::get().schedule_op(_transform_input_kernel.get(), Window::DimX, win, transform_input_pack);
+
+ CpuAuxTensorHandler winograd_weights_transformed(offset_int_vec(TransformedWeights), _winograd_transformed_weights,
+ tensors, true);
+
+ // Run the winograd-domain GEMMs across multiple threads; each kernel invocation runs one or more GEMMs
+ ITensorPack gemm_pack = tensors;
+ gemm_pack.add_const_tensor(ACL_SRC, winograd_input_transformed.get());
+ gemm_pack.add_const_tensor(ACL_SRC_1, winograd_weights_transformed.get());
+ gemm_pack.add_const_tensor(ACL_BIAS, nullptr);
+ gemm_pack.add_tensor(ACL_DST, winograd_output_transformed.get());
+ _gemm_function->run(gemm_pack);
+
+ // Output transform
+ ITensorPack transform_output_pack{{ACL_SRC_0, winograd_output_transformed.get()},
+ {ACL_DST, is_nchw ? output_nhwc.get() : output},
+ {ACL_SRC_1, biases},
+ {ACL_INT, output_workspace.get()}};
+ NEScheduler::get().schedule_op(_transform_output_kernel.get(), Window::DimX, win, transform_output_pack);
+ if (is_nchw)
+ {
+ // Reorder the convolved output to ACL's NCHW ordering
+ ITensorPack pack{{ACL_SRC, output_nhwc.get()}, {ACL_DST, output}};
+ _permute_output->run(pack);
+ }
+ if (_run_activation)
+ {
+ ITensorPack pack{{ACL_SRC, output}, {ACL_DST, output}};
+ _activation_func->run(pack);
+ }
+}
+
+void CpuWinogradConv2d::prepare(ITensorPack &tensors)
+{
+ if (!_is_prepared)
+ {
+ const ITensor *weights = tensors.get_const_tensor(ACL_SRC_1);
+ ITensor *weights_aux =
+ utils::cast::polymorphic_cast<ITensor *>(tensors.get_tensor(offset_int_vec(PermutedWeights)));
+
+ CpuAuxTensorHandler permuted_weights(_weights_hwio, *weights_aux);
+ ITensorPack permute_tensors{{ACL_SRC, weights}, {ACL_DST, permuted_weights.get()}};
+ _permute_weights->run(permute_tensors);
+ const int element_size_in_bytes = permuted_weights.get()->info()->element_size();
+ // The weights were in OHWI format; after the permute above, "permuted_weights" is in HWIO format.
+ const unsigned int height_idx = 3; // H in HWIO
+ const unsigned int width_idx = 2; // W in HWIO
+ const unsigned int channel_idx = 1; // I in HWIO
+
+ const int permuted_weight_row_stride =
+ permuted_weights.get()->info()->strides_in_bytes()[height_idx] / element_size_in_bytes;
+ const int permuted_weight_col_stride =
+ permuted_weights.get()->info()->strides_in_bytes()[width_idx] / element_size_in_bytes;
+ const int permuted_weight_channel_stride =
+ permuted_weights.get()->info()->strides_in_bytes()[channel_idx] / element_size_in_bytes;
+
+ // Wrap the winograd-domain transformed weight TensorInfo in Auxiliary tensor and allocate the required memory.
+ ITensor *weights_transf =
+ utils::cast::polymorphic_cast<ITensor *>(tensors.get_tensor(offset_int_vec(TransformedWeights)));
+ ARM_COMPUTE_ERROR_ON_NULLPTR(weights_transf);
+ CpuAuxTensorHandler winograd_transformed_weights(_winograd_transformed_weights, *weights_transf);
+
+ const void *permuted_weights_ptr;
+ void *win_wght_transf_ptr;
+
+ permuted_weights_ptr = reinterpret_cast<const void *>(
+ permuted_weights.get()->buffer() + permuted_weights.get()->info()->offset_first_element_in_bytes());
+ win_wght_transf_ptr =
+ reinterpret_cast<void *>(winograd_transformed_weights.get()->buffer() +
+ winograd_transformed_weights.get()->info()->offset_first_element_in_bytes());
+
+ // Prepare Weights
+ _winograd_impl.weight_transform->execute(
+ *_conv_args, permuted_weights_ptr, permuted_weight_row_stride, permuted_weight_col_stride,
+ permuted_weight_channel_stride, win_wght_transf_ptr, _winograd_impl.winograd_spec, 0, 1 // Thread 1 of 1
+ );
+ ITensorPack gemm_pack = tensors;
+ gemm_pack.add_const_tensor(ACL_SRC_1, winograd_transformed_weights.get());
+ _gemm_function->prepare(gemm_pack);
+ _is_prepared = true;
+ }
+}
+experimental::MemoryRequirements CpuWinogradConv2d::workspace() const
+{
+ return _aux_mem;
+}
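+
+// Illustrative sketch of how a caller could satisfy the requirements returned above (allocate_aux() is an
+// assumed helper that creates an auxiliary tensor of the requested size and alignment):
+//
+//   for (const auto &req : conv.workspace())
+//   {
+//       if (req.size > 0)
+//       {
+//           pack.add_tensor(req.slot, allocate_aux(req.size, req.alignment)); // honour req.lifetime
+//       }
+//   }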
+
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/operators/CpuWinogradConv2d.h b/src/cpu/operators/CpuWinogradConv2d.h
new file mode 100644
index 0000000000..03bfc51a46
--- /dev/null
+++ b/src/cpu/operators/CpuWinogradConv2d.h
@@ -0,0 +1,142 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_CPU_OPERATORS_CPUWINOGRADCONV2D_H
+#define ACL_SRC_CPU_OPERATORS_CPUWINOGRADCONV2D_H
+
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/runtime/FunctionDescriptors.h"
+
+#include "src/core/common/Macros.h"
+#include "src/cpu/ICpuOperator.h"
+#include "src/cpu/kernels/assembly/gemm_common.hpp"
+#include "src/cpu/kernels/CpuWinogradConv2dKernel.h"
+#include "src/cpu/operators/CpuActivation.h"
+#include "src/cpu/operators/CpuGemm.h"
+#include "src/cpu/operators/CpuPermute.h"
+#include "src/cpu/operators/internal/CpuGemmAssemblyDispatch.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+class CpuWinogradConv2d : public ICpuOperator
+{
+public:
+ /** Constructor */
+ CpuWinogradConv2d();
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuWinogradConv2d);
+ /** Destructor */
+ ~CpuWinogradConv2d();
+
+ /** Set the input and output tensors.
+ *
+ * Valid data layouts:
+ * - NHWC
+ * - NCHW
+ *
+ * Valid data type configurations:
+ * |src0 |src1 |src2 |dst |
+ * |:--------------|:--------------|:------|:--------------|
+ * |F16 |F16 |F16 |F16 |
+ * |F32 |F32 |F32 |F32 |
+ *
+ * @param[in] src Source tensor Info. The 3 lower dimensions represent a single input [width, height, IFM],
+ * while every optional dimension from 4 and above represents a batch of inputs.
+ * Data types supported: F16/F32.
+ * @param[in] weights Weights tensor Info. Weights are a 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: Same as @p input.
+ * For supported kernel sizes, see @ref arm_compute::NEWinogradConvolutionLayer
+ * @param[in] biases Biases tensor Info. Shared biases supported. Biases are a 1D tensor with dimensions [OFM]. Data type supported: Same as @p weights.
+ * @param[out] dst Destination tensor Info. The 3 lower dimensions represent a single output [width, height, OFM], while the rest represent a batch of outputs.
+ * Data types supported: Same as @p input.
+ * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. Currently only unit strides are supported.
+ * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
+ * @param[in] enable_fast_math (Optional) Enable fast math computation. If this flag is set, the function may dispatch the fastest implementation
+ * available, which can introduce a drop in accuracy. Default is false
+ */
+ void configure(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ bool enable_fast_math = false);
+ /** Static function to check if given info will lead to a valid configuration of @ref CpuWinogradConv2d
+ *
+ * Similar to CpuWinogradConv2d::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ bool enable_fast_math = false);
+
+ // Inherited methods overridden:
+ void run(ITensorPack &tensors) override;
+ void prepare(ITensorPack &constants) override;
+ experimental::MemoryRequirements workspace() const override;
+
+private:
+ enum AuxTensorIdx
+ {
+ /** Slots 0 to 6 are reserved for CpuGemm */
+ TransformedInput = 7,
+ TransformedOutput,
+ WorkspaceIO,
+ TransformedWeights,
+ PermutedWeights,
+ Count,
+ PermutedInput = TransformedOutput,
+ PermutedOutput = TransformedInput
+ };
+ std::unique_ptr<CpuGemm> _gemm_function;
+ std::unique_ptr<CpuActivation> _activation_func;
+ std::unique_ptr<ICPPKernel> _transform_input_kernel;
+ std::unique_ptr<ICPPKernel> _transform_output_kernel;
+ std::unique_ptr<CpuPermute> _permute_input;
+ std::unique_ptr<CpuPermute> _permute_output;
+ std::unique_ptr<CpuPermute> _permute_weights;
+ experimental::MemoryRequirements _aux_mem{Count};
+ std::unique_ptr<arm_conv::ConvolutionArgs>
+ _conv_args; // Make it unique ptr because this type does not have a default constructor
+ arm_conv::winograd::WinogradImpl _winograd_impl;
+ DataLayout _data_layout;
+ TensorInfo _winograd_transformed_input;
+ TensorInfo _winograd_transformed_output;
+ TensorInfo _winograd_transformed_weights;
+ TensorInfo _input_workspace;
+ TensorInfo _output_workspace;
+ TensorInfo _weights_hwio;
+ TensorInfo _input_nhwc;
+ TensorInfo _output_nhwc;
+ bool _is_prepared;
+ bool _run_activation;
+};
+} // namespace cpu
+} // namespace arm_compute
+
+#endif // ACL_SRC_CPU_OPERATORS_CPUWINOGRADCONV2D_H
diff --git a/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp b/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp
new file mode 100644
index 0000000000..a4c856bb8f
--- /dev/null
+++ b/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp
@@ -0,0 +1,1140 @@
+/*
+ * Copyright (c) 2018-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/operators/internal/CpuGemmAssemblyDispatch.h"
+
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+#include "src/core/CPP/Validate.h"
+#include "src/core/helpers/MemoryHelpers.h"
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "src/core/utils/AssemblyUtils.h"
+#include "src/cpu/kernels/assembly/arm_gemm.hpp"
+#include "src/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h"
+#include "src/cpu/operators/CpuTranspose.h"
+#include "src/cpu/utils/CpuAuxTensorHandler.h"
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace
+{
+/** Run pretranspose_B_array in parallel (1D static scheduling)
+ *
+ * @tparam TypeInput
+ * @tparam TypeOutput
+ *
+ * @param[in] gemm_asm GemmCommon kernel to run
+ * @param[in] dst Pretransposed B array
+ * @param[in] src B array to be pretransposed
+ * @param[in] src_ld Stride in y
+ * @param[in] src_multi_stride Stride in z ("multi")
+ * @param[in] num_threads Number of threads to run this method. Must be >= 1
+ * @param[in] transpose Whether B is additionally transposed as part of the pretranspose
+ */
+template <typename TypeInput, typename TypeOutput>
+void run_parallel_pretranspose_B_array(arm_gemm::GemmCommon<TypeInput, TypeOutput> *gemm_asm,
+ ITensor *dst,
+ const TypeInput *src,
+ int src_ld,
+ int src_multi_stride,
+ unsigned int num_threads,
+ bool transpose)
+{
+ ARM_COMPUTE_ERROR_ON(gemm_asm == nullptr);
+ ARM_COMPUTE_ERROR_ON(num_threads == 0);
+ // The window size is also the total workload size
+ const unsigned int wsize = gemm_asm->get_B_pretranspose_window_size();
+
+ std::vector<IScheduler::Workload> workloads(num_threads);
+ for (unsigned int t = 0; t < num_threads; ++t)
+ {
+ workloads[t] = [=](const ThreadInfo &info)
+ {
+ const unsigned int start = (info.thread_id * wsize) / num_threads;
+ const unsigned int end = ((info.thread_id + 1) * wsize) / num_threads;
+
+ if (start < end)
+ {
+ gemm_asm->pretranspose_B_array_part(dst->buffer(), src, src_ld, src_multi_stride, transpose, start,
+ end);
+ }
+ };
+ }
+ NEScheduler::get().run_tagged_workloads(workloads, "CpuGemmAssemblyDispatch/pretranspose_B_array");
+}
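+
+// Worked example of the static split above (illustrative): with wsize = 10 and num_threads = 4 the
+// half-open ranges are [0,2), [2,5), [5,7) and [7,10), so every window element is covered exactly once and
+// a thread whose range is empty simply skips the call.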
+} // namespace
+
+using namespace arm_compute::experimental;
+
+namespace
+{
+struct free_delete
+{
+ void operator()(void *x)
+ {
+ free(x);
+ }
+};
+
+struct Params
+{
+ unsigned int M;
+ unsigned int N;
+ unsigned int K;
+ unsigned int batches;
+ unsigned int multis;
+ unsigned int sections;
+ bool indirect;
+};
+
+Params extract_parameters(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *d, const AsmGemmInfo &info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, d);
+ Params p;
+ p.M = d->tensor_shape().y();
+ p.K = a->tensor_shape().x();
+ p.N = d->tensor_shape().x();
+ p.batches = 1;
+ p.multis = 1;
+ p.sections = 1;
+ p.indirect = false;
+
+ if (info.method == AsmConvMethod::Conv || info.method == AsmConvMethod::Indirect)
+ {
+ p.indirect = true;
+ p.sections = b->tensor_shape()[2] * b->tensor_shape()[3];
+ }
+ else
+ {
+ p.multis = b->tensor_shape().z();
+ p.batches = d->tensor_shape().total_size_upper(2) / p.multis;
+ }
+
+ // Update M in case of GEMM3D for output
+ if (info.depth_output_gemm3d != 0)
+ {
+ p.M = d->tensor_shape().y() * d->tensor_shape().z();
+ p.batches = d->tensor_shape().total_size_upper(3) / p.multis;
+ }
+
+ return p;
+}
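+
+// Worked example (illustrative): a plain GEMM (AsmConvMethod::Im2Col, depth_output_gemm3d == 0) with
+//   a = [K=3, M=4, Batch=2, Multi=1], b = [N=5, K=3, Multi=1], d = [N=5, M=4, Batch=2, Multi=1]
+// yields M=4, N=5, K=3, batches=2, multis=1, sections=1, indirect=false.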
+
+IScheduler::Hints scheduling_hint_heuristic(arm_gemm::GemmMethod method, DataType data_type)
+{
+ // Schedule assembly kernel
+ const int granule_threshold = 200;
+ IScheduler::Hints scheduling_hint = IScheduler::Hints(Window::DimX);
+ if (method == arm_gemm::GemmMethod::GEMM_INTERLEAVED && data_type == DataType::F32)
+ {
+ scheduling_hint = IScheduler::Hints(Window::DimX, IScheduler::StrategyHint::DYNAMIC, granule_threshold);
+ }
+ else if (method == arm_gemm::GemmMethod::GEMM_INTERLEAVED_2D &&
+ (data_type == DataType::F32 || data_type == DataType::F16 || data_type == DataType::U8 ||
+ data_type == DataType::S8))
+ {
+ // GEMM_INTERLEAVED_2D supports 2D parallelism; IScheduler::split_dimensions_all signals to parallelise over all window dimensions
+ scheduling_hint =
+ IScheduler::Hints(IScheduler::split_dimensions_all, IScheduler::StrategyHint::STATIC, granule_threshold);
+ }
+ else if (method == arm_gemm::GemmMethod::QUANTIZE_WRAPPER_2D &&
+ (data_type == DataType::QASYMM8 || data_type == DataType::QASYMM8_SIGNED))
+ {
+ // Special case for QASYMM8 to support 2D parallelism; the scheduler here may be tweaked differently compared to the FP32 case
+ scheduling_hint =
+ IScheduler::Hints(IScheduler::split_dimensions_all, IScheduler::StrategyHint::STATIC, granule_threshold);
+ }
+
+ return scheduling_hint;
+}
+
+/** Fallback in case ACL doesn't have a function */
+template <typename TypeInput, typename TypeOutput, class OutputStage = arm_gemm::Nothing>
+class Fallback : public CpuGemmAssemblyDispatch::IFallback
+{
+public:
+ /** Destructor */
+ ~Fallback() = default;
+
+ /** Initialise the functions's input and output.
+ *
+ * @param[in] a Input tensor containing the Matrix A.
+ * @param[in] b Input tensor containing the Matrix B.
+ * @param[in] c Input tensor containing the Matrix C.
+ * @param[out] d Output tensor to store the result of matrix multiplication.
+ * @param[in] args Matrix multiplication information.
+ * @param[in] gemm_info GEMM meta-data
+ * @param[in] os Output stage meta-data.
+ */
+ void configure(const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ ITensorInfo *d,
+ arm_gemm::GemmArgs args,
+ const AsmGemmInfo &gemm_info,
+ const OutputStage &os = {});
+
+ /** Set requantization data to be used
+ *
+ * @param[in] shifts Requantization shifts
+ * @param[in] multipliers Requantization multipliers
+ *
+ * @return A tuple with a flag indicating whether left shifts are needed, followed by pointers to the left shift, right shift and multiplier data respectively
+ */
+ std::tuple<bool, const int32_t *, const int32_t *, const int32_t *>
+ set_requantize_data(const std::vector<int32_t> &shifts, const std::vector<int32_t> &multipliers);
+
+ // Inherited methods overridden:
+ void run(ITensorPack &tensors) override;
+ void prepare(ITensorPack &tensors) override;
+ bool is_configured() const override;
+ experimental::MemoryRequirements workspace() const override;
+ bool isVarWeightsKernel() const override
+ {
+ if (!_gemm_kernel_asm)
+ return false;
+ const arm_compute::WeightFormat wf =
+ assembly_utils::map_to_arm_compute_weight_format(_gemm_kernel_asm->get_config().weight_format);
+ return wf != arm_compute::WeightFormat::UNSPECIFIED && wf != arm_compute::WeightFormat::ANY;
+ }
+
+private:
+ enum AuxTensorIdx
+ {
+ AsmGemmWorkspace = 0,
+ PrePretransposedB, /* Transposed B (rhs) before being passed to gemm or pretranspose_B_array */
+ Pretranspose,
+ Count
+ };
+
+ /** Configure the indirect buffer
+ *
+ * @param[in] a Input tensor containing the Matrix A.
+ * @param[in] b Input tensor containing the Matrix B.
+ * @param[out] d Output tensor to store the result of matrix multiplication.
+ * @param[in] info GEMM meta-data
+ */
+ void configure_indirect(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *d, const AsmGemmInfo &info);
+ /** Prepare the indirect buffer */
+ void prepare_indirect_buffer(ITensorPack &tensors);
+
+ /** Operator to transpose B before gemm or pretranspose_B_array */
+ std::unique_ptr<CpuTranspose> _pre_pretranspose_b{nullptr};
+ /** Assembly Gemm kernel */
+ std::shared_ptr<arm_gemm::GemmCommon<TypeInput, TypeOutput>> _gemm_kernel_asm{nullptr};
+ /** Optimised Arm® Neon™ kernel */
+ std::unique_ptr<INEKernel> _optimised_kernel{nullptr};
+ /** Assembly GEMM workspace tensor info */
+ TensorInfo _workspace_info{};
+ /** Pre-pre-transposed B tensor info */
+ TensorInfo _pre_pretransposed_b_info{};
+ /** Pre-transpose tensor info */
+ TensorInfo _pretranspose_info{};
+ /** Prepared flag */
+ bool _is_prepared{false};
+ /** GEMM meta-data */
+ AsmGemmInfo _gemm_info{};
+ /** GEMM kernel description */
+ arm_gemm::KernelDescription _kernel_info{};
+ /** Per channel quantization shifts */
+ std::vector<int32_t> _shifts{};
+ std::vector<int32_t> right_shifts{};
+ std::vector<int32_t> left_shifts{};
+ /** Per channel quantization multipliers */
+ std::vector<int32_t> _multipliers{};
+ /** Indirect buffer */
+ std::unique_ptr<const TypeInput *const *, free_delete> _indirect_arg{};
+ std::unique_ptr<const TypeInput *, free_delete> _indirect_buf{};
+ std::vector<TypeInput> _indirect_pad{};
+ arm_gemm::ConvolutionParameters _cp{};
+ experimental::MemoryRequirements _aux_mem{Count};
+ bool _B_pretranspose_required{false};
+ bool _is_b_constant{true};
+ bool _is_c_constant{true};
+ bool _run_pre_pretranspose_b{false};
+ bool _B_pre_pretranspose_required{false};
+};
+
+template <typename TypeInput, typename TypeOutput, class OutputStage>
+std::tuple<bool, const int32_t *, const int32_t *, const int32_t *>
+Fallback<TypeInput, TypeOutput, OutputStage>::set_requantize_data(const std::vector<int32_t> &shifts,
+ const std::vector<int32_t> &multipliers)
+{
+ _multipliers = multipliers;
+ _shifts = shifts;
+ bool need_left = false;
+ for (const auto s : _shifts)
+ {
+ left_shifts.push_back(std::max(-s, int32_t(0)));
+ right_shifts.push_back(std::min(-s, int32_t(0)));
+ if (s < 0 && !need_left)
+ {
+ need_left = true;
+ }
+ }
+ return std::make_tuple(need_left, left_shifts.data(), right_shifts.data(), _multipliers.data());
+}
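+
+// Worked example (illustrative): shifts = {1, -2} produce left_shifts = {0, 2}, right_shifts = {-1, 0} and
+// need_left = true, so the returned tuple is (true, left_shifts.data(), right_shifts.data(), _multipliers.data()).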
+
+template <typename TypeInput, typename TypeOutput, class OutputStage>
+void Fallback<TypeInput, TypeOutput, OutputStage>::prepare_indirect_buffer(ITensorPack &tensors)
+{
+ auto a = tensors.get_const_tensor(TensorType::ACL_SRC_0);
+ const TypeInput *A_ptr = reinterpret_cast<TypeInput *>(a->buffer());
+ const int multis = 1;
+ const int batches = a->info()->tensor_shape().total_size_upper(3);
+ const size_t stride_A = a->info()->strides_in_bytes().y() / sizeof(TypeInput);
+ const size_t batch_stride_A = a->info()->strides_in_bytes()[3] / sizeof(TypeInput);
+ const size_t multi_stride_A = a->info()->strides_in_bytes()[4] / sizeof(TypeInput);
+
+ const size_t output_hw = _cp.output_height * _cp.output_width;
+ const int batch_size = _cp.kernel_height * _cp.kernel_width * output_hw * sizeof(TypeInput);
+ const size_t batch_stride = batch_size / sizeof(TypeInput);
+ const int multi_size = batch_size * batches;
+ const size_t multi_stride = multi_size / sizeof(TypeInput);
+
+ for (int64_t m = 0; m < multis; m++)
+ {
+ for (int64_t b = 0; b < batches; b++)
+ {
+ for (int64_t output_y = 0; output_y < _cp.output_height; output_y++)
+ {
+ for (int64_t output_x = 0; output_x < _cp.output_width; output_x++)
+ {
+ int64_t output_xy = (output_y * _cp.output_width) + output_x;
+
+ for (int64_t kernel_y = 0; kernel_y < _cp.kernel_height; kernel_y++)
+ {
+ for (int64_t kernel_x = 0; kernel_x < _cp.kernel_width; kernel_x++)
+ {
+ int64_t input_x = (output_x * _cp.output_stride_w) + kernel_x - _cp.padding_left;
+ int64_t input_y = (output_y * _cp.output_stride_h) + kernel_y - _cp.padding_top;
+ int64_t kernel_xy = (kernel_y * _cp.kernel_width) + kernel_x;
+ int64_t input_xy = (input_y * _cp.input_width) + input_x;
+
+ if (input_x < 0 || input_x >= _cp.input_width || input_y < 0 || input_y >= _cp.input_height)
+ {
+ _indirect_buf
+ .get()[m * multi_stride + b * batch_stride + kernel_xy * output_hw + output_xy] =
+ _indirect_pad.data();
+ }
+ else
+ {
+ _indirect_buf
+ .get()[m * multi_stride + b * batch_stride + kernel_xy * output_hw + output_xy] =
+ A_ptr + (m * multi_stride_A + b * batch_stride_A + input_xy * stride_A);
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+}
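+
+// Note on the loops above: each entry of the indirect buffer ends up pointing either at the input row read
+// by a given (kernel_y, kernel_x) tap for output position (output_y, output_x), or at the shared
+// zero-padding vector _indirect_pad when that tap falls outside the input.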
+
+template <typename TypeInput, typename TypeOutput, class OutputStage>
+void Fallback<TypeInput, TypeOutput, OutputStage>::configure_indirect(const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *d,
+ const AsmGemmInfo &info)
+{
+ ARM_COMPUTE_ERROR_ON(!(info.method == AsmConvMethod::Conv || info.method == AsmConvMethod::Indirect));
+
+ float zeropad = 0.f;
+ if (is_data_type_quantized(a->data_type()))
+ {
+ zeropad = a->quantization_info().uniform().offset;
+ }
+
+ const int64_t input_width = static_cast<int64_t>(a->tensor_shape()[1]);
+ const int64_t input_height = static_cast<int64_t>(a->tensor_shape()[2]);
+ const int64_t input_channels = static_cast<int64_t>(a->tensor_shape()[0]);
+ const int64_t kernel_width = static_cast<int64_t>(b->tensor_shape()[2]);
+ const int64_t kernel_height = static_cast<int64_t>(b->tensor_shape()[3]);
+ const int64_t output_width = static_cast<int64_t>(d->tensor_shape()[1]);
+ const int64_t output_height = static_cast<int64_t>(d->tensor_shape()[2]);
+
+ _cp = {input_width,
+ input_height,
+ input_channels,
+ kernel_width,
+ kernel_height,
+ output_width,
+ output_height,
+ info.ps_info.stride().first,
+ info.ps_info.stride().second,
+ info.padding_top,
+ info.padding_left,
+ zeropad};
+
+ if (info.method == AsmConvMethod::Conv)
+ {
+ _gemm_kernel_asm->set_convolution_parameters(_cp);
+ }
+
+ if (info.method == AsmConvMethod::Indirect)
+ {
+ const unsigned int multis = 1;
+ const unsigned int batches = a->tensor_shape().total_size_upper(3);
+ const unsigned int kernel_hw = _cp.kernel_width * _cp.kernel_height;
+ const unsigned int output_hw = _cp.output_width * _cp.output_height;
+
+ using TypeInputPtr = TypeInput *;
+ const int batch_size = kernel_hw * output_hw * sizeof(TypeInputPtr);
+ const size_t batch_stride = batch_size / sizeof(TypeInputPtr);
+ const int multi_size = batch_size * batches;
+ const size_t multi_stride = multi_size / sizeof(TypeInputPtr);
+
+ _indirect_buf = std::unique_ptr<const TypeInput *, free_delete>(
+ reinterpret_cast<const TypeInput **>(malloc(multi_size * multis)));
+ _indirect_arg = std::unique_ptr<const TypeInput *const *, free_delete>(
+ reinterpret_cast<const TypeInput *const **>(malloc(sizeof(TypeInput **) * kernel_hw * multis * batches)));
+ _indirect_pad = std::vector<TypeInput>(_cp.input_channels, TypeInput(zeropad));
+
+ // Set indirect argument
+ int64_t pos = 0;
+ for (int64_t m = 0; m < multis; m++)
+ {
+ for (int64_t b = 0; b < batches; b++)
+ {
+ for (int64_t kernel_xy = 0; kernel_xy < kernel_hw; kernel_xy++)
+ {
+ (_indirect_arg.get())[pos++] =
+ _indirect_buf.get() + m * multi_stride + b * batch_stride + kernel_xy * output_hw;
+ }
+ }
+ }
+
+ _gemm_kernel_asm->set_indirect_parameters(a->tensor_shape()[0], _indirect_arg.get());
+ }
+}
+
+template <typename TypeInput, typename TypeOutput, class OutputStage>
+void Fallback<TypeInput, TypeOutput, OutputStage>::configure(const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ ITensorInfo *d,
+ arm_gemm::GemmArgs args,
+ const AsmGemmInfo &gemm_info,
+ const OutputStage &os)
+{
+ _is_b_constant = b->are_values_constant();
+ _is_c_constant = c ? c->are_values_constant() : true;
+
+ _gemm_kernel_asm = arm_gemm::gemm<TypeInput, TypeOutput, OutputStage>(args, os);
+ if (_gemm_kernel_asm == nullptr)
+ {
+ // Configuration not supported: leave the function unconfigured
+ return;
+ }
+
+ arm_gemm::GemmConfig gemm_cfg = _gemm_kernel_asm->get_config();
+
+ // arm_compute wrapper for the Gemm object (see above)
+ auto acl_gemm_wrapper = std::make_unique<kernel::CpuGemmAssemblyWrapperKernel<TypeInput, TypeOutput>>();
+ ARM_COMPUTE_ERROR_ON(acl_gemm_wrapper == nullptr);
+ acl_gemm_wrapper->configure(_gemm_kernel_asm.get(), gemm_cfg.filter);
+ const size_t workspace_size = _gemm_kernel_asm->get_working_size();
+ const unsigned int alignment = 4096;
+ _workspace_info = TensorInfo(TensorShape(workspace_size), 1, DataType::U8);
+ _aux_mem[AsmGemmWorkspace] =
+ MemoryInfo(offset_int_vec(AsmGemmWorkspace), MemoryLifetime::Temporary, workspace_size, alignment);
+
+ // If the block below is removed, ConvLayer deadlocks when threads > 1 and
+ // the shapes are In=1x1x1024 Weights=1x1x1024x1001 Biases=1001 Out=1x1x1001
+ {
+ const unsigned int window_size = _gemm_kernel_asm->get_window_size().total_size();
+ if (window_size < static_cast<unsigned int>(args._maxthreads))
+ {
+ _gemm_kernel_asm->set_nthreads(window_size);
+ }
+ }
+
+ _optimised_kernel = std::move(acl_gemm_wrapper);
+ _gemm_info = gemm_info;
+
+ // Check if we need to pre-pretranspose B. Fixed format kernels need no pre-pretranspose.
+ _B_pre_pretranspose_required = _gemm_info.transpose_b && !isVarWeightsKernel();
+ _B_pretranspose_required = _gemm_kernel_asm->B_pretranspose_required();
+
+ const bool kernel_supports_transpose = _gemm_kernel_asm->B_pretranspose_supports_transpose();
+ const bool kernel_can_fuse_transpose = _B_pretranspose_required && kernel_supports_transpose;
+ _run_pre_pretranspose_b = _B_pre_pretranspose_required && !kernel_can_fuse_transpose;
+
+ if (_run_pre_pretranspose_b)
+ {
+ _pre_pretranspose_b = std::make_unique<CpuTranspose>();
+ _pre_pretranspose_b->configure(b, &_pre_pretransposed_b_info);
+ MemoryLifetime lifetime;
+ if (_is_b_constant)
+ {
+ if (_B_pretranspose_required)
+ {
+ // PrePretransposedB tensor is only used in prepare(), but is then succeeded by Pretranspose
+ // So PrePretransposedB can be freed inside prepare()
+ lifetime = MemoryLifetime::Prepare;
+ }
+ else
+ {
+ // PrePretransposedB tensor is only used in prepare(), but is the final transformation of B
+ // So PrePretransposedB needs to persist beyond prepare()
+ lifetime = MemoryLifetime::Persistent;
+ }
+ }
+ else
+ {
+ // PrePretransposedB tensor is always used in run() and doesn't need to persist
+ lifetime = MemoryLifetime::Temporary;
+ }
+ // Forcing 128-byte alignment (required by 32-bit kernels)
+ const unsigned int alignment = 128;
+ _aux_mem[PrePretransposedB] =
+ MemoryInfo(offset_int_vec(PrePretransposedB), lifetime, _pre_pretransposed_b_info.total_size(), alignment);
+ }
+
+ // Check for pre-transposed support
+ if (_B_pretranspose_required)
+ {
+ // Fixed format kernels need no pretranspose.
+ ARM_COMPUTE_ERROR_ON(arm_compute::is_fixed_format(
+ assembly_utils::map_to_arm_compute_weight_format(_gemm_kernel_asm->get_config().weight_format)));
+ // Forcing 128-byte alignment (required by 32-bit kernels)
+ const unsigned int alignment = 128;
+ const size_t B_pretranspose_size = _gemm_kernel_asm->get_B_pretransposed_array_size();
+ _pretranspose_info = TensorInfo(TensorShape(B_pretranspose_size), 1, DataType::U8);
+ _aux_mem[Pretranspose] =
+ MemoryInfo(offset_int_vec(Pretranspose), MemoryLifetime::Persistent, B_pretranspose_size, alignment);
+ }
+
+ // Handle indirect GEMM convolution
+ if (gemm_info.method == AsmConvMethod::Conv || gemm_info.method == AsmConvMethod::Indirect)
+ {
+ configure_indirect(a, b, d, gemm_info);
+ }
+
+ if (std::is_same<OutputStage, arm_gemm::DequantizeFloat>::value)
+ {
+ // Output dequantization is just the two src scales multiplied together
+ _gemm_kernel_asm->set_dequantize_scale(a->quantization_info().uniform().scale *
+ b->quantization_info().uniform().scale);
+ }
+}
+
+template <typename TypeInput, typename TypeOutput, class OutputStage>
+void Fallback<TypeInput, TypeOutput, OutputStage>::prepare(ITensorPack &tensors)
+{
+ if (!_is_prepared)
+ {
+ auto b = tensors.get_const_tensor(TensorType::ACL_SRC_1);
+ auto c = tensors.get_const_tensor(TensorType::ACL_SRC_2);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(b);
+
+ // Set up the matrix bias in the assembly kernel; it is just a pointer to matrix C.
+ if (c && c->info()->data_type() == DataType::S32)
+ {
+ _gemm_kernel_asm->set_quantized_bias(
+ reinterpret_cast<const int32_t *>(c->buffer() + c->info()->offset_first_element_in_bytes()), 0);
+ }
+ const ITensor *b_to_use = b;
+
+ // Pre-pretranspose B if required
+ CpuAuxTensorHandler pre_pretransposed_b(
+ offset_int_vec(PrePretransposedB), _pre_pretransposed_b_info, tensors,
+ /*pack_inject: no need to inject into tensors*/
+ false,
+ /*bypass_alloc: no need to allocate if pre-pretranspose B is not required as this handle will not be used*/
+ !_run_pre_pretranspose_b);
+
+ if (_run_pre_pretranspose_b)
+ {
+ ARM_COMPUTE_ERROR_ON(_pre_pretranspose_b == nullptr);
+ ITensorPack pre_pretranspose_pack{{ACL_SRC, b_to_use}, {ACL_DST, pre_pretransposed_b.get()}};
+ _pre_pretranspose_b->run(pre_pretranspose_pack);
+ b_to_use = pre_pretransposed_b.get();
+ }
+
+ // Pretranspose B if required
+ if (_B_pretranspose_required)
+ {
+ // Fixed format kernels need no pretranspose.
+ ARM_COMPUTE_ERROR_ON(arm_compute::is_fixed_format(
+ assembly_utils::map_to_arm_compute_weight_format(_gemm_kernel_asm->get_config().weight_format)));
+ const int ldb = b_to_use->info()->strides_in_bytes().y() / b_to_use->info()->element_size();
+ const auto in1_ptr = reinterpret_cast<const TypeInput *>(b_to_use->buffer() +
+ b_to_use->info()->offset_first_element_in_bytes());
+ const int multi_stride_b = b_to_use->info()->strides_in_bytes().z() / b_to_use->info()->element_size();
+
+ CpuAuxTensorHandler pretranspose(offset_int_vec(Pretranspose), _pretranspose_info, tensors, false);
+
+ ARM_COMPUTE_ERROR_ON(pretranspose.get()->buffer() == nullptr);
+
+ const bool kernel_supports_transpose = _gemm_kernel_asm->B_pretranspose_supports_transpose();
+ run_parallel_pretranspose_B_array<TypeInput, TypeOutput>(
+ _gemm_kernel_asm.get(), pretranspose.get(), in1_ptr, ldb, multi_stride_b,
+ NEScheduler::get().num_threads(), _B_pre_pretranspose_required && kernel_supports_transpose);
+
+ b->mark_as_unused();
+ // Note that we don't need to mark b_to_use as unused, as if it's been assigned to pre_pretransposed_b,
+ // its memory will be auto-managed by the handler
+ }
+
+ if (_gemm_info.method == AsmConvMethod::Indirect)
+ {
+ prepare_indirect_buffer(tensors);
+ }
+
+ _is_prepared = true;
+ }
+}
+
+template <typename TypeInput, typename TypeOutput, class OutputStage>
+bool Fallback<TypeInput, TypeOutput, OutputStage>::is_configured() const
+{
+ return _optimised_kernel != nullptr;
+}
+
+template <typename TypeInput, typename TypeOutput, class OutputStage>
+experimental::MemoryRequirements Fallback<TypeInput, TypeOutput, OutputStage>::workspace() const
+{
+ return _aux_mem;
+}
+
+template <typename TypeInput, typename TypeOutput, class OutputStage>
+void Fallback<TypeInput, TypeOutput, OutputStage>::run(ITensorPack &tensors)
+{
+ auto a = tensors.get_const_tensor(TensorType::ACL_SRC_0);
+ auto b = tensors.get_const_tensor(TensorType::ACL_SRC_1);
+ auto c = tensors.get_const_tensor(TensorType::ACL_SRC_2);
+ auto d = tensors.get_tensor(TensorType::ACL_DST);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(a, d);
+
+ // Only update at runtime if the src quantization is dynamic
+ if (std::is_same<OutputStage, arm_gemm::DequantizeFloat>::value &&
+ (a->info()->quantization_info().is_dynamic() || b->info()->quantization_info().is_dynamic()))
+ {
+ // Output dequantization is just the two src scales multiplied together
+ _gemm_kernel_asm->set_dequantize_scale(a->info()->quantization_info().uniform().scale *
+ b->info()->quantization_info().uniform().scale);
+ }
+
+ int lda = a->info()->strides_in_bytes().y() / a->info()->element_size();
+ int ldb = 0;
+ const int ldd = d->info()->strides_in_bytes().y() / d->info()->element_size();
+
+ const size_t a_batch_idx = _gemm_info.reinterpret_input_as_3d != 0 ? 3 : 2;
+ const size_t a_multi_idx = a_batch_idx + 1;
+ const size_t d_batch_idx = _gemm_info.depth_output_gemm3d != 0 ? 3 : 2;
+ const size_t d_multi_idx = d_batch_idx + 1;
+
+ int batch_stride_a = a->info()->strides_in_bytes()[a_batch_idx] / a->info()->element_size();
+ const int batch_stride_d = d->info()->strides_in_bytes()[d_batch_idx] / d->info()->element_size();
+
+ int multi_stride_a = a->info()->strides_in_bytes()[a_multi_idx] / a->info()->element_size();
+ int multi_stride_b = 0;
+ const int multi_stride_d = d->info()->strides_in_bytes()[d_multi_idx] / d->info()->element_size();
+
+ auto in0_ptr = reinterpret_cast<const TypeInput *>(a->buffer() + a->info()->offset_first_element_in_bytes());
+ const TypeInput *in1_ptr = nullptr;
+ auto out_ptr = reinterpret_cast<TypeOutput *>(d->buffer() + d->info()->offset_first_element_in_bytes());
+
+ const ITensor *b_to_use = b;
+
+ // Pre-pretranspose B if required
+ CpuAuxTensorHandler pre_pretransposed_b(
+ offset_int_vec(PrePretransposedB), _pre_pretransposed_b_info, tensors,
+ false /*pack_inject: no need to inject into tensors*/,
+ !_run_pre_pretranspose_b /*bypass_alloc: no need to allocate if pre-pretranspose B is not required as this handle will not be used*/);
+ if (b_to_use && !_is_b_constant && _run_pre_pretranspose_b)
+ {
+ ARM_COMPUTE_ERROR_ON(_pre_pretranspose_b == nullptr);
+ ITensorPack pre_pretranspose_pack{{ACL_SRC, b_to_use}, {ACL_DST, pre_pretransposed_b.get()}};
+ _pre_pretranspose_b->run(pre_pretranspose_pack);
+ b_to_use = pre_pretransposed_b.get();
+ }
+
+ // Check if B is pre-transposed and de-reference it if not
+ if (b_to_use && !_gemm_kernel_asm->B_is_pretransposed())
+ {
+ ldb = b_to_use->info()->strides_in_bytes().y() / b_to_use->info()->element_size();
+ multi_stride_b = b_to_use->info()->strides_in_bytes().z() / b_to_use->info()->element_size();
+ in1_ptr =
+ reinterpret_cast<const TypeInput *>(b_to_use->buffer() + b_to_use->info()->offset_first_element_in_bytes());
+ }
+
+ // If either the weights or the biases are non-constant, redo the bias setup and, if required, the pretranspose on every run
+ if ((b_to_use && !_is_b_constant) || (c && !_is_c_constant && c->info()->data_type() == DataType::S32))
+ {
+ if (c && c->info()->data_type() == DataType::S32)
+ {
+ _gemm_kernel_asm->set_quantized_bias(
+ reinterpret_cast<const int32_t *>(c->buffer() + c->info()->offset_first_element_in_bytes()), 0);
+ }
+
+ // Pretranspose B if required
+ if (b_to_use && _B_pretranspose_required)
+ {
+ // Fixed format kernels need no pretranspose.
+ ARM_COMPUTE_ERROR_ON(arm_compute::is_fixed_format(
+ assembly_utils::map_to_arm_compute_weight_format(_gemm_kernel_asm->get_config().weight_format)));
+ const int ldb = b_to_use->info()->strides_in_bytes().y() / b_to_use->info()->element_size();
+ const auto b_ptr = reinterpret_cast<const TypeInput *>(b_to_use->buffer() +
+ b_to_use->info()->offset_first_element_in_bytes());
+ const int multi_stride_b = b_to_use->info()->strides_in_bytes().z() / b_to_use->info()->element_size();
+
+ CpuAuxTensorHandler pretranspose(offset_int_vec(Pretranspose), _pretranspose_info, tensors, true);
+ ARM_COMPUTE_ERROR_ON(pretranspose.get()->buffer() == nullptr);
+
+ if (_is_b_constant)
+ {
+ _gemm_kernel_asm->requantize_bias(pretranspose.get()->buffer(), b_ptr, ldb, multi_stride_b);
+ }
+ else
+ {
+ const bool kernel_supports_transpose = _gemm_kernel_asm->B_pretranspose_supports_transpose();
+ run_parallel_pretranspose_B_array<TypeInput, TypeOutput>(
+ _gemm_kernel_asm.get(), pretranspose.get(), b_ptr, ldb, multi_stride_b,
+ NEScheduler::get().num_threads(), _B_pre_pretranspose_required && kernel_supports_transpose);
+ }
+ }
+ }
+
+ const auto scheduling_hint = scheduling_hint_heuristic(_kernel_info.method, d->info()->data_type());
+
+ // Set workspace if needed and reset number of threads as buffer manager gets re-created with max_threads
+ CpuAuxTensorHandler workspace(offset_int_vec(AsmGemmWorkspace), _workspace_info, tensors, false);
+ if (workspace.get()->buffer() != nullptr)
+ {
+ _gemm_kernel_asm->set_working_space(reinterpret_cast<void *>(workspace.get()->buffer()));
+ const unsigned int split_dim = scheduling_hint.split_dimension();
+ const unsigned int window_size = _gemm_kernel_asm->get_window_size().total_size();
+ unsigned int num_threads = NEScheduler::get().num_threads();
+ if (window_size < num_threads)
+ {
+ num_threads = window_size;
+ }
+ if (split_dim != IScheduler::split_dimensions_all)
+ {
+ // Make sure the kernel does not expect more threads than we can actually spawn
+ const unsigned int num_iterations = _optimised_kernel.get()->window().num_iterations(split_dim);
+ num_threads = std::min(num_iterations, num_threads);
+ }
+ _gemm_kernel_asm->set_nthreads(num_threads);
+ }
+
+ // Prepare assembly kernel
+ prepare(tensors);
+
+ // Set up the matrix bias in the assembly kernel; it is just a pointer to matrix C.
+ TypeOutput *bias = nullptr;
+ if (c && c->info()->data_type() != DataType::S32)
+ {
+ bias = reinterpret_cast<TypeOutput *>(c->buffer() + c->info()->offset_first_element_in_bytes());
+ }
+
+ if (_gemm_info.method == AsmConvMethod::Indirect)
+ {
+ in0_ptr = nullptr;
+ lda = 0;
+ batch_stride_a = 0;
+ multi_stride_a = 0;
+ }
+
+ // Set gemm parameters
+ _gemm_kernel_asm->set_arrays(in0_ptr, lda, batch_stride_a, multi_stride_a, in1_ptr, ldb, multi_stride_b, out_ptr,
+ ldd, batch_stride_d, multi_stride_d, bias, 0);
+ // Schedule
+ NEScheduler::get().schedule(_optimised_kernel.get(), scheduling_hint);
+}
+
+template <typename TypeInput, typename TypeOutput>
+void create_arm_gemm(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> &arm_gemm,
+ const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ ITensorInfo *d,
+ arm_gemm::Activation activation,
+ const AsmGemmInfo &info)
+{
+ Params p = extract_parameters(a, b, d, info);
+ const CPUInfo &ci = NEScheduler::get().cpu_info();
+ unsigned int num_threads = NEScheduler::get().num_threads();
+
+ arm_gemm::GemmConfig cfg;
+ cfg.weight_format = assembly_utils::map_to_arm_gemm_weight_format(info.weight_format);
+ arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.sections, p.batches, p.multis, p.indirect, activation, num_threads,
+ info.fixed_format, info.fast_mode, info.accumulate, &cfg);
+
+ // Create arm_gemm fallback
+ auto fallback = std::make_unique<Fallback<TypeInput, TypeOutput>>();
+ fallback->configure(a, b, c, d, args, info);
+ arm_gemm = std::move(fallback);
+}
+
+template <typename TypeInput, typename TypeOutput>
+void create_arm_gemm_dequant(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> &arm_gemm,
+ const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ ITensorInfo *d,
+ arm_gemm::Activation activation,
+ const AsmGemmInfo &info)
+{
+ ARM_COMPUTE_UNUSED(activation);
+
+ Params p = extract_parameters(a, b, d, info);
+ const CPUInfo &ci = NEScheduler::get().cpu_info();
+ const unsigned int num_threads = NEScheduler::get().num_threads();
+
+ arm_gemm::GemmConfig cfg;
+ cfg.weight_format = assembly_utils::map_to_arm_gemm_weight_format(info.weight_format);
+ arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.sections, p.batches, p.multis, p.indirect, activation, num_threads,
+ info.fixed_format, info.fast_mode, info.accumulate, &cfg);
+
+ // Create arm_gemm fallback
+ auto fallback = std::make_unique<Fallback<TypeInput, TypeOutput, arm_gemm::DequantizeFloat>>();
+
+ // Configure requantization info
+ const GEMMLowpOutputStageInfo os_info = info.output_stage;
+
+ arm_gemm::DequantizeFloat gemm_dequant_info(d->quantization_info().uniform().scale);
+
+ fallback->configure(a, b, c, d, args, info, gemm_dequant_info);
+ arm_gemm = std::move(fallback);
+}
+
+template <typename TypeInput, typename TypeOutput>
+void create_arm_gemm_quant(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> &arm_gemm,
+ const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ ITensorInfo *d,
+ arm_gemm::Activation activation,
+ const AsmGemmInfo &info)
+{
+ ARM_COMPUTE_UNUSED(activation);
+ Params p = extract_parameters(a, b, d, info);
+ const CPUInfo &ci = NEScheduler::get().cpu_info();
+ const unsigned int num_threads = NEScheduler::get().num_threads();
+
+ arm_gemm::GemmConfig cfg;
+ cfg.weight_format = assembly_utils::map_to_arm_gemm_weight_format(info.weight_format);
+ arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.sections, p.batches, p.multis, p.indirect, activation, num_threads,
+ info.fixed_format, info.fast_mode, info.accumulate, &cfg);
+
+ // Create arm_gemm fallback
+ auto fallback = std::make_unique<Fallback<TypeInput, TypeOutput, arm_gemm::Requantize32>>();
+
+ // Configure requantization info
+ const int32_t negation = info.negated_offsets ? 1 : -1;
+ const int32_t a_offset = -a->quantization_info().uniform().offset * negation;
+ const int32_t b_offset = -b->quantization_info().uniform().offset * negation;
+ const GEMMLowpOutputStageInfo os_info = info.output_stage;
+
+ arm_gemm::Requantize32 gemm_requant_info{};
+ if (os_info.gemmlowp_shifts.size() > 1)
+ {
+ const auto requantize_data =
+ fallback->set_requantize_data(os_info.gemmlowp_shifts, os_info.gemmlowp_multipliers);
+ gemm_requant_info = arm_gemm::Requantize32(
+ nullptr, 0, a_offset, b_offset, os_info.gemmlowp_offset,
+ (std::get<0>(requantize_data)) ? std::get<1>(requantize_data) : nullptr, std::get<2>(requantize_data),
+ std::get<3>(requantize_data), os_info.gemmlowp_min_bound, os_info.gemmlowp_max_bound);
+ }
+ else
+ {
+ gemm_requant_info =
+ arm_gemm::Requantize32(nullptr, 0, a_offset, b_offset, os_info.gemmlowp_offset, -os_info.gemmlowp_shift,
+ os_info.gemmlowp_multiplier, os_info.gemmlowp_min_bound, os_info.gemmlowp_max_bound);
+ }
+
+ // Configure fallback
+ fallback->configure(a, b, c, d, args, info, gemm_requant_info);
+ arm_gemm = std::move(fallback);
+}
+} //namespace
+
+CpuGemmAssemblyDispatch::CpuGemmAssemblyDispatch() : _arm_gemm(nullptr)
+{
+}
+
+Status CpuGemmAssemblyDispatch::has_opt_impl(arm_compute::WeightFormat &expected_weight_format,
+ const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ const ITensorInfo *d,
+ const AsmGemmInfo &info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, d);
+ ARM_COMPUTE_UNUSED(c);
+ arm_gemm::Activation act = assembly_utils::map_to_arm_gemm_activation(info.activation_info);
+ Params p = extract_parameters(a, b, d, info);
+ const CPUInfo &ci = NEScheduler::get().cpu_info();
+ unsigned int num_threads = NEScheduler::get().num_threads();
+ arm_gemm::GemmConfig cfg;
+ cfg.weight_format = assembly_utils::map_to_arm_gemm_weight_format(info.weight_format);
+ arm_gemm::WeightFormat arm_gemm_expected_wf = assembly_utils::map_to_arm_gemm_weight_format(expected_weight_format);
+ arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.sections, p.batches, p.multis, p.indirect, act, num_threads,
+ info.fixed_format, info.fast_mode, info.accumulate, &cfg);
+ // TODO: Incorporate info.transpose_b COMPMID-6595
+ switch (a->data_type())
+ {
+ case DataType::F32:
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ !(arm_gemm::has_opt_gemm<float, float, arm_gemm::Nothing>(arm_gemm_expected_wf, args, {})),
+ "We could not find an optimized kernel for F32 input");
+ break;
+#ifdef __aarch64__
+ case DataType::U8:
+ case DataType::QASYMM8:
+ if (d->data_type() == DataType::S32)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ !(arm_gemm::has_opt_gemm<uint8_t, uint32_t, arm_gemm::Nothing>(arm_gemm_expected_wf, args, {})),
+ "We could not find an optimized kernel for U8/QASYMM8 input and U32 output");
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ !(arm_gemm::has_opt_gemm<uint8_t, uint8_t, arm_gemm::Requantize32>(arm_gemm_expected_wf, args, {})),
+ "We could not find an optimized kernel for U8 input and U8 output");
+ }
+ break;
+ case DataType::S8:
+ case DataType::QASYMM8_SIGNED:
+ if (d->data_type() == DataType::S32)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ !(arm_gemm::has_opt_gemm<int8_t, int32_t, arm_gemm::Nothing>(arm_gemm_expected_wf, args, {})),
+ "We could not find an optimized kernel for S8/QASYMM8_SIGNED input and S32 output");
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ !(arm_gemm::has_opt_gemm<int8_t, int8_t, arm_gemm::Requantize32>(arm_gemm_expected_wf, args, {})),
+ "We could not find an optimized kernel for S8 input and S8 output");
+ }
+ break;
+#endif /* __aarch64__ */
+
+#if defined(ARM_COMPUTE_ENABLE_BF16)
+ case DataType::BFLOAT16:
+ {
+ if (d->data_type() == DataType::BFLOAT16)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ !(arm_gemm::has_opt_gemm<bfloat16, bfloat16, arm_gemm::Nothing>(arm_gemm_expected_wf, args, {})),
+ "We could not find an optimized kernel for BFLOAT16 input and BFLOAT16 output");
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ !(arm_gemm::has_opt_gemm<bfloat16, float, arm_gemm::Nothing>(arm_gemm_expected_wf, args, {})),
+ "We could not find an optimized kernel for BFLOAT16 input and F32 output");
+ }
+ break;
+ }
+#endif /* defined(ARM_COMPUTE_ENABLE_BF16) */
+
+#if defined(ENABLE_FP16_KERNELS)
+ case DataType::F16:
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ !(arm_gemm::has_opt_gemm<float16_t, float16_t, arm_gemm::Nothing>(arm_gemm_expected_wf, args, {})),
+ "We could not find an optimized kernel for F16 input and F16 output");
+ break;
+#endif /* ENABLE_FP16_KERNELS */
+ default:
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(true, "Unsupported type. Could not find a kernel");
+ break;
+ }
+ expected_weight_format = assembly_utils::map_to_arm_compute_weight_format(arm_gemm_expected_wf);
+
+ return Status{};
+}
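+
+// Illustrative query sketch (a, b and d are assumed caller-side ITensorInfo pointers): pass
+// WeightFormat::ANY and let has_opt_impl() report the blocked layout preferred by the selected kernel.
+//
+//   arm_compute::WeightFormat wf = arm_compute::WeightFormat::ANY;
+//   AsmGemmInfo info{};
+//   info.fixed_format  = true;
+//   info.weight_format = arm_compute::WeightFormat::ANY;
+//   if (bool(CpuGemmAssemblyDispatch::has_opt_impl(wf, a, b, nullptr, d, info)))
+//   {
+//       // wf now holds the weight format expected by the chosen kernel
+//   }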
+
+Status CpuGemmAssemblyDispatch::validate(
+ const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d, const AsmGemmInfo &info)
+{
+ ARM_COMPUTE_UNUSED(c, info);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(a, b, d);
+ ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(a);
+ ARM_COMPUTE_RETURN_ERROR_ON_CPU_BF16_UNSUPPORTED(a);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(info.reshape_b_only_on_first_run),
+ "Assembly kernel will not be executed when reshape_b_only_on_first_run is false");
+
+#ifndef __aarch64__
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->element_size() == 1, "8bit integer types only supported for aarch64");
+#endif /* __aarch64__ */
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::U8, DataType::QASYMM8,
+ DataType::QASYMM8_SIGNED, DataType::S8, DataType::BFLOAT16,
+ DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(
+ b, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::S8,
+ DataType::BFLOAT16, DataType::F16, DataType::F32);
+ if (is_data_type_quantized_per_channel(b->data_type()))
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8_SIGNED, DataType::S8);
+ }
+ else if (is_fixed_format_fast_math(info.weight_format))
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(a, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(b, DataType::BFLOAT16);
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b);
+ }
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::F32 && d->data_type() != DataType::F32,
+ "Only F32 output supported for F32 input");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::F16 && d->data_type() != DataType::F16,
+ "Only F16 output supported for F16 input");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::BFLOAT16 &&
+ (d->data_type() != DataType::F32 && d->data_type() != DataType::BFLOAT16),
+ "Only F32/BFLOAT16 output supported for BFLOAT16 input");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::U8 && d->data_type() != DataType::U32,
+ "Only U32 output supported for U8 input");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::S8 && d->data_type() != DataType::S32,
+ "Only S32 output supported for S8 input");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::QASYMM8 &&
+ (d->data_type() != DataType::QASYMM8 && d->data_type() != DataType::S32),
+ "Only QASYMM8/S32 output supported for QASYMM8 input");
+ arm_compute::WeightFormat expected_weight_format = arm_compute::WeightFormat::UNSPECIFIED;
+ const Status ret = CpuGemmAssemblyDispatch::has_opt_impl(expected_weight_format, a, b, c, d, info);
+ if ((bool)ret && expected_weight_format != arm_compute::WeightFormat::ANY)
+ {
+ // Correctness check: if the format expected by the kernel is
+ // not "any", make sure that the one found matches the format
+ // intended by the caller.
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ (expected_weight_format != info.weight_format),
+ "The format expected by the kernel does not correspond with the one requested by the user.");
+ }
+ return ret;
+}
+
+bool CpuGemmAssemblyDispatch::is_activation_supported(const ActivationLayerInfo &activation)
+{
+ arm_gemm::Activation act = assembly_utils::map_to_arm_gemm_activation(activation);
+ return act.type != arm_gemm::Activation::Type::None;
+}
+
+void CpuGemmAssemblyDispatch::configure(
+ const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, const AsmGemmInfo &info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, d);
+ arm_gemm::Activation act = assembly_utils::map_to_arm_gemm_activation(info.activation_info);
+
+ // If we don't support the combination of data types, silently return: it is the caller's responsibility to check whether configure() was successful via is_configured()
+ if (!CpuGemmAssemblyDispatch::validate(a, b, c, d, info))
+ {
+ return;
+ }
+
+ switch (a->data_type())
+ {
+ case DataType::F32:
+ create_arm_gemm<float, float>(_arm_gemm, a, b, c, d, act, info);
+ break;
+#ifdef __aarch64__
+ case DataType::U8:
+ case DataType::QASYMM8:
+ if (d->data_type() == DataType::S32)
+ {
+ create_arm_gemm<uint8_t, uint32_t>(_arm_gemm, a, b, c, d, act, info);
+ }
+ else
+ {
+ create_arm_gemm_quant<uint8_t, uint8_t>(_arm_gemm, a, b, c, d, act, info);
+ }
+ break;
+ case DataType::S8:
+ case DataType::QASYMM8_SIGNED:
+ if (d->data_type() == DataType::S32)
+ {
+ create_arm_gemm<int8_t, int32_t>(_arm_gemm, a, b, c, d, act, info);
+ }
+ else if (d->data_type() == DataType::F32)
+ {
+ create_arm_gemm_dequant<int8_t, float>(_arm_gemm, a, b, c, d, act, info);
+ }
+ else
+ {
+ create_arm_gemm_quant<int8_t, int8_t>(_arm_gemm, a, b, c, d, act, info);
+ }
+ break;
+#endif /* __aarch64__ */
+#if defined(ARM_COMPUTE_ENABLE_BF16)
+ case DataType::BFLOAT16:
+ if (d->data_type() == DataType::BFLOAT16)
+ {
+ create_arm_gemm<bfloat16, bfloat16>(_arm_gemm, a, b, c, d, act, info);
+ }
+ else
+ {
+ create_arm_gemm<bfloat16, float>(_arm_gemm, a, b, c, d, act, info);
+ }
+ break;
+#endif /* defined(ARM_COMPUTE_ENABLE_BF16) */
+#ifdef ENABLE_FP16_KERNELS
+ case DataType::F16:
+ create_arm_gemm<float16_t, float16_t>(_arm_gemm, a, b, c, d, act, info);
+ break;
+#endif /* ENABLE_FP16_KERNELS */
+ default:
+ break;
+ }
+}
+
+void CpuGemmAssemblyDispatch::prepare(ITensorPack &tensors)
+{
+ ARM_COMPUTE_ERROR_ON(_arm_gemm == nullptr);
+ _arm_gemm->prepare(tensors);
+}
+
+bool CpuGemmAssemblyDispatch::is_configured() const
+{
+ return _arm_gemm && _arm_gemm->is_configured();
+}
+
+void CpuGemmAssemblyDispatch::run(ITensorPack &tensors)
+{
+ ARM_COMPUTE_ERROR_ON(_arm_gemm == nullptr);
+ _arm_gemm->run(tensors);
+}
+
+experimental::MemoryRequirements CpuGemmAssemblyDispatch::workspace() const
+{
+ ARM_COMPUTE_ERROR_ON(_arm_gemm == nullptr);
+ return _arm_gemm->workspace();
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/operators/internal/CpuGemmAssemblyDispatch.h b/src/cpu/operators/internal/CpuGemmAssemblyDispatch.h
new file mode 100644
index 0000000000..44c5c189a5
--- /dev/null
+++ b/src/cpu/operators/internal/CpuGemmAssemblyDispatch.h
@@ -0,0 +1,198 @@
+/*
+ * Copyright (c) 2018-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_CPU_OPERATORS_INTERNAL_CPUGEMMASSEMBLYDISPATCH_H
+#define ACL_SRC_CPU_OPERATORS_INTERNAL_CPUGEMMASSEMBLYDISPATCH_H
+
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+
+#include "src/core/common/Macros.h"
+#include "src/cpu/ICpuOperator.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+/* Convolution method supported by the assembly gemm interface */
+enum class AsmConvMethod
+{
+ Im2Col,
+ Indirect,
+ Conv
+};
+
+struct AsmGemmInfo
+{
+ AsmConvMethod method{AsmConvMethod::Im2Col};
+ PadStrideInfo ps_info{};
+ ActivationLayerInfo activation_info{};
+ GEMMLowpOutputStageInfo output_stage{};
+ bool negated_offsets{true};
+ bool reinterpret_input_as_3d{false};
+ bool depth_output_gemm3d{false};
+ int64_t padding_top{0};
+ int64_t padding_left{0};
+ float padding_value{0.f};
+ bool fast_mode{false};
+ bool fixed_format{false};
+ arm_compute::WeightFormat weight_format{arm_compute::WeightFormat::UNSPECIFIED};
+ bool reshape_b_only_on_first_run{true};
+ bool accumulate{false};
+    /** Whether to perform an additional transpose of b before passing it to gemm or pretranspose_B_array.
+     * @note This transpose of b is itself a form of "reshape"/"transform", so it must also be accounted for
+     * by the reshape_b_only_on_first_run flag.
+     * @note This flag is silently ignored (treated as false) when weight_format is a fixed format, because
+     * fixed-format kernels do not accept weights (B) with any prior transformation applied.
+     */
+ bool transpose_b{false};
+};
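A minimal sketch of populating AsmGemmInfo for a plain F32 GEMM with a fused ReLU; every value below is an illustrative assumption, not a required setting.

    // Sketch only, not part of this patch.
    arm_compute::cpu::AsmGemmInfo asm_info{};
    asm_info.activation_info = arm_compute::ActivationLayerInfo(
        arm_compute::ActivationLayerInfo::ActivationFunction::RELU);
    asm_info.fast_mode     = false; // no BF16 fast-math
    asm_info.fixed_format  = false; // let arm_gemm pick its own layout for B
    asm_info.weight_format = arm_compute::WeightFormat::UNSPECIFIED; // only meaningful when fixed_format is true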
+
+/** Assembly kernel glue */
+class CpuGemmAssemblyDispatch : public ICpuOperator
+{
+public:
+ /** Constructor */
+ CpuGemmAssemblyDispatch();
+    /** Default destructor */
+ ~CpuGemmAssemblyDispatch() = default;
+
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuGemmAssemblyDispatch);
+
+ class IFallback
+ {
+ public:
+ virtual void run(ITensorPack &tensors) = 0;
+ virtual void prepare(ITensorPack &tensors) = 0;
+ virtual experimental::MemoryRequirements workspace() const = 0;
+ virtual bool is_configured() const = 0;
+ virtual bool isVarWeightsKernel() const = 0;
+ virtual ~IFallback() = default;
+ };
+
+public:
+    /** If supported, create a Compute Library function, else fall back to the arm_gemm function.
+ *
+ * @note Configuring "batches"
+     * The shapes of @p a, @p b and @p d are arranged as follows:
+ * Lowest dimension <-> Highest dimension
+ * a: [K, M, Batch, Multi]
+ * b: [N, K, Multi]
+ * d: [N, M, Batch, Multi]
+ *
+ * The "Batch" refers to where "Batch" number of MxK slices of tensor a multiplies with a single KxN slice of b
+ * The "Multi" refers to where "Multi" number of individual multiplication of a with b
+ *
+ * E.g. the following are some example input shape configurations
+ *
+ * (1) Normal 2D gemm
+ * a: [K=3, M=4]
+ * b: [N=5, K=3]
+ * d: [N=5, M=4]
+ *
+     * (2) Batches of a sharing b (e.g. gemm-based batched convolution where b is the shared weights)
+ * a: [K=3, M=4, Batch=9]
+ * b: [N=5, K=3]
+ * d: [N=5, M=4, Batch=9]
+ *
+ * (3) "Batches" of independent gemm (e.g. batched matmul)
+ * a: [K=3, M=4, Batch=1, Multi=7]
+ * b: [N=5, K=3, Multi=7]
+ * d: [N=5, M=4, Batch=1, Multi=7]
+ *
+ * (4) "Batches" of independent gemm where b is also shared
+ * a: [K=3, M=4, Batch=4, Multi=7]
+ * b: [N=5, K=3, Multi=7]
+ * d: [N=5, M=4, Batch=4, Multi=7]
+ *
+     * @param[in]  a    Input tensor info (Matrix A)
+     * @param[in]  b    Input tensor info (Matrix B)
+     * @param[in]  c    Input tensor info (Matrix C) used to pass the bias for quantized calculations
+     * @param[out] d    Output tensor info to store the result of matrix multiplication. Data type supported: same as @p a.
+     * @param[in]  info GEMM meta-data
+ */
+ void configure(
+ const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, const AsmGemmInfo &info);
+
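To make the shape note above concrete, a sketch of the tensor infos for case (3), a batched matmul; the lowest dimension comes first, matching the [K, M, Batch, Multi] convention, and the variable names and the use of F32 are assumptions.

    // Sketch only, not part of this patch.
    arm_compute::TensorInfo a_info(arm_compute::TensorShape(3U, 4U, 1U, 7U), 1, arm_compute::DataType::F32); // a: [K=3, M=4, Batch=1, Multi=7]
    arm_compute::TensorInfo b_info(arm_compute::TensorShape(5U, 3U, 7U), 1, arm_compute::DataType::F32);     // b: [N=5, K=3, Multi=7]
    arm_compute::TensorInfo d_info(arm_compute::TensorShape(5U, 4U, 1U, 7U), 1, arm_compute::DataType::F32); // d: [N=5, M=4, Batch=1, Multi=7]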
+ /** Indicates whether or not this function can be used to process the given parameters.
+ *
+ * @param[in] a Input tensor info (Matrix A)
+ * @param[in] b Input tensor info (Matrix B)
+ * @param[in] c Input tensor info (Matrix C) used to pass the bias for quantized calculations
+     * @param[in] d    Output tensor info to store the result of matrix multiplication. Data type supported: same as @p a.
+ * @param[in] info GEMM meta-data
+ *
+ * @return a status.
+ */
+ static Status validate(const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ const ITensorInfo *d,
+ const AsmGemmInfo &info);
+
+ /** Indicates whether or not there is an optimal assembly implementation that can be used to process the given parameters.
+ *
+     * This method has the same use as @ref
+     * NEGEMMConvolutionLayer::has_opt_impl, with the only caveat that
+     * the value of arm_compute::WeightFormat needs to be passed via the
+     * parameter info.
+ *
+ * @return a status.
+ */
+ static Status has_opt_impl(arm_compute::WeightFormat &weight_format,
+ const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ const ITensorInfo *d,
+ const AsmGemmInfo &info);
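A sketch of how a caller might query the preferred fixed-format weight layout before laying out B; per the note above, the requested format travels in the info parameter (a_info, b_info, d_info and asm_info are assumed to exist, and WeightFormat::ANY asks the dispatcher to choose).

    // Sketch only, not part of this patch.
    asm_info.fixed_format  = true;
    asm_info.weight_format = arm_compute::WeightFormat::ANY;
    arm_compute::WeightFormat wf = arm_compute::WeightFormat::ANY;
    const arm_compute::Status st =
        arm_compute::cpu::CpuGemmAssemblyDispatch::has_opt_impl(wf, &a_info, &b_info, nullptr, &d_info, asm_info);
    if (bool(st) && arm_compute::is_fixed_format(wf))
    {
        // 'wf' now names the B layout the selected kernel expects;
        // re-lay out B accordingly and keep asm_info.weight_format = wf for configure().
    }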
+ /** Checks if activation is supported by the gemm assembly dispatcher
+ *
+ * @param[in] activation Activation to check
+ *
+ * @return True if activation is supported else false
+ */
+ static bool is_activation_supported(const ActivationLayerInfo &activation);
+    /** Was the function successfully configured?
+ *
+ * @return True if the function is configured and ready to run
+ */
+ bool is_configured() const;
+ /** Indicates if the convolution executes in variable weights mode.
+ *
+ * Similar to @ref CpuGemm::isVarWeightsKernel
+ */
+ bool isVarWeightsKernel() const
+ {
+ return _arm_gemm && _arm_gemm->isVarWeightsKernel();
+ }
+
+ // Inherited methods overridden:
+ void prepare(ITensorPack &tensors) override;
+ void run(ITensorPack &tensors) override;
+ experimental::MemoryRequirements workspace() const override;
+
+private:
+ std::unique_ptr<IFallback> _arm_gemm; /**< Interface for the arm_gemm fallback */
+};
+} // namespace cpu
+} // namespace arm_compute
+#endif // ACL_SRC_CPU_OPERATORS_INTERNAL_CPUGEMMASSEMBLYDISPATCH_H
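Finally, a sketch of the run-time side of the interface declared above: once a CpuGemmAssemblyDispatch instance (gemm, as in the earlier sketch) is configured, tensors are bound through an ITensorPack and prepare()/run() are called. The ITensor objects a, b and d and the allocation of any auxiliary workspace are assumed to be handled by the caller.

    // Sketch only, not part of this patch.
    arm_compute::ITensorPack pack{{arm_compute::ACL_SRC_0, &a},
                                  {arm_compute::ACL_SRC_1, &b},
                                  {arm_compute::ACL_DST, &d}};
    const arm_compute::experimental::MemoryRequirements aux = gemm.workspace();
    (void)aux;          // workspace allocation is elided in this sketch
    gemm.prepare(pack); // one-off reshape/pretranspose of B
    gemm.run(pack);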