Diffstat (limited to 'src/runtime')
422 files changed, 21610 insertions, 26635 deletions
diff --git a/src/runtime/Allocator.cpp b/src/runtime/Allocator.cpp
index bf219514fc..eca712dbf0 100644
--- a/src/runtime/Allocator.cpp
+++ b/src/runtime/Allocator.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -22,10 +22,9 @@
 * SOFTWARE.
 */
 #include "arm_compute/runtime/Allocator.h"
-#include "arm_compute/runtime/MemoryRegion.h"
 #include "arm_compute/core/Error.h"
-#include "support/MemorySupport.h"
+#include "arm_compute/runtime/MemoryRegion.h"

 #include <cstddef>

@@ -44,5 +43,5 @@ void Allocator::free(void *ptr)
 std::unique_ptr<IMemoryRegion> Allocator::make_region(size_t size, size_t alignment)
 {
-    return arm_compute::support::cpp14::make_unique<MemoryRegion>(size, alignment);
+    return std::make_unique<MemoryRegion>(size, alignment);
 }
diff --git a/src/runtime/BlobLifetimeManager.cpp b/src/runtime/BlobLifetimeManager.cpp
index 6bbb731d5d..8a0fc05c39 100644
--- a/src/runtime/BlobLifetimeManager.cpp
+++ b/src/runtime/BlobLifetimeManager.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2022 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -27,16 +27,15 @@
 #include "arm_compute/runtime/BlobMemoryPool.h"
 #include "arm_compute/runtime/IAllocator.h"
 #include "arm_compute/runtime/IMemoryGroup.h"
-#include "support/MemorySupport.h"

 #include <algorithm>
 #include <cmath>
+#include <iterator>
 #include <map>

 namespace arm_compute
 {
-BlobLifetimeManager::BlobLifetimeManager()
-    : _blobs()
+BlobLifetimeManager::BlobLifetimeManager() : _blobs()
 {
 }

@@ -48,7 +47,7 @@ const BlobLifetimeManager::info_type &BlobLifetimeManager::info() const
 std::unique_ptr<IMemoryPool> BlobLifetimeManager::create_pool(IAllocator *allocator)
 {
     ARM_COMPUTE_ERROR_ON(allocator == nullptr);
-    return support::cpp14::make_unique<BlobMemoryPool>(allocator, _blobs);
+    return std::make_unique<BlobMemoryPool>(allocator, _blobs);
 }

 MappingType BlobLifetimeManager::mapping_type() const
@@ -62,33 +61,32 @@ void BlobLifetimeManager::update_blobs_and_mappings()
     ARM_COMPUTE_ERROR_ON(_active_group == nullptr);

     // Sort free blobs requirements in descending order.
-    _free_blobs.sort([](const Blob & ba, const Blob & bb)
-    {
-        return ba.max_size > bb.max_size;
-    });
+    _free_blobs.sort([](const Blob &ba, const Blob &bb) { return ba.max_size > bb.max_size; });

     // Create group sizes vector
     std::vector<BlobInfo> group_sizes;
-    std::transform(std::begin(_free_blobs), std::end(_free_blobs), std::back_inserter(group_sizes), [](const Blob & b)
-    {
-        return BlobInfo{ b.max_size, b.max_alignment, b.bound_elements.size() };
-    });
+    std::transform(std::begin(_free_blobs), std::end(_free_blobs), std::back_inserter(group_sizes),
+                   [](const Blob &b) {
+                       return BlobInfo{b.max_size, b.max_alignment, b.bound_elements.size()};
+                   });

     // Update blob sizes
     size_t max_size = std::max(_blobs.size(), group_sizes.size());
     _blobs.resize(max_size);
     group_sizes.resize(max_size);
-    std::transform(std::begin(_blobs), std::end(_blobs), std::begin(group_sizes), std::begin(_blobs), [](BlobInfo lhs, BlobInfo rhs)
-    {
-        return BlobInfo{ std::max(lhs.size, rhs.size), std::max(lhs.alignment, rhs.alignment), std::max(lhs.owners, rhs.owners) };
-    });
+    std::transform(std::begin(_blobs), std::end(_blobs), std::begin(group_sizes), std::begin(_blobs),
+                   [](BlobInfo lhs, BlobInfo rhs)
+                   {
+                       return BlobInfo{std::max(lhs.size, rhs.size), std::max(lhs.alignment, rhs.alignment),
+                                       std::max(lhs.owners, rhs.owners)};
+                   });

     // Calculate group mappings
     auto &group_mappings = _active_group->mappings();
     int   blob_idx       = 0;

-    for(auto &free_blob : _free_blobs)
+    for (auto &free_blob : _free_blobs)
     {
-        for(auto &bound_element_id : free_blob.bound_elements)
+        for (auto &bound_element_id : free_blob.bound_elements)
         {
             ARM_COMPUTE_ERROR_ON(_active_elements.find(bound_element_id) == std::end(_active_elements));
             Element &bound_element = _active_elements[bound_element_id];
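Note: the change that recurs across almost every file in this commit is the swap from the library's pre-C++14 shim, support::cpp14::make_unique, to std::make_unique, now that the codebase builds as C++14. A minimal sketch of the before/after pattern; RegionExample is an illustrative stand-in, not a library type:

    #include <cstddef>
    #include <memory>

    struct RegionExample
    {
        RegionExample(std::size_t /*size*/, std::size_t /*alignment*/) {}
    };

    std::unique_ptr<RegionExample> make_region(std::size_t size, std::size_t alignment)
    {
        // Before: arm_compute::support::cpp14::make_unique<RegionExample>(size, alignment)
        // After:  the standard facility, so the custom shim header can be dropped.
        return std::make_unique<RegionExample>(size, alignment);
    }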
diff --git a/src/runtime/BlobMemoryPool.cpp b/src/runtime/BlobMemoryPool.cpp
index 907b39f9c6..a2f63ef52b 100644
--- a/src/runtime/BlobMemoryPool.cpp
+++ b/src/runtime/BlobMemoryPool.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -27,7 +27,6 @@
 #include "arm_compute/runtime/IAllocator.h"
 #include "arm_compute/runtime/IMemoryPool.h"
 #include "arm_compute/runtime/Types.h"
-#include "support/MemorySupport.h"

 #include <vector>

@@ -42,14 +41,13 @@ BlobMemoryPool::BlobMemoryPool(IAllocator *allocator, std::vector<BlobInfo> blob
 BlobMemoryPool::~BlobMemoryPool()
 {
-    ARM_COMPUTE_ERROR_ON(!_allocator);
     free_blobs();
 }

 void BlobMemoryPool::acquire(MemoryMappings &handles)
 {
     // Set memory to handlers
-    for(auto &handle : handles)
+    for (auto &handle : handles)
     {
         ARM_COMPUTE_ERROR_ON(handle.first == nullptr);
         handle.first->set_region(_blobs[handle.second].get());
@@ -58,7 +56,7 @@ void BlobMemoryPool::acquire(MemoryMappings &handles)
 void BlobMemoryPool::release(MemoryMappings &handles)
 {
-    for(auto &handle : handles)
+    for (auto &handle : handles)
     {
         ARM_COMPUTE_ERROR_ON(handle.first == nullptr);
         handle.first->set_region(nullptr);
@@ -73,14 +71,14 @@ MappingType BlobMemoryPool::mapping_type() const
 std::unique_ptr<IMemoryPool> BlobMemoryPool::duplicate()
 {
     ARM_COMPUTE_ERROR_ON(!_allocator);
-    return support::cpp14::make_unique<BlobMemoryPool>(_allocator, _blob_info);
+    return std::make_unique<BlobMemoryPool>(_allocator, _blob_info);
 }

 void BlobMemoryPool::allocate_blobs(const std::vector<BlobInfo> &blob_info)
 {
     ARM_COMPUTE_ERROR_ON(!_allocator);
-    for(const auto &bi : blob_info)
+    for (const auto &bi : blob_info)
     {
         _blobs.push_back(_allocator->make_region(bi.size, bi.alignment));
     }
diff --git a/src/runtime/CL/CLBufferAllocator.cpp b/src/runtime/CL/CLBufferAllocator.cpp
index f50d10034c..b4545b93bf 100644
--- a/src/runtime/CL/CLBufferAllocator.cpp
+++ b/src/runtime/CL/CLBufferAllocator.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -23,34 +23,20 @@
 */
 #include "arm_compute/runtime/CL/CLBufferAllocator.h"

-#include "arm_compute/core/CL/CLCoreRuntimeContext.h"
 #include "arm_compute/core/CL/OpenCL.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/runtime/CL/CLMemoryRegion.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
-#include "support/MemorySupport.h"

 #include <cstddef>

 namespace arm_compute
 {
-CLBufferAllocator::CLBufferAllocator(CLCoreRuntimeContext *ctx)
-    : _ctx(ctx)
-{
-}
-
 void *CLBufferAllocator::allocate(size_t size, size_t alignment)
 {
     ARM_COMPUTE_UNUSED(alignment);
-    cl_mem buf;
-    if(_ctx == nullptr)
-    {
-        buf = clCreateBuffer(CLScheduler::get().context().get(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, size, nullptr, nullptr);
-    }
-    else
-    {
-        buf = clCreateBuffer(_ctx->context().get(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, size, nullptr, nullptr);
-    }
+    cl_mem buf{clCreateBuffer(CLScheduler::get().context().get(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, size,
+                              nullptr, nullptr)};
     return static_cast<void *>(buf);
 }

@@ -63,6 +49,6 @@ void CLBufferAllocator::free(void *ptr)
 std::unique_ptr<IMemoryRegion> CLBufferAllocator::make_region(size_t size, size_t alignment)
 {
     ARM_COMPUTE_UNUSED(alignment);
-    return arm_compute::support::cpp14::make_unique<CLBufferMemoryRegion>(_ctx, CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, size);
+    return std::make_unique<CLBufferMemoryRegion>(CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, size);
 }
 } // namespace arm_compute
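Note: with CLCoreRuntimeContext removed, CLBufferAllocator no longer branches on a per-context queue and always allocates from the global CLScheduler context. A hedged sketch of the same allocation call, plus the error check the runtime itself omits (it passes nullptr for the error pointer); the wrapper header is the one the library includes above:

    #include "arm_compute/core/CL/OpenCL.h" // library's OpenCL wrapper, assumed available
    #include <cstddef>

    void *allocate_host_visible(const cl::Context &context, size_t size)
    {
        cl_int err = CL_SUCCESS;
        cl_mem buf = clCreateBuffer(context.get(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE,
                                    size, nullptr, &err);
        return (err == CL_SUCCESS) ? static_cast<void *>(buf) : nullptr;
    }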
diff --git a/src/runtime/CL/CLDistribution1D.cpp b/src/runtime/CL/CLDistribution1D.cpp
deleted file mode 100644
index f1dd95e77e..0000000000
--- a/src/runtime/CL/CLDistribution1D.cpp
+++ /dev/null
@@ -1,61 +0,0 @@
-/*
- * Copyright (c) 2016, 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/CLDistribution1D.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-
-using namespace arm_compute;
-
-CLDistribution1D::CLDistribution1D(size_t num_bins, int32_t offset, uint32_t range)
-    : ICLDistribution1D(num_bins, offset, range), _mem(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, num_bins * sizeof(int32_t))
-{
-}
-
-void CLDistribution1D::map(bool blocking)
-{
-    ICLDistribution1D::map(CLScheduler::get().queue(), blocking);
-}
-
-void CLDistribution1D::unmap()
-{
-    ICLDistribution1D::unmap(CLScheduler::get().queue());
-}
-
-uint32_t *CLDistribution1D::do_map(cl::CommandQueue &q, bool blocking)
-{
-    ARM_COMPUTE_ERROR_ON(_mem.get() == nullptr);
-    return static_cast<uint32_t *>(q.enqueueMapBuffer(_mem, blocking ? CL_TRUE : CL_FALSE, CL_MAP_READ | CL_MAP_WRITE, 0, size()));
-}
-
-void CLDistribution1D::do_unmap(cl::CommandQueue &q)
-{
-    ARM_COMPUTE_ERROR_ON(_mem.get() == nullptr);
-    q.enqueueUnmapMemObject(_mem, _mapping);
-}
-
-cl::Buffer &CLDistribution1D::cl_buffer()
-{
-    return _mem;
-}
diff --git a/src/runtime/DeviceProperties.cpp b/src/runtime/CL/CLGEMMHeuristicsHandle.cpp
index e88aa7124c..d680dc08bb 100644
--- a/src/runtime/DeviceProperties.cpp
+++ b/src/runtime/CL/CLGEMMHeuristicsHandle.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2021 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -21,14 +21,24 @@
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
-#include "arm_compute/runtime/DeviceProperties.h"
+#include "arm_compute/runtime/CL/CLGEMMHeuristicsHandle.h"

-#include "arm_compute/runtime/CPUUtils.h"
+#include "src/runtime/CL/mlgo/MLGOHeuristics.h"

 namespace arm_compute
 {
-DeviceProperties::DeviceProperties()
+CLGEMMHeuristicsHandle::CLGEMMHeuristicsHandle() : _heuristics(std::make_unique<mlgo::MLGOHeuristics>())
 {
-    get_cpu_configuration(cpu_info);
 }
+CLGEMMHeuristicsHandle::~CLGEMMHeuristicsHandle() = default;
+
+bool CLGEMMHeuristicsHandle::reload_from_file(const std::string &filename)
+{
+    return _heuristics->reload_from_file(filename);
+}
+const mlgo::MLGOHeuristics *CLGEMMHeuristicsHandle::get() const
+{
+    return _heuristics.get();
+}
+
 } // namespace arm_compute
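Note: CLGEMMHeuristicsHandle is a pimpl-style wrapper; the public header only needs a forward declaration of mlgo::MLGOHeuristics, keeping the MLGO machinery out of the installed API. The destructor is defaulted in the .cpp (as above) because unique_ptr's deleter must see the complete type there. A sketch of the header side of the pattern, with illustrative names:

    #include <memory>
    #include <string>

    namespace mlgo { class MLGOHeuristics; } // forward declaration only in the header

    class HeuristicsHandleSketch
    {
    public:
        HeuristicsHandleSketch();
        ~HeuristicsHandleSketch(); // defined in the .cpp, where MLGOHeuristics is complete
        bool reload_from_file(const std::string &filename);

    private:
        std::unique_ptr<mlgo::MLGOHeuristics> _heuristics;
    };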
diff --git a/src/runtime/CL/CLHOG.cpp b/src/runtime/CL/CLHOG.cpp
deleted file mode 100644
index c4ea6398e5..0000000000
--- a/src/runtime/CL/CLHOG.cpp
+++ /dev/null
@@ -1,84 +0,0 @@
-/*
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/runtime/CL/CLHOG.h"
-
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-
-using namespace arm_compute;
-
-CLHOG::CLHOG()
-    : _info(), _buffer()
-{
-}
-
-void CLHOG::init(const HOGInfo &input)
-{
-    ARM_COMPUTE_ERROR_ON(_buffer.get() != nullptr);
-    _info   = input;
-    _buffer = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, info()->descriptor_size() * sizeof(float));
-}
-
-void CLHOG::free()
-{
-    ARM_COMPUTE_ERROR_ON(_buffer.get() == nullptr);
-
-    _buffer = cl::Buffer();
-}
-
-const HOGInfo *CLHOG::info() const
-{
-    return &_info;
-}
-
-const cl::Buffer &CLHOG::cl_buffer() const
-{
-    return _buffer;
-}
-
-void CLHOG::map(bool blocking)
-{
-    ARM_COMPUTE_ERROR_ON(descriptor() != nullptr);
-    ICLHOG::map(CLScheduler::get().queue(), blocking);
-}
-
-void CLHOG::unmap()
-{
-    ARM_COMPUTE_ERROR_ON(descriptor() == nullptr);
-    ICLHOG::unmap(CLScheduler::get().queue());
-}
-
-uint8_t *CLHOG::do_map(cl::CommandQueue &q, bool blocking)
-{
-    ARM_COMPUTE_ERROR_ON(_buffer.get() == nullptr);
-    return static_cast<uint8_t *>(q.enqueueMapBuffer(_buffer, blocking ? CL_TRUE : CL_FALSE, CL_MAP_READ | CL_MAP_WRITE, 0, info()->descriptor_size() * sizeof(float)));
-}
-
-void CLHOG::do_unmap(cl::CommandQueue &q)
-{
-    ARM_COMPUTE_ERROR_ON(_buffer.get() == nullptr);
-    q.enqueueUnmapMemObject(_buffer, descriptor());
-}
diff --git a/src/runtime/CL/CLHelpers.cpp b/src/runtime/CL/CLHelpers.cpp
index c4c7ee2107..eb28ecbf8d 100644
--- a/src/runtime/CL/CLHelpers.cpp
+++ b/src/runtime/CL/CLHelpers.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -26,6 +26,7 @@
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/Error.h"
+#include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/CL/CLRuntimeContext.h"

 namespace
@@ -49,34 +50,30 @@ void printf_callback(const char *buffer, unsigned int len, size_t complete, void
 * @return A pointer to the context properties which can be used to create an opencl context
 */
-void initialise_context_properties(const cl::Platform &platform, const cl::Device &device, std::array<cl_context_properties, 7> &prop)
+void initialise_context_properties(const cl::Platform &platform,
+                                   const cl::Device &device,
+                                   std::array<cl_context_properties, 7> &prop)
 {
     ARM_COMPUTE_UNUSED(device);
 #if defined(ARM_COMPUTE_ASSERTS_ENABLED)
     // Query devices in the context for cl_arm_printf support
-    if(arm_compute::device_supports_extension(device, "cl_arm_printf"))
+    if (arm_compute::device_supports_extension(device, "cl_arm_printf"))
     {
         // Create a cl_context with a printf_callback and user specified buffer size.
-        std::array<cl_context_properties, 7> properties_printf =
-        {
+        std::array<cl_context_properties, 7> properties_printf = {
             CL_CONTEXT_PLATFORM, reinterpret_cast<cl_context_properties>(platform()),
             // Enable a printf callback function for this context.
             CL_PRINTF_CALLBACK_ARM, reinterpret_cast<cl_context_properties>(printf_callback),
             // Request a minimum printf buffer size of 4MB for devices in the
             // context that support this extension.
-            CL_PRINTF_BUFFERSIZE_ARM, 0x1000,
-            0
-        };
+            CL_PRINTF_BUFFERSIZE_ARM, 0x1000, 0};
         prop = properties_printf;
     }
     else
 #endif // defined(ARM_COMPUTE_ASSERTS_ENABLED)
     {
-        std::array<cl_context_properties, 3> properties =
-        {
-            CL_CONTEXT_PLATFORM, reinterpret_cast<cl_context_properties>(platform()),
-            0
-        };
+        std::array<cl_context_properties, 3> properties = {CL_CONTEXT_PLATFORM,
+                                                           reinterpret_cast<cl_context_properties>(platform()), 0};
         std::copy(properties.begin(), properties.end(), prop.begin());
     };
 }
@@ -84,21 +81,54 @@
 namespace arm_compute
 {
-std::tuple<cl::Context, cl::Device, cl_int>
-create_opencl_context_and_device()
+cl::Platform select_preferable_platform(CLBackendType cl_backend_type)
 {
-    ARM_COMPUTE_ERROR_ON(!opencl_is_available());
     std::vector<cl::Platform> platforms;
     cl::Platform::get(&platforms);
     ARM_COMPUTE_ERROR_ON_MSG(platforms.size() == 0, "Couldn't find any OpenCL platform");
-    cl::Platform p = platforms[0];
+
+    cl::Platform selected_platform{nullptr};
+
+    // If the user has selected the Native platform, return the first available.
+    switch (cl_backend_type)
+    {
+        case CLBackendType::Native:
+            selected_platform = platforms[0];
+            break;
+        case CLBackendType::Clvk:
+            for (auto p : platforms)
+            {
+                std::string res = p.getInfo<CL_PLATFORM_NAME>();
+                if (res.find("clvk") != std::string::npos)
+                {
+                    selected_platform = p;
+                    break;
+                }
+            }
+            break;
+        default:
+            ARM_COMPUTE_ERROR("Unsupported backend type");
+    }
+
+    if (!selected_platform())
+    {
+        ARM_COMPUTE_ERROR("No valid platform found");
+    }
+
+    return selected_platform;
+}
+
+std::tuple<cl::Context, cl::Device, cl_int> create_opencl_context_and_device(CLBackendType cl_backend_type)
+{
+    ARM_COMPUTE_ERROR_ON(!opencl_is_available());
+    cl::Platform p = select_preferable_platform(cl_backend_type);
     cl::Device device;
     std::vector<cl::Device> platform_devices;
     p.getDevices(CL_DEVICE_TYPE_DEFAULT, &platform_devices);
     ARM_COMPUTE_ERROR_ON_MSG(platform_devices.size() == 0, "Couldn't find any OpenCL device");
-    device     = platform_devices[0];
-    cl_int err = CL_SUCCESS;
-    std::array<cl_context_properties, 7> properties = { 0, 0, 0, 0, 0, 0, 0 };
+    device                                          = platform_devices[0];
+    cl_int err                                      = CL_SUCCESS;
+    std::array<cl_context_properties, 7> properties = {0, 0, 0, 0, 0, 0, 0};
     initialise_context_properties(p, device, properties);
     cl::Context cl_context = cl::Context(device, properties.data(), nullptr, nullptr, &err);
     ARM_COMPUTE_ERROR_ON_MSG(err != CL_SUCCESS, "Failed to create OpenCL context");
@@ -108,7 +138,7 @@ create_opencl_context_and_device()
 void schedule_kernel_on_ctx(CLRuntimeContext *ctx, ICLKernel *kernel, bool flush)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(kernel);
-    if(ctx)
+    if (ctx)
     {
         ARM_COMPUTE_ERROR_ON(ctx->gpu_scheduler() == nullptr);
         ctx->gpu_scheduler()->enqueue(*kernel, flush);
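Note: select_preferable_platform makes layered OpenCL implementations selectable: CLBackendType::Native takes the first reported platform, while CLBackendType::Clvk scans CL_PLATFORM_NAME for "clvk". A hedged usage sketch of the new entry point; the default argument values are not visible in this hunk, so everything is passed explicitly:

    #include "arm_compute/runtime/CL/CLScheduler.h"

    void init_over_clvk()
    {
        // Errors out if no platform whose name contains "clvk" is present.
        arm_compute::CLScheduler::get().default_init(/*cl_tuner=*/nullptr,
                                                     /*gemm_h=*/nullptr,
                                                     arm_compute::CLBackendType::Clvk);
    }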
diff --git a/src/runtime/CL/CLLut.cpp b/src/runtime/CL/CLLut.cpp
deleted file mode 100644
index a8cbf2131f..0000000000
--- a/src/runtime/CL/CLLut.cpp
+++ /dev/null
@@ -1,99 +0,0 @@
-/*
- * Copyright (c) 2016, 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/CLLut.h"
-
-#include "arm_compute/runtime/CL/CLScheduler.h"
-
-#include <cstring>
-
-using namespace arm_compute;
-
-CLLut::CLLut()
-    : _allocator()
-{
-}
-
-CLLut::CLLut(size_t num_elements, DataType data_type)
-    : _allocator()
-{
-    _allocator.init(num_elements, data_type);
-}
-
-size_t CLLut::num_elements() const
-{
-    return _allocator.num_elements();
-}
-
-uint32_t CLLut::index_offset() const
-{
-    return (DataType::S16 == _allocator.type()) ? num_elements() / 2 : 0;
-}
-
-size_t CLLut::size_in_bytes() const
-{
-    return _allocator.size();
-}
-
-DataType CLLut::type() const
-{
-    return _allocator.type();
-}
-
-const cl::Buffer &CLLut::cl_buffer() const
-{
-    return _allocator.cl_data();
-}
-
-void CLLut::clear()
-{
-    cl::CommandQueue &q    = CLScheduler::get().queue();
-    uint8_t          *data = _allocator.map(q, true /* blocking */);
-    std::memset(data, 0, size_in_bytes());
-    _allocator.unmap(q, data);
-}
-
-ILutAllocator *CLLut::allocator()
-{
-    return &_allocator;
-}
-
-void CLLut::map(bool blocking)
-{
-    ICLLut::map(CLScheduler::get().queue(), blocking);
-}
-
-void CLLut::unmap()
-{
-    ICLLut::unmap(CLScheduler::get().queue());
-}
-
-uint8_t *CLLut::do_map(cl::CommandQueue &q, bool blocking)
-{
-    return _allocator.map(q, blocking);
-}
-
-void CLLut::do_unmap(cl::CommandQueue &q)
-{
-    _allocator.unmap(q, buffer());
-}
diff --git a/src/runtime/CL/CLLutAllocator.cpp b/src/runtime/CL/CLLutAllocator.cpp
deleted file mode 100644
index 311de4bb8d..0000000000
--- a/src/runtime/CL/CLLutAllocator.cpp
+++ /dev/null
@@ -1,77 +0,0 @@
-/*
- * Copyright (c) 2016, 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/CLLutAllocator.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-
-using namespace arm_compute;
-
-CLLutAllocator::CLLutAllocator()
-    : _buffer(), _mapping(nullptr)
-{
-}
-
-uint8_t *CLLutAllocator::data()
-{
-    return _mapping;
-}
-
-const cl::Buffer &CLLutAllocator::cl_data() const
-{
-    return _buffer;
-}
-
-uint8_t *CLLutAllocator::map(cl::CommandQueue &q, bool blocking)
-{
-    ARM_COMPUTE_ERROR_ON(_buffer.get() == nullptr);
-    return static_cast<uint8_t *>(q.enqueueMapBuffer(_buffer, blocking ? CL_TRUE : CL_FALSE, CL_MAP_READ | CL_MAP_WRITE, 0, size()));
-}
-
-void CLLutAllocator::unmap(cl::CommandQueue &q, uint8_t *mapping)
-{
-    ARM_COMPUTE_ERROR_ON(_buffer.get() == nullptr);
-    q.enqueueUnmapMemObject(_buffer, mapping);
-}
-
-void CLLutAllocator::allocate()
-{
-    _buffer = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, size());
-}
-
-uint8_t *CLLutAllocator::lock()
-{
-    ARM_COMPUTE_ERROR_ON(_mapping != nullptr);
-    cl::CommandQueue q = CLScheduler::get().queue();
-    _mapping           = map(q, true);
-    return _mapping;
-}
-
-void CLLutAllocator::unlock()
-{
-    ARM_COMPUTE_ERROR_ON(_mapping == nullptr);
-    cl::CommandQueue q = CLScheduler::get().queue();
-    unmap(q, _mapping);
-    _mapping = nullptr;
-}
diff --git a/src/runtime/CL/CLMemory.cpp b/src/runtime/CL/CLMemory.cpp
index 557378b6f1..c6ee6fde83 100644
--- a/src/runtime/CL/CLMemory.cpp
+++ b/src/runtime/CL/CLMemory.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -24,24 +24,22 @@
 #include "arm_compute/runtime/CL/CLMemory.h"

 #include "arm_compute/core/Error.h"
-#include "arm_compute/core/utils/misc/Cast.h"
+
+#include "support/Cast.h"

 namespace arm_compute
 {
-CLMemory::CLMemory()
-    : _region(nullptr), _region_owned(nullptr)
+CLMemory::CLMemory() : _region(nullptr), _region_owned(nullptr)
 {
 }

-CLMemory::CLMemory(const std::shared_ptr<ICLMemoryRegion> &memory)
-    : _region(nullptr), _region_owned(memory)
+CLMemory::CLMemory(const std::shared_ptr<ICLMemoryRegion> &memory) : _region(nullptr), _region_owned(memory)
 {
     _region_owned = memory;
     _region       = _region_owned.get();
 }

-CLMemory::CLMemory(ICLMemoryRegion *memory)
-    : _region(memory), _region_owned(nullptr)
+CLMemory::CLMemory(ICLMemoryRegion *memory) : _region(memory), _region_owned(nullptr)
 {
     _region = memory;
 }
@@ -78,4 +76,4 @@ void CLMemory::set_owned_region(std::unique_ptr<IMemoryRegion> region)
     _region_owned = utils::cast::polymorphic_downcast_unique_ptr<ICLMemoryRegion>(std::move(region));
     _region       = _region_owned.get();
 }
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/runtime/CL/CLMemoryRegion.cpp b/src/runtime/CL/CLMemoryRegion.cpp
index 7ae16ec6fc..c9ddf9b85c 100644
--- a/src/runtime/CL/CLMemoryRegion.cpp
+++ b/src/runtime/CL/CLMemoryRegion.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -23,18 +23,15 @@
 */
 #include "arm_compute/runtime/CL/CLMemoryRegion.h"

-#include "arm_compute/core/CL/CLCoreRuntimeContext.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"

+#include "src/common/utils/Log.h"
+
 namespace arm_compute
 {
-ICLMemoryRegion::ICLMemoryRegion(CLCoreRuntimeContext *ctx, size_t size)
-    : IMemoryRegion(size),
-      _queue((ctx != nullptr) ? ctx->queue() : CLScheduler::get().queue()),
-      _ctx((ctx != nullptr) ? ctx->context() : CLScheduler::get().context()),
-      _mapping(nullptr),
-      _mem()
+ICLMemoryRegion::ICLMemoryRegion(size_t size)
+    : IMemoryRegion(size), _ctx(CLScheduler::get().context()), _mapping(nullptr), _mem()
 {
 }

@@ -59,21 +56,34 @@ std::unique_ptr<IMemoryRegion> ICLMemoryRegion::extract_subregion(size_t offset,
     return nullptr;
 }

-CLBufferMemoryRegion::CLBufferMemoryRegion(CLCoreRuntimeContext *ctx, cl_mem_flags flags, size_t size)
-    : ICLMemoryRegion(ctx, size)
+CLBufferMemoryRegion::CLBufferMemoryRegion(cl_mem_flags flags, size_t size) : ICLMemoryRegion(size)
 {
-    if(_size != 0)
+    if (_size != 0)
     {
-        _mem = cl::Buffer((ctx != nullptr) ? ctx->context() : CLScheduler::get().context(), flags, _size);
+        _mem = cl::Buffer(CLScheduler::get().context(), flags, _size);
     }
 }

-CLBufferMemoryRegion::CLBufferMemoryRegion(const cl::Buffer &buffer, CLCoreRuntimeContext *ctx)
-    : ICLMemoryRegion(ctx, buffer.getInfo<CL_MEM_SIZE>())
+CLBufferMemoryRegion::CLBufferMemoryRegion(const cl::Buffer &buffer) : ICLMemoryRegion(buffer.getInfo<CL_MEM_SIZE>())
 {
     _mem = buffer;
 }

+CLBufferMemoryRegion::~CLBufferMemoryRegion()
+{
+    // Flush the command queue to ensure all commands that may use this memory buffer are scheduled to be finished before
+    // this buffer is freed
+    // Do not call finish as it is a blocking call which affects the performance
+    try
+    {
+        CLScheduler::get().queue().flush();
+    }
+    catch (const std::exception &e)
+    {
+        ARM_COMPUTE_LOG_ERROR_ACL(e.what());
+    }
+}
+
 void *CLBufferMemoryRegion::ptr()
 {
     return nullptr;
@@ -93,30 +103,33 @@ void CLBufferMemoryRegion::unmap(cl::CommandQueue &q)
     _mapping = nullptr;
 }

-ICLSVMMemoryRegion::ICLSVMMemoryRegion(CLCoreRuntimeContext *ctx, cl_mem_flags flags, size_t size, size_t alignment)
-    : ICLMemoryRegion(ctx, size), _ptr(nullptr)
+ICLSVMMemoryRegion::ICLSVMMemoryRegion(cl_mem_flags flags, size_t size, size_t alignment)
+    : ICLMemoryRegion(size), _ptr(nullptr)
 {
-    if(size != 0)
+    if (size != 0)
     {
-        _ptr = clSVMAlloc((ctx != nullptr) ? ctx->context().get() : CLScheduler::get().context().get(), flags, size, alignment);
-        if(_ptr != nullptr)
+        _ptr = clSVMAlloc(CLScheduler::get().context().get(), flags, size, alignment);
+        if (_ptr != nullptr)
         {
-            _mem = cl::Buffer((ctx != nullptr) ? ctx->context() : CLScheduler::get().context(), CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, _size, _ptr);
+            _mem = cl::Buffer(CLScheduler::get().context(), CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, _size, _ptr);
         }
     }
 }

 ICLSVMMemoryRegion::~ICLSVMMemoryRegion()
 {
-    if(_ptr != nullptr)
+    if (_ptr != nullptr)
     {
         try
         {
-            clFinish(_queue.get());
+            // Can only use the blocking finish instead of the non-blocking flush here, because clSVMFree requires all
+            // commands that may use the svm pointer to finish beforehand
+            // https://registry.khronos.org/OpenCL/sdk/3.0/docs/man/html/clSVMFree.html
+            clFinish(CLScheduler::get().queue().get());
             _mem = cl::Buffer();
             clSVMFree(_ctx.get(), _ptr);
         }
-        catch(...)
+        catch (...)
         {
         }
     }
@@ -127,15 +140,16 @@ void *ICLSVMMemoryRegion::ptr()
     return _ptr;
 }

-CLCoarseSVMMemoryRegion::CLCoarseSVMMemoryRegion(CLCoreRuntimeContext *ctx, cl_mem_flags flags, size_t size, size_t alignment)
-    : ICLSVMMemoryRegion(ctx, flags, size, alignment)
+CLCoarseSVMMemoryRegion::CLCoarseSVMMemoryRegion(cl_mem_flags flags, size_t size, size_t alignment)
+    : ICLSVMMemoryRegion(flags, size, alignment)
 {
 }

 void *CLCoarseSVMMemoryRegion::map(cl::CommandQueue &q, bool blocking)
 {
     ARM_COMPUTE_ERROR_ON(_ptr == nullptr);
-    clEnqueueSVMMap(q.get(), blocking ? CL_TRUE : CL_FALSE, CL_MAP_READ | CL_MAP_WRITE, _ptr, _size, 0, nullptr, nullptr);
+    clEnqueueSVMMap(q.get(), blocking ? CL_TRUE : CL_FALSE, CL_MAP_READ | CL_MAP_WRITE, _ptr, _size, 0, nullptr,
+                    nullptr);
     _mapping = _ptr;
     return _mapping;
 }
@@ -147,14 +161,14 @@ void CLCoarseSVMMemoryRegion::unmap(cl::CommandQueue &q)
     _mapping = nullptr;
 }

-CLFineSVMMemoryRegion::CLFineSVMMemoryRegion(CLCoreRuntimeContext *ctx, cl_mem_flags flags, size_t size, size_t alignment)
-    : ICLSVMMemoryRegion(ctx, flags, size, alignment)
+CLFineSVMMemoryRegion::CLFineSVMMemoryRegion(cl_mem_flags flags, size_t size, size_t alignment)
+    : ICLSVMMemoryRegion(flags, size, alignment)
 {
 }

 void *CLFineSVMMemoryRegion::map(cl::CommandQueue &q, bool blocking)
 {
-    if(blocking)
+    if (blocking)
     {
         clFinish(q.get());
     }
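Note: the two destructors above encode different synchronisation rules. A cl_mem buffer is reference-counted by enqueued commands, so a non-blocking flush suffices before the handle is dropped; clSVMFree has no such protection, so every command that may touch the pointer must have completed first (per the OpenCL spec linked in the comment). A condensed sketch of the SVM rule:

    #include "arm_compute/core/CL/OpenCL.h" // library's OpenCL wrapper, assumed available

    void release_svm(cl::CommandQueue &queue, cl::Context &context, void *svm_ptr)
    {
        clFinish(queue.get());             // blocking: all commands using svm_ptr have completed
        clSVMFree(context.get(), svm_ptr); // only now is the allocation safe to free
    }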
diff --git a/src/runtime/CL/CLMultiHOG.cpp b/src/runtime/CL/CLMultiHOG.cpp
deleted file mode 100644
index 14cd68a646..0000000000
--- a/src/runtime/CL/CLMultiHOG.cpp
+++ /dev/null
@@ -1,52 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/CLMultiHOG.h"
-
-#include "arm_compute/core/CL/ICLHOG.h"
-#include "arm_compute/core/Error.h"
-
-using namespace arm_compute;
-
-CLMultiHOG::CLMultiHOG(size_t num_models)
-    : _num_models(num_models), _model()
-{
-    _model.resize(_num_models);
-}
-
-size_t CLMultiHOG::num_models() const
-{
-    return _num_models;
-}
-
-ICLHOG *CLMultiHOG::cl_model(size_t index)
-{
-    ARM_COMPUTE_ERROR_ON(index >= _num_models);
-    return (&_model[index]);
-}
-
-const ICLHOG *CLMultiHOG::cl_model(size_t index) const
-{
-    ARM_COMPUTE_ERROR_ON(index >= _num_models);
-    return (&_model[index]);
-}
diff --git a/src/runtime/CL/CLMultiImage.cpp b/src/runtime/CL/CLMultiImage.cpp
deleted file mode 100644
index 92254f3531..0000000000
--- a/src/runtime/CL/CLMultiImage.cpp
+++ /dev/null
@@ -1,172 +0,0 @@
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/CLMultiImage.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/runtime/ITensorAllocator.h"
-
-using namespace arm_compute;
-
-CLMultiImage::CLMultiImage()
-    : _info(), _plane()
-{
-}
-
-const MultiImageInfo *CLMultiImage::info() const
-{
-    return &_info;
-}
-
-void CLMultiImage::init(unsigned int width, unsigned int height, Format format)
-{
-    internal_init(width, height, format, false);
-}
-
-void CLMultiImage::init_auto_padding(unsigned int width, unsigned int height, Format format)
-{
-    internal_init(width, height, format, true);
-}
-
-void CLMultiImage::internal_init(unsigned int width, unsigned int height, Format format, bool auto_padding)
-{
-    TensorShape shape = adjust_odd_shape(TensorShape{ width, height }, format);
-    TensorInfo  info(shape, Format::U8);
-
-    if(auto_padding)
-    {
-        info.auto_padding();
-    }
-
-    switch(format)
-    {
-        case Format::U8:
-        case Format::S16:
-        case Format::U16:
-        case Format::S32:
-        case Format::F16:
-        case Format::F32:
-        case Format::U32:
-        case Format::RGB888:
-        case Format::RGBA8888:
-        case Format::YUYV422:
-        case Format::UYVY422:
-        {
-            TensorInfo info_full(shape, format);
-
-            if(auto_padding)
-            {
-                info_full.auto_padding();
-            }
-
-            std::get<0>(_plane).allocator()->init(info_full);
-            break;
-        }
-        case Format::NV12:
-        case Format::NV21:
-        {
-            const TensorShape shape_uv88 = calculate_subsampled_shape(shape, Format::UV88);
-            TensorInfo        info_uv88(shape_uv88, Format::UV88);
-
-            if(auto_padding)
-            {
-                info_uv88.auto_padding();
-            }
-
-            std::get<0>(_plane).allocator()->init(info);
-            std::get<1>(_plane).allocator()->init(info_uv88);
-            break;
-        }
-        case Format::IYUV:
-        {
-            const TensorShape shape_sub2 = calculate_subsampled_shape(shape, Format::IYUV);
-            TensorInfo        info_sub2(shape_sub2, Format::U8);
-
-            if(auto_padding)
-            {
-                info_sub2.auto_padding();
-            }
-
-            std::get<0>(_plane).allocator()->init(info);
-            std::get<1>(_plane).allocator()->init(info_sub2);
-            std::get<2>(_plane).allocator()->init(info_sub2);
-            break;
-        }
-        case Format::YUV444:
-            std::get<0>(_plane).allocator()->init(info);
-            std::get<1>(_plane).allocator()->init(info);
-            std::get<2>(_plane).allocator()->init(info);
-            break;
-        default:
-            ARM_COMPUTE_ERROR("Not supported");
-            break;
-    }
-
-    _info.init(shape.x(), shape.y(), format);
-}
-
-void CLMultiImage::allocate()
-{
-    switch(_info.format())
-    {
-        case Format::U8:
-        case Format::S16:
-        case Format::U16:
-        case Format::S32:
-        case Format::F16:
-        case Format::F32:
-        case Format::U32:
-        case Format::RGB888:
-        case Format::RGBA8888:
-        case Format::YUYV422:
-        case Format::UYVY422:
-            std::get<0>(_plane).allocator()->allocate();
-            break;
-        case Format::NV12:
-        case Format::NV21:
-            std::get<0>(_plane).allocator()->allocate();
-            std::get<1>(_plane).allocator()->allocate();
-            break;
-        case Format::IYUV:
-        case Format::YUV444:
-            std::get<0>(_plane).allocator()->allocate();
-            std::get<1>(_plane).allocator()->allocate();
-            std::get<2>(_plane).allocator()->allocate();
-            break;
-        default:
-            ARM_COMPUTE_ERROR("Not supported");
-            break;
-    }
-}
-
-CLImage *CLMultiImage::cl_plane(unsigned int index)
-{
-    return &_plane[index];
-}
-
-const CLImage *CLMultiImage::cl_plane(unsigned int index) const
-{
-    return &_plane[index];
-}
diff --git a/src/runtime/NEON/functions/NEComputeAllAnchors.cpp b/src/runtime/CL/CLOperator.cpp
index 4fb4e8b86d..89d4520038 100644
--- a/src/runtime/NEON/functions/NEComputeAllAnchors.cpp
+++ b/src/runtime/CL/CLOperator.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -21,22 +21,37 @@
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
-#include "arm_compute/runtime/NEON/functions/NEComputeAllAnchors.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/CL/ICLOperator.h"

-#include "support/MemorySupport.h"
+#include "src/core/CL/ICLKernel.h"

 namespace arm_compute
 {
-void NEComputeAllAnchors::configure(const ITensor *anchors, ITensor *all_anchors, const ComputeAnchorsInfo &info)
+namespace experimental
 {
-    // Configure ComputeAllAnchors kernel
-    auto k = arm_compute::support::cpp14::make_unique<NEComputeAllAnchorsKernel>();
-    k->configure(anchors, all_anchors, info);
-    _kernel = std::move(k);
+ICLOperator::ICLOperator(IRuntimeContext *ctx) : _kernel(), _ctx(ctx), _workspace()
+{
+}
+
+void ICLOperator::run(ITensorPack &tensors)
+{
+    if (tensors.empty())
+    {
+        ARM_COMPUTE_ERROR("No inputs provided");
+    }
+
+    CLScheduler::get().enqueue_op(*_kernel.get(), tensors, false);
+}
+
+void ICLOperator::prepare(ITensorPack &constants)
+{
+    ARM_COMPUTE_UNUSED(constants);
 }

-Status NEComputeAllAnchors::validate(const ITensorInfo *anchors, const ITensorInfo *all_anchors, const ComputeAnchorsInfo &info)
+MemoryRequirements ICLOperator::workspace() const
 {
-    return NEComputeAllAnchorsKernel::validate(anchors, all_anchors, info);
+    return {};
 }
+} // namespace experimental
 } // namespace arm_compute
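Note: experimental operators are stateless with respect to tensor memory; callers inject tensors at run time through an ITensorPack instead of binding them at configure time. A hedged usage sketch; the slot ids follow the ACL_SRC_*/ACL_DST convention used elsewhere in the library, and op stands for any configured experimental::ICLOperator:

    // Assumes src and dst are allocated ICLTensors and op is already configured.
    arm_compute::ITensorPack pack;
    pack.add_tensor(arm_compute::ACL_SRC_0, &src);
    pack.add_tensor(arm_compute::ACL_DST, &dst);
    op.run(pack); // ICLOperator::run raises an error on an empty pack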
diff --git a/src/runtime/CL/CLPyramid.cpp b/src/runtime/CL/CLPyramid.cpp
deleted file mode 100644
index ef8a1e5294..0000000000
--- a/src/runtime/CL/CLPyramid.cpp
+++ /dev/null
@@ -1,127 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/CLPyramid.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/PyramidInfo.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/TensorShape.h"
-
-#include <array>
-#include <cmath>
-
-using namespace arm_compute;
-
-CLPyramid::CLPyramid()
-    : _info(), _pyramid()
-{
-}
-
-void CLPyramid::init(const PyramidInfo &info)
-{
-    internal_init(info, false);
-}
-
-void CLPyramid::init_auto_padding(const PyramidInfo &info)
-{
-    internal_init(info, true);
-}
-
-void CLPyramid::internal_init(const PyramidInfo &info, bool auto_padding)
-{
-    _info = info;
-    _pyramid.resize(_info.num_levels());
-
-    size_t      w            = _info.width();
-    size_t      h            = _info.height();
-    size_t      ref_w        = w;
-    size_t      ref_h        = h;
-    const bool  is_orb_scale = (SCALE_PYRAMID_ORB == _info.scale());
-    TensorShape tensor_shape = _info.tensor_shape();
-
-    // Note: Look-up table used by the OpenVX sample implementation
-    const std::array<float, 4> c_orbscale =
-    {
-        {
-            0.5f,
-            SCALE_PYRAMID_ORB,
-            SCALE_PYRAMID_ORB * SCALE_PYRAMID_ORB,
-            SCALE_PYRAMID_ORB * SCALE_PYRAMID_ORB * SCALE_PYRAMID_ORB
-        }
-    };
-
-    for(size_t i = 0; i < _info.num_levels(); ++i)
-    {
-        TensorInfo tensor_info(tensor_shape, _info.format());
-
-        if(auto_padding)
-        {
-            tensor_info.auto_padding();
-        }
-
-        _pyramid[i].allocator()->init(tensor_info);
-
-        if(is_orb_scale)
-        {
-            const float orb_scale = c_orbscale[(i + 1) % 4];
-            w                     = std::ceil(ref_w * orb_scale);
-            h                     = std::ceil(ref_h * orb_scale);
-
-            if(0 == ((i + 1) % 4))
-            {
-                ref_w = w;
-                ref_h = h;
-            }
-        }
-        else
-        {
-            w = (w + 1) * _info.scale();
-            h = (h + 1) * _info.scale();
-        }
-
-        // Update tensor_shape
-        tensor_shape.set(0, w);
-        tensor_shape.set(1, h);
-    }
-}
-
-void CLPyramid::allocate()
-{
-    for(size_t i = 0; i < _info.num_levels(); ++i)
-    {
-        _pyramid[i].allocator()->allocate();
-    }
-}
-
-const PyramidInfo *CLPyramid::info() const
-{
-    return &_info;
-}
-
-CLTensor *CLPyramid::get_pyramid_level(size_t index) const
-{
-    ARM_COMPUTE_ERROR_ON(index >= _info.num_levels());
-
-    return &_pyramid[index];
-}
diff --git a/src/runtime/CL/CLRuntimeContext.cpp b/src/runtime/CL/CLRuntimeContext.cpp
index 4d70edac2f..b426b8c304 100644
--- a/src/runtime/CL/CLRuntimeContext.cpp
+++ b/src/runtime/CL/CLRuntimeContext.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -22,6 +22,7 @@
 * SOFTWARE.
 */
 #include "arm_compute/runtime/CL/CLRuntimeContext.h"
+
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/CL/CLHelpers.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
@@ -29,10 +30,13 @@
 namespace arm_compute
 {
 CLRuntimeContext::CLRuntimeContext()
-    : _gpu_owned_scheduler(support::cpp14::make_unique<CLScheduler>()), _gpu_scheduler(_gpu_owned_scheduler.get()), _symbols(), _core_context()
+    : _gpu_owned_scheduler(std::make_unique<CLScheduler>()),
+      _gpu_scheduler(_gpu_owned_scheduler.get()),
+      _symbols(),
+      _backend_type()
 {
     _symbols.load_default();
-    auto ctx_dev_err = create_opencl_context_and_device();
+    auto ctx_dev_err = create_opencl_context_and_device(_backend_type);
     ARM_COMPUTE_ERROR_ON_MSG(std::get<2>(ctx_dev_err) != CL_SUCCESS, "Failed to create OpenCL context");
     auto ctx = std::get<0>(ctx_dev_err);
     auto dev = std::get<1>(ctx_dev_err);
@@ -40,7 +44,6 @@ CLRuntimeContext::CLRuntimeContext()
     _gpu_owned_scheduler->init(ctx, queue, dev, &_tuner);
     const std::string cl_kernels_folder("./cl_kernels");
     CLKernelLibrary::get().init(cl_kernels_folder, ctx, dev);
-    _core_context = CLCoreRuntimeContext(&CLKernelLibrary::get(), _gpu_owned_scheduler->context(), _gpu_owned_scheduler->queue());
 }

 CLKernelLibrary &CLRuntimeContext::kernel_library()
@@ -48,11 +51,6 @@ CLKernelLibrary &CLRuntimeContext::kernel_library()
     return CLKernelLibrary::get();
 }

-CLCoreRuntimeContext *CLRuntimeContext::core_runtime_context()
-{
-    return &_core_context;
-}
-
 void CLRuntimeContext::set_gpu_scheduler(CLScheduler *scheduler)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(scheduler);
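Note: CLRuntimeContext is now self-contained: it owns its CLScheduler, creates its own OpenCL context and queue at construction, and the CLCoreRuntimeContext member is gone. A hedged sketch of per-instance use, e.g. one context per worker thread:

    arm_compute::CLRuntimeContext ctx;                     // builds its own context, queue and scheduler
    arm_compute::CLScheduler *sched = ctx.gpu_scheduler(); // scoped scheduler, independent of CLScheduler::get()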
diff --git a/src/runtime/CL/CLScheduler.cpp b/src/runtime/CL/CLScheduler.cpp
index e78eaa482f..f0a42f55fd 100644
--- a/src/runtime/CL/CLScheduler.cpp
+++ b/src/runtime/CL/CLScheduler.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2023 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -24,10 +24,9 @@
 #include "arm_compute/runtime/CL/CLScheduler.h"

 #include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/runtime/CL/CLHelpers.h"
 #include "arm_compute/runtime/CL/CLTuner.h"
-#include "arm_compute/runtime/CL/tuners/Tuners.h"
+
+#include "src/core/CL/ICLKernel.h"

 namespace arm_compute
 {
@@ -49,6 +48,11 @@ GPUTarget CLScheduler::target() const
     return _target;
 }

+CLGEMMHeuristicsHandle *CLScheduler::gemm_heuristics() const
+{
+    return _gemm_heuristics;
+}
+
 void CLScheduler::set_queue(cl::CommandQueue queue)
 {
     _queue = std::move(queue);
@@ -78,7 +82,7 @@ cl::Event CLScheduler::enqueue_sync_event()

 void CLScheduler::tune_kernel_static(ICLKernel &kernel)
 {
-    if(_cl_tuner != nullptr)
+    if (_cl_tuner != nullptr)
     {
         _cl_tuner->tune_kernel_static(kernel);
     }
@@ -92,7 +96,16 @@ bool CLScheduler::is_initialised() const
 std::once_flag CLScheduler::_initialize_symbols;

 CLScheduler::CLScheduler()
-    : _context(), _queue(), _target(GPUTarget::MIDGARD), _is_initialised(false), _cl_tuner(nullptr), _cl_default_static_tuner(nullptr)
+    : _context(),
+      _queue(),
+      _target(GPUTarget::MIDGARD),
+      _is_initialised(false),
+      _cl_tuner(nullptr),
+      _gemm_heuristics(nullptr),
+      _backend_type(CLBackendType::Native),
+      _job_chaining_enabled(true),
+      _job_chaining_size(1),
+      _job_chaining_count(0)
 {
 }

@@ -103,37 +116,45 @@ CLScheduler &CLScheduler::get()
     return scheduler;
 }

-void CLScheduler::default_init_with_context(cl::Device &device, cl::Context &ctx, ICLTuner *cl_tuner)
+void CLScheduler::default_init_with_context(cl::Device &device,
+                                            cl::Context &ctx,
+                                            ICLTuner *cl_tuner,
+                                            CLGEMMHeuristicsHandle *gemm_h)
 {
-    if(!_is_initialised)
+    if (!_is_initialised)
     {
         const std::string cl_kernels_folder("./cl_kernels/");
         cl::CommandQueue  queue = cl::CommandQueue(ctx, device);
         CLKernelLibrary::get().init(cl_kernels_folder, ctx, device);
-        init(ctx, queue, device, cl_tuner);
-        _cl_default_static_tuner = tuners::TunerFactory::create_tuner(_target);
-        _cl_tuner                = (cl_tuner == nullptr) ? _cl_default_static_tuner.get() : cl_tuner;
+        init(ctx, queue, device, cl_tuner, gemm_h);
+        _cl_tuner = cl_tuner;
     }
 }

-void CLScheduler::default_init(ICLTuner *cl_tuner)
+void CLScheduler::default_init(ICLTuner *cl_tuner, CLGEMMHeuristicsHandle *gemm_h, CLBackendType cl_backend_type)
 {
-    if(!_is_initialised)
+    if (!_is_initialised)
     {
         cl::Context ctx;
         cl::Device  dev;
         cl_int      err;
-        std::tie(ctx, dev, err) = create_opencl_context_and_device();
+        std::tie(ctx, dev, err) = create_opencl_context_and_device(cl_backend_type);
         ARM_COMPUTE_ERROR_ON_MSG(err != CL_SUCCESS, "Failed to create OpenCL context");
         cl::CommandQueue queue = cl::CommandQueue(ctx, dev);
         CLKernelLibrary::get().init("./cl_kernels/", ctx, dev);
-        init(ctx, queue, dev, cl_tuner);
-        // Create a default static tuner and set if none was provided
-        _cl_default_static_tuner = tuners::TunerFactory::create_tuner(_target);
+        init(ctx, queue, dev, cl_tuner, gemm_h);
     }
-    // Set CL tuner
-    _cl_tuner = (cl_tuner == nullptr) ? _cl_default_static_tuner.get() : cl_tuner;
+    // Set CL tuner and GEMM heuristics
+    _cl_tuner        = cl_tuner;
+    _gemm_heuristics = gemm_h;
+}
+
+void CLScheduler::default_reinit(ICLTuner *cl_tuner, CLGEMMHeuristicsHandle *gemm_h, CLBackendType cl_backend_type)
+{
+    _is_initialised = false;
+
+    default_init(cl_tuner, gemm_h, cl_backend_type);
 }

 void CLScheduler::set_context(cl::Context context)
@@ -142,34 +163,86 @@ void CLScheduler::set_context(cl::Context context)
     CLKernelLibrary::get().set_context(_context);
 }

-void CLScheduler::init(cl::Context context, cl::CommandQueue queue, const cl::Device &device, ICLTuner *cl_tuner)
+void CLScheduler::init(cl::Context context,
+                       cl::CommandQueue queue,
+                       const cl::Device &device,
+                       ICLTuner *cl_tuner,
+                       CLGEMMHeuristicsHandle *gemm_h,
+                       CLBackendType cl_backend_type)
 {
     set_context(std::move(context));
-    _queue          = std::move(queue);
-    _target         = get_target_from_device(device);
-    _is_initialised = true;
-    _cl_tuner       = cl_tuner;
+    _queue           = std::move(queue);
+    _target          = get_target_from_device(device);
+    _is_initialised  = true;
+    _cl_tuner        = cl_tuner;
+    _gemm_heuristics = gemm_h;
+    _backend_type    = cl_backend_type;
 }

-void CLScheduler::enqueue(ICLKernel &kernel, bool flush)
+void CLScheduler::enqueue_common(ICLKernel &kernel, ITensorPack &tensors, bool flush)
 {
-    ARM_COMPUTE_ERROR_ON_MSG(!_is_initialised,
-                             "The CLScheduler is not initialised yet! Please call the CLScheduler::get().default_init(), \
+    ARM_COMPUTE_ERROR_ON_MSG(
+        !_is_initialised, "The CLScheduler is not initialised yet! Please call the CLScheduler::get().default_init(), \
                              or CLScheduler::get()::init() and CLKernelLibrary::get()::init() function before running functions!");

+    const bool inject_memory = !tensors.empty();
+
     // Tune the kernel if the CLTuner has been provided
-    if(_cl_tuner != nullptr)
+    if (_cl_tuner != nullptr)
     {
-        // Tune the OpenCL kernel
-        _cl_tuner->tune_kernel_dynamic(kernel);
+        inject_memory ? _cl_tuner->tune_kernel_dynamic(kernel, tensors) : _cl_tuner->tune_kernel_dynamic(kernel);
     }

     // Run kernel
-    kernel.run(kernel.window(), _queue);
+    inject_memory ? kernel.run_op(tensors, kernel.window(), _queue) : kernel.run(kernel.window(), _queue);
+    if (_job_chaining_enabled)
+    {
+        ++_job_chaining_count;
+    }
+
+    flush_queue(flush);
+}

-    if(flush)
+void CLScheduler::flush_queue(bool flush)
+{
+    if (_job_chaining_enabled)
+    {
+        if (_job_chaining_count >= _job_chaining_size)
+        {
+            _job_chaining_count = 0;
+            /*
+                Optimisation note: Flush the queue at the first enqueue to start the GPU
+                execution and then incrementally saturate the clFlush calls to minimize
+                the CPU activity for job-scheduling.
+                For eg. job-chain size goes from 1, 2, 4, 8 and 16
+            */
+            if (_job_chaining_size < 16)
+            {
+                _job_chaining_size <<= 1;
+            }
+            _queue.flush();
+        }
+    }
+    else if (flush)
     {
         _queue.flush();
     }
 }
+
+void CLScheduler::enqueue(ICLKernel &kernel, bool flush)
+{
+    ITensorPack pack;
+    enqueue_common(kernel, pack, flush);
+}
+
+void CLScheduler::enqueue_op(ICLKernel &kernel, ITensorPack &tensors, bool flush)
+{
+    enqueue_common(kernel, tensors, flush);
+}
+
+void CLScheduler::enable_job_chaining(int job_chaining_size)
+{
+    _job_chaining_enabled = true;
+    _job_chaining_size    = job_chaining_size;
+}
 } // namespace arm_compute
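Note: job chaining replaces the per-kernel flush with batched flushes whose batch size doubles from 1 up to a steady 16, so the first kernels still start promptly while long command streams spend far less CPU time in clFlush. A standalone sketch of just the flush schedule (prints flushes after jobs 0, 2, 6, 14, 30, ...):

    #include <cstdio>

    int main()
    {
        int chain_size = 1, count = 0;
        for (int job = 0; job < 64; ++job)
        {
            if (++count >= chain_size)
            {
                count = 0;
                if (chain_size < 16)
                {
                    chain_size <<= 1; // 1 -> 2 -> 4 -> 8 -> 16, then capped
                }
                std::printf("flush after job %d\n", job);
            }
        }
        return 0;
    }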
diff --git a/src/runtime/CL/CLSubTensor.cpp b/src/runtime/CL/CLSubTensor.cpp
index 0f362507cf..ace820bbb7 100644
--- a/src/runtime/CL/CLSubTensor.cpp
+++ b/src/runtime/CL/CLSubTensor.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019, 2022 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -29,12 +29,14 @@
 using namespace arm_compute;

-CLSubTensor::CLSubTensor()
-    : _parent(nullptr), _info()
+CLSubTensor::CLSubTensor() : _parent(nullptr), _info()
 {
 }

-CLSubTensor::CLSubTensor(ICLTensor *parent, const TensorShape &tensor_shape, const Coordinates &coords, bool extend_parent)
+CLSubTensor::CLSubTensor(ICLTensor *parent,
+                         const TensorShape &tensor_shape,
+                         const Coordinates &coords,
+                         bool extend_parent)
     : _parent(nullptr), _info()
 {
     ARM_COMPUTE_ERROR_ON(parent == nullptr);
@@ -81,11 +83,15 @@ void CLSubTensor::unmap()
 uint8_t *CLSubTensor::do_map(cl::CommandQueue &q, bool blocking)
 {
     ARM_COMPUTE_ERROR_ON(cl_buffer().get() == nullptr);
-    return static_cast<uint8_t *>(q.enqueueMapBuffer(cl_buffer(), blocking ? CL_TRUE : CL_FALSE, CL_MAP_READ | CL_MAP_WRITE, 0, info()->total_size()));
+    if (_parent->buffer() == nullptr)
+    {
+        _parent->map(q, blocking);
+    }
+    return _parent->buffer();
 }

 void CLSubTensor::do_unmap(cl::CommandQueue &q)
 {
     ARM_COMPUTE_ERROR_ON(cl_buffer().get() == nullptr);
-    q.enqueueUnmapMemObject(cl_buffer(), buffer());
+    _parent->unmap(q);
 }
diff --git a/src/runtime/CL/CLTensor.cpp b/src/runtime/CL/CLTensor.cpp
index a6d0cf77ca..db94639190 100644
--- a/src/runtime/CL/CLTensor.cpp
+++ b/src/runtime/CL/CLTensor.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
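Note: CLSubTensor::do_map no longer enqueues its own map of the shared cl_mem; it maps the parent tensor (if not already mapped) and returns the parent's host pointer, and do_unmap likewise delegates, avoiding two live mappings of one buffer. A condensed sketch of the delegation, assuming the ICLTensor interface shown above:

    #include "arm_compute/core/CL/ICLTensor.h" // assumed header for ICLTensor
    #include <cstdint>

    uint8_t *map_via_parent(arm_compute::ICLTensor *parent, cl::CommandQueue &q, bool blocking)
    {
        if (parent->buffer() == nullptr) // parent not mapped yet
        {
            parent->map(q, blocking);
        }
        return parent->buffer(); // the sub-tensor view lives inside the parent allocation
    }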
diff --git a/src/runtime/CL/CLTensorAllocator.cpp b/src/runtime/CL/CLTensorAllocator.cpp
index eaf46d42ca..e6457218c7 100644
--- a/src/runtime/CL/CLTensorAllocator.cpp
+++ b/src/runtime/CL/CLTensorAllocator.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2021 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -31,34 +31,33 @@
 namespace arm_compute
 {
 const cl::Buffer CLTensorAllocator::_empty_buffer = cl::Buffer();
-
 namespace
 {
+/** Global user-defined allocator that can be used for all internal allocations of a CLTensor */
+static IAllocator *static_global_cl_allocator = nullptr;
+
 /** Helper function used to allocate the backing memory of a tensor
 *
- * @param[in] context   OpenCL context to use
 * @param[in] size      Size of the allocation
 * @param[in] alignment Alignment of the allocation
 *
 * @return A wrapped memory region
 */
-std::unique_ptr<ICLMemoryRegion> allocate_region(CLCoreRuntimeContext *ctx, size_t size, cl_uint alignment)
+std::unique_ptr<ICLMemoryRegion> allocate_region(size_t size, cl_uint alignment)
 {
     // Try fine-grain SVM
-    std::unique_ptr<ICLMemoryRegion> region = support::cpp14::make_unique<CLFineSVMMemoryRegion>(ctx,
-                                                                                                 CL_MEM_READ_WRITE | CL_MEM_SVM_FINE_GRAIN_BUFFER,
-                                                                                                 size,
-                                                                                                 alignment);
+    std::unique_ptr<ICLMemoryRegion> region =
+        std::make_unique<CLFineSVMMemoryRegion>(CL_MEM_READ_WRITE | CL_MEM_SVM_FINE_GRAIN_BUFFER, size, alignment);

     // Try coarse-grain SVM in case of failure
-    if(region != nullptr && region->ptr() == nullptr)
+    if (region != nullptr && region->ptr() == nullptr)
     {
-        region = support::cpp14::make_unique<CLCoarseSVMMemoryRegion>(ctx, CL_MEM_READ_WRITE, size, alignment);
+        region = std::make_unique<CLCoarseSVMMemoryRegion>(CL_MEM_READ_WRITE, size, alignment);
     }

     // Try legacy buffer memory in case of failure
-    if(region != nullptr && region->ptr() == nullptr)
+    if (region != nullptr && region->ptr() == nullptr)
     {
-        region = support::cpp14::make_unique<CLBufferMemoryRegion>(ctx, CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, size);
+        region = std::make_unique<CLBufferMemoryRegion>(CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, size);
     }

     return region;
 }
@@ -80,7 +79,10 @@ void clear_quantization_arrays(CLFloatArray &scale, CLInt32Array &offset)
 * @param[in] qinfo    Quantization info
 * @param[in] pad_size Pad size to use in case array needs to be padded for computation purposes
 */
-void populate_quantization_info(CLFloatArray &scale, CLInt32Array &offset, const QuantizationInfo &qinfo, size_t pad_size)
+void populate_quantization_info(CLFloatArray &scale,
+                                CLInt32Array &offset,
+                                const QuantizationInfo &qinfo,
+                                size_t pad_size)
 {
     clear_quantization_arrays(scale, offset);
@@ -90,16 +92,18 @@ void populate_quantization_info(CLFloatArray &scale, CLInt32Array &offset, const
     const size_t element_size = sizeof(std::remove_reference<decltype(qscale)>::type::value_type);
     scale                     = CLFloatArray(num_elements + pad_size);
     scale.resize(num_elements);
-    CLScheduler::get().queue().enqueueWriteBuffer(scale.cl_buffer(), CL_TRUE, 0, num_elements * element_size, qinfo.scale().data());
+    CLScheduler::get().queue().enqueueWriteBuffer(scale.cl_buffer(), CL_TRUE, 0, num_elements * element_size,
+                                                  qinfo.scale().data());

-    if(!qinfo.offset().empty())
+    if (!qinfo.offset().empty())
     {
         // Create offset array
-        const std::vector<int32_t> &qoffset             = qinfo.offset();
-        const size_t                offset_element_size = sizeof(std::remove_reference<decltype(qoffset)>::type::value_type);
-        offset                                          = CLInt32Array(num_elements + pad_size);
+        const std::vector<int32_t> &qoffset = qinfo.offset();
+        const size_t offset_element_size    = sizeof(std::remove_reference<decltype(qoffset)>::type::value_type);
+        offset                              = CLInt32Array(num_elements + pad_size);
         offset.resize(num_elements);
offset_element_size, qinfo.offset().data()); + CLScheduler::get().queue().enqueueWriteBuffer(offset.cl_buffer(), CL_TRUE, 0, + num_elements * offset_element_size, qinfo.offset().data()); } } } // namespace @@ -111,7 +115,7 @@ CLTensorAllocator::CLTensorAllocator(IMemoryManageable *owner, CLRuntimeContext CLQuantization CLTensorAllocator::quantization() const { - return { &_scale, &_offset }; + return {&_scale, &_offset}; } uint8_t *CLTensorAllocator::data() @@ -127,26 +131,26 @@ const cl::Buffer &CLTensorAllocator::cl_data() const void CLTensorAllocator::allocate() { // Allocate tensor backing memory - if(_associated_memory_group == nullptr) + if (_associated_memory_group == nullptr) { // Perform memory allocation - if(_ctx == nullptr) + if (static_global_cl_allocator != nullptr) { - auto legacy_ctx = CLCoreRuntimeContext(nullptr, CLScheduler::get().context(), CLScheduler::get().queue()); - _memory.set_owned_region(allocate_region(&legacy_ctx, info().total_size(), 0)); + _memory.set_owned_region(static_global_cl_allocator->make_region(info().total_size(), 0)); } else { - _memory.set_owned_region(allocate_region(_ctx->core_runtime_context(), info().total_size(), 0)); + _memory.set_owned_region(allocate_region(info().total_size(), 0)); } } else { + // Finalize memory management instead _associated_memory_group->finalize_memory(_owner, _memory, info().total_size(), alignment()); } // Allocate and fill the quantization parameter arrays - if(is_data_type_quantized_per_channel(info().data_type())) + if (is_data_type_quantized_per_channel(info().data_type())) { const size_t pad_size = 0; populate_quantization_info(_scale, _offset, info().quantization_info(), pad_size); @@ -171,15 +175,7 @@ Status CLTensorAllocator::import_memory(cl::Buffer buffer) ARM_COMPUTE_RETURN_ERROR_ON(buffer.getInfo<CL_MEM_CONTEXT>().get() != CLScheduler::get().context().get()); ARM_COMPUTE_RETURN_ERROR_ON(_associated_memory_group != nullptr); - if(_ctx == nullptr) - { - auto legacy_ctx = CLCoreRuntimeContext(nullptr, CLScheduler::get().context(), CLScheduler::get().queue()); - _memory.set_owned_region(support::cpp14::make_unique<CLBufferMemoryRegion>(buffer, &legacy_ctx)); - } - else - { - _memory.set_owned_region(support::cpp14::make_unique<CLBufferMemoryRegion>(buffer, _ctx->core_runtime_context())); - } + _memory.set_owned_region(std::make_unique<CLBufferMemoryRegion>(buffer)); info().set_is_resizable(false); return Status{}; @@ -194,9 +190,14 @@ void CLTensorAllocator::set_associated_memory_group(IMemoryGroup *associated_mem _associated_memory_group = associated_memory_group; } +void CLTensorAllocator::set_global_allocator(IAllocator *allocator) +{ + static_global_cl_allocator = allocator; +} + uint8_t *CLTensorAllocator::lock() { - if(_ctx) + if (_ctx) { return map(_ctx->gpu_scheduler()->queue(), true); } @@ -209,7 +210,7 @@ uint8_t *CLTensorAllocator::lock() void CLTensorAllocator::unlock() { ARM_COMPUTE_ERROR_ON(_memory.region() == nullptr); - if(_ctx) + if (_ctx) { unmap(_ctx->gpu_scheduler()->queue(), reinterpret_cast<uint8_t *>(_memory.region()->buffer())); } diff --git a/src/runtime/CL/CLTuner.cpp b/src/runtime/CL/CLTuner.cpp index 5f2fa7d0e2..0d62fe3afe 100644 --- a/src/runtime/CL/CLTuner.cpp +++ b/src/runtime/CL/CLTuner.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 ARM Limited. + * Copyright (c) 2017-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -22,27 +22,52 @@ * SOFTWARE. 
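The static_global_cl_allocator hook above lets a user route every internal CLTensor allocation through their own IAllocator. A hedged wiring sketch, assuming a custom allocator type ExternalPoolAllocator (hypothetical) that implements the IAllocator interface this patch already uses elsewhere:

#include "arm_compute/runtime/CL/CLTensorAllocator.h"
#include "arm_compute/runtime/IAllocator.h"

// Hypothetical user-defined allocator; any IAllocator implementation works.
class ExternalPoolAllocator final : public arm_compute::IAllocator
{
public:
    void *allocate(size_t size, size_t alignment) override; // hand out pooled memory
    void free(void *ptr) override;                          // return it to the pool
    std::unique_ptr<arm_compute::IMemoryRegion> make_region(size_t size, size_t alignment) override; // wrap it as a region
};

ExternalPoolAllocator pool;                                  // must outlive all CLTensors
arm_compute::CLTensorAllocator::set_global_allocator(&pool); // allocate() above now calls pool.make_region()

Note the global allocator is only consulted on the unmanaged path; tensors owned by a memory group still go through finalize_memory(), as the allocate() hunk shows.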
*/ #include "arm_compute/runtime/CL/CLTuner.h" -#include "arm_compute/runtime/CL/tuners/CLLWSList.h" -#include "arm_compute/core/CL/ICLKernel.h" #include "arm_compute/core/Error.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "arm_compute/runtime/CL/tuners/CLTuningParametersList.h" + +#include "src/common/utils/Log.h" +#include "src/core/CL/ICLKernel.h" #include "support/StringSupport.h" #include <cerrno> #include <fstream> -#include <iostream> #include <limits> -#include <memory> -#include <string> namespace arm_compute { -CLTuner::CLTuner(bool tune_new_kernels) - : real_clEnqueueNDRangeKernel(nullptr), _lws_table(), _kernel_event(), _tune_new_kernels(tune_new_kernels), _tuner_mode(CLTunerMode::NORMAL) +CLTuner::CLTuner(bool tune_new_kernels, CLTuningInfo tuning_info) + : real_clEnqueueNDRangeKernel(nullptr), + _tuning_params_table(), + _lws_table(), + _kernel_event(), + _tune_new_kernels(tune_new_kernels), + _tuning_info(tuning_info) { } +struct CLTuner::IKernelData +{ + virtual ~IKernelData() = default; + virtual void do_run(ICLKernel &kernel, cl::CommandQueue &queue) = 0; +}; +struct DefaultKernelData : public CLTuner::IKernelData +{ + DefaultKernelData(ITensorPack &tensors) : _tensors{tensors} + { + } + ~DefaultKernelData() override = default; + void do_run(ICLKernel &kernel, cl::CommandQueue &queue) override + { + const bool inject_memory = !_tensors.empty(); + inject_memory ? kernel.run_op(_tensors, kernel.window(), queue) : kernel.run(kernel.window(), queue); + } + +private: + ITensorPack &_tensors; +}; + bool CLTuner::kernel_event_is_set() const { return _kernel_event() != nullptr; @@ -63,11 +88,7 @@ bool CLTuner::tune_new_kernels() const void CLTuner::set_tuner_mode(CLTunerMode mode) { - _tuner_mode = mode; -} -CLTunerMode CLTuner::get_tuner_mode() const -{ - return _tuner_mode; + _tuning_info.tuner_mode = mode; } void CLTuner::tune_kernel_static(ICLKernel &kernel) @@ -77,48 +98,69 @@ void CLTuner::tune_kernel_static(ICLKernel &kernel) void CLTuner::tune_kernel_dynamic(ICLKernel &kernel) { + ITensorPack pack; + tune_kernel_dynamic(kernel, pack); +} + +void CLTuner::do_tune_kernel_dynamic(ICLKernel &kernel, IKernelData *data) +{ // Get the configuration ID from the kernel and append GPU target name and number of available compute units - const std::string config_id = kernel.config_id() + "_" + string_from_target(kernel.get_target()) + "_MP" + support::cpp11::to_string(CLKernelLibrary::get().get_num_compute_units()); + const std::string config_id = kernel.config_id() + "_" + string_from_target(kernel.get_target()) + "_MP" + + support::cpp11::to_string(CLKernelLibrary::get().get_num_compute_units()); // Check if we need to find the Optimal LWS. 
If the kernel's config_id is equal to default_config_id, the kernel does not require to be tuned - if(kernel.config_id() != arm_compute::default_config_id) + if (kernel.config_id() != arm_compute::default_config_id) { - auto p = _lws_table.find(config_id); + auto p = _tuning_params_table.find(config_id); - if(p == _lws_table.end()) + if (p == _tuning_params_table.end()) { - if(_tune_new_kernels) + if (_tune_new_kernels) { // Find the optimal LWS for the kernel - cl::NDRange opt_lws = find_optimal_lws(kernel); + CLTuningParams opt_tuning_params = find_optimal_tuning_params(kernel, data); // Insert the optimal LWS in the table - add_lws_to_table(config_id, opt_lws); + add_tuning_params(config_id, opt_tuning_params); // Set Local-Workgroup-Size - kernel.set_lws_hint(opt_lws); + kernel.set_lws_hint(opt_tuning_params.get_lws()); + if (_tuning_info.tune_wbsm) + { + kernel.set_wbsm_hint(opt_tuning_params.get_wbsm()); + } } } else { // Set Local-Workgroup-Size - kernel.set_lws_hint(p->second); + kernel.set_lws_hint(p->second.get_lws()); + if (_tuning_info.tune_wbsm) + { + kernel.set_wbsm_hint(p->second.get_wbsm()); + } } } } +void CLTuner::tune_kernel_dynamic(ICLKernel &kernel, ITensorPack &tensors) +{ + DefaultKernelData data{tensors}; -void CLTuner::add_lws_to_table(const std::string &kernel_id, cl::NDRange optimal_lws) + do_tune_kernel_dynamic(kernel, &data); +} + +void CLTuner::add_tuning_params(const std::string &kernel_id, CLTuningParams optimal_tuning_params) { - _lws_table.emplace(kernel_id, optimal_lws); + _tuning_params_table.emplace(kernel_id, optimal_tuning_params); } -cl::NDRange CLTuner::find_optimal_lws(ICLKernel &kernel) +CLTuningParams CLTuner::find_optimal_tuning_params(ICLKernel &kernel, IKernelData *data) { // Profiling queue cl::CommandQueue queue_profiler; // Extract real OpenCL function to intercept - if(real_clEnqueueNDRangeKernel == nullptr) + if (real_clEnqueueNDRangeKernel == nullptr) { real_clEnqueueNDRangeKernel = CLSymbols::get().clEnqueueNDRangeKernel_ptr; } @@ -129,7 +171,7 @@ cl::NDRange CLTuner::find_optimal_lws(ICLKernel &kernel) // Check if we can use the OpenCL timer with the default queue cl_command_queue_properties props = default_queue.getInfo<CL_QUEUE_PROPERTIES>(); - if((props & CL_QUEUE_PROFILING_ENABLE) == 0) + if ((props & CL_QUEUE_PROFILING_ENABLE) == 0) { // Set the queue for profiling queue_profiler = cl::CommandQueue(CLScheduler::get().context(), props | CL_QUEUE_PROFILING_ENABLE); @@ -140,21 +182,23 @@ cl::NDRange CLTuner::find_optimal_lws(ICLKernel &kernel) } // Start intercepting enqueues: - auto interceptor = [this](cl_command_queue command_queue, cl_kernel kernel, cl_uint work_dim, const size_t *gwo, const size_t *gws, const size_t *lws, cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, cl_event * event) + auto interceptor = [this](cl_command_queue command_queue, cl_kernel kernel, cl_uint work_dim, const size_t *gwo, + const size_t *gws, const size_t *lws, cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, cl_event *event) { - if(this->kernel_event_is_set()) + if (this->kernel_event_is_set()) { // If the event is already set it means the kernel enqueue is sliced: given that we only time the first slice we can save time by skipping the other enqueues. 
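The profiling queue and the enqueue interceptor above exist so the tuner can time a single kernel slice with OpenCL events. A standalone sketch of that measurement pattern, using only the standard OpenCL C++ API (the kernel, gws and lws variables are placeholders):

// Requires a queue created with CL_QUEUE_PROFILING_ENABLE, as done above.
cl::Event ev;
queue.enqueueNDRangeKernel(kernel, cl::NullRange, gws, lws, nullptr, &ev);
queue.finish(); // make sure the event's timestamps are final
const cl_ulong start      = ev.getProfilingInfo<CL_PROFILING_COMMAND_START>();
const cl_ulong end        = ev.getProfilingInfo<CL_PROFILING_COMMAND_END>();
const cl_ulong elapsed_ns = end - start; // device-side execution time, what min_exec_time tracks below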
return CL_SUCCESS; } cl_event tmp; - cl_int retval = this->real_clEnqueueNDRangeKernel(command_queue, kernel, work_dim, gwo, gws, lws, num_events_in_wait_list, event_wait_list, &tmp); + cl_int retval = this->real_clEnqueueNDRangeKernel(command_queue, kernel, work_dim, gwo, gws, lws, + num_events_in_wait_list, event_wait_list, &tmp); // Set OpenCL event this->set_cl_kernel_event(tmp); - if(event != nullptr) + if (event != nullptr) { //return cl_event from the intercepted call clRetainEvent(tmp); @@ -164,10 +208,19 @@ cl::NDRange CLTuner::find_optimal_lws(ICLKernel &kernel) }; CLSymbols::get().clEnqueueNDRangeKernel_ptr = interceptor; - cl::NDRange gws = ICLKernel::gws_from_window(kernel.window()); - // Run the kernel with default lws to be used as baseline - kernel.run(kernel.window(), queue_profiler); + data->do_run(kernel, queue_profiler); + + /// Get the cached gws used by the kernel + /// NOTE: The window configured inside configure() is usually changed in run(). Thus we should not calculate gws + /// from this static window. Instead we get the real gws used (and cached) by run() in the previous step. + /// This is only a temporary workaround. An ideal solution involves decoupling the execution window from run() / run_op() + /// Please see COMPMID-5934 + cl::NDRange gws = kernel.get_cached_gws(); + ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL( + arm_compute::logging::LogLevel::INFO, + "[CLTuner] Kernel with config_id '%s' uses %s as the upper-bound for lws search", kernel.config_id().c_str(), + to_string(gws).c_str()); queue_profiler.finish(); @@ -176,28 +229,36 @@ cl::NDRange CLTuner::find_optimal_lws(ICLKernel &kernel) cl_ulong min_exec_time = end - start; _kernel_event = nullptr; - cl::NDRange opt_lws = cl::NullRange; + CLTuningParams opt_tuning_params(cl::NullRange, 0); - //Construct the list of LWS values to be tested based on the tuner mode. - auto lws_list = cl_tuner::CLLWSListFactory::get_lws_list(_tuner_mode, gws); - for(size_t i = 0; i < lws_list->size(); ++i) + // Construct the list of tuning parameters values to be tested based on the tuner mode. 
+ auto tuning_list = cl_tuner::get_tuning_parameters_list(_tuning_info, gws); + for (size_t i = 0; i < tuning_list->size(); ++i) { - cl::NDRange lws_test = (*lws_list)[i]; + CLTuningParams tuning_test = (*tuning_list)[i]; + // Setting the lws + cl::NDRange lws_test = tuning_test.get_lws(); auto x = lws_test[0]; auto y = lws_test[1]; auto z = lws_test[2]; const bool invalid_lws = (x * y * z > kernel.get_max_workgroup_size()) || (x == 1 && y == 1 && z == 1); - if(invalid_lws) + if (invalid_lws) { continue; } - //Set the Local-Workgroup-Size kernel.set_lws_hint(lws_test); + if (_tuning_info.tune_wbsm && CLKernelLibrary::get().is_wbsm_supported()) + { + cl_int wbsm_test = tuning_test.get_wbsm(); + kernel.set_wbsm_hint(wbsm_test); + } + ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "[CLTuner] Trying LWS: %s, WBSM: %d", + to_string(kernel.lws_hint()).c_str(), kernel.wbsm_hint()); // Run the kernel - kernel.run(kernel.window(), queue_profiler); + data->do_run(kernel, queue_profiler); queue_profiler.finish(); @@ -207,28 +268,31 @@ cl::NDRange CLTuner::find_optimal_lws(ICLKernel &kernel) _kernel_event = nullptr; // Check the execution time - if(diff < min_exec_time) + if (diff < min_exec_time) { min_exec_time = diff; - opt_lws = cl::NDRange(x, y, z); + opt_tuning_params.set_lws(tuning_test.get_lws()); + if (_tuning_info.tune_wbsm) + { + opt_tuning_params.set_wbsm(tuning_test.get_wbsm()); + } } } // Restore real function CLSymbols::get().clEnqueueNDRangeKernel_ptr = real_clEnqueueNDRangeKernel; - - return opt_lws; + return opt_tuning_params; } -void CLTuner::import_lws_table(const std::unordered_map<std::string, cl::NDRange> &lws_table) +const std::unordered_map<std::string, CLTuningParams> &CLTuner::tuning_params_table() const { - _lws_table.clear(); - _lws_table = lws_table; + return _tuning_params_table; } -const std::unordered_map<std::string, cl::NDRange> &CLTuner::lws_table() const +void CLTuner::import_tuning_params(const std::unordered_map<std::string, CLTuningParams> &tuning_params_table) { - return _lws_table; + _tuning_params_table.clear(); + _tuning_params_table = tuning_params_table; } void CLTuner::load_from_file(const std::string &filename) @@ -236,49 +300,79 @@ void CLTuner::load_from_file(const std::string &filename) std::ifstream fs; fs.exceptions(std::ifstream::badbit); fs.open(filename, std::ios::in); - if(!fs.is_open()) + if (!fs.is_open()) { ARM_COMPUTE_ERROR_VAR("Failed to open '%s' (%s [%d])", filename.c_str(), strerror(errno), errno); } std::string line; - while(!std::getline(fs, line).fail()) + bool header_line = true; + while (!std::getline(fs, line).fail()) { - std::istringstream ss(line); - std::string token; - if(std::getline(ss, token, ';').fail()) - { - ARM_COMPUTE_ERROR_VAR("Malformed row '%s' in %s (Should be of the form 'kernel_id;lws[0];lws[1];lws[2]')", ss.str().c_str(), filename.c_str()); - } - std::string kernel_id = token; - cl::NDRange lws(1, 1, 1); - for(int i = 0; i < 3; i++) + if (header_line) { - if(std::getline(ss, token, ';').fail()) + header_line = false; + size_t pos_lws = line.find("lws"); + size_t pos_wbsm = line.find("wbsm"); + _tuning_info.tune_wbsm = false; + if (pos_lws != std::string::npos || pos_wbsm != std::string::npos) { - ARM_COMPUTE_ERROR_VAR("Malformed row '%s' in %s (Should be of the form 'kernel_id;lws[0];lws[1];lws[2]')", ss.str().c_str(), filename.c_str()); + // The file has in the first line the parameters it has been tuned on + if (pos_wbsm != std::string::npos) + { + _tuning_info.tune_wbsm = true; + } + 
// Once the line with the tuning parameter is read we can + // read the next one to start collecting the values + if (std::getline(fs, line).fail()) + { + break; + } } - lws.get()[i] = support::cpp11::stoi(token); } - // If all dimensions are 0: reset to NullRange (i.e nullptr) - if(lws[0] == 0 && lws[1] == 0 && lws[2] == 0) + CLTuningParams tuning_params; + size_t pos = line.find(";"); + if (pos == std::string::npos) { - lws = cl::NullRange; + ARM_COMPUTE_ERROR_VAR("Malformed row '%s' in %s", line.c_str(), filename.c_str()); } - add_lws_to_table(kernel_id, lws); + std::string kernel_id = line.substr(0, pos); + line.erase(0, pos + 1); + if (!tuning_params.from_string(_tuning_info, line)) + { + ARM_COMPUTE_ERROR_VAR("Malformed row '%s' in %s", line.c_str(), filename.c_str()); + } + add_tuning_params(kernel_id, tuning_params); } fs.close(); } -void CLTuner::save_to_file(const std::string &filename) const +bool CLTuner::save_to_file(const std::string &filename) const { + if (!_tune_new_kernels || _tuning_params_table.empty() || filename.empty()) + { + return false; + } std::ofstream fs; fs.exceptions(std::ifstream::failbit | std::ifstream::badbit); fs.open(filename, std::ios::out); - for(auto const &kernel_data : _lws_table) + std::string header_string = ""; + header_string += "lws"; + if (_tuning_info.tune_wbsm) + { + if (!header_string.empty()) + { + header_string += " "; + } + header_string += "wbsm"; + } + fs << header_string << std::endl; + for (auto const &kernel_data : _tuning_params_table) { - fs << kernel_data.first << ";" << kernel_data.second[0] << ";" << kernel_data.second[1] << ";" << kernel_data.second[2] << std::endl; + CLTuningParams tun_pams(kernel_data.second); + fs << kernel_data.first << tun_pams.to_string(_tuning_info) << std::endl; } fs.close(); + return true; } } // namespace arm_compute diff --git a/src/runtime/CL/ICLSimpleFunction.cpp b/src/runtime/CL/ICLSimpleFunction.cpp index fb8eba8aa4..bc782c3a2c 100644 --- a/src/runtime/CL/ICLSimpleFunction.cpp +++ b/src/runtime/CL/ICLSimpleFunction.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 ARM Limited. + * Copyright (c) 2016-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -27,18 +27,21 @@ #include "arm_compute/runtime/CL/CLHelpers.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/core/CL/ICLKernel.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" + using namespace arm_compute; ICLSimpleFunction::ICLSimpleFunction(CLRuntimeContext *ctx) // NOLINT - : _kernel(), - _border_handler(), - _ctx(ctx) + : _kernel(), _border_handler(std::make_unique<CLFillBorderKernel>()), _ctx(ctx) { } +ICLSimpleFunction::~ICLSimpleFunction() = default; + void ICLSimpleFunction::run() { ARM_COMPUTE_ERROR_ON_MSG(!_kernel, "The child class didn't set the CL kernel or function isn't configured"); - schedule_kernel_on_ctx(_ctx, &_border_handler, false); + schedule_kernel_on_ctx(_ctx, _border_handler.get(), false); schedule_kernel_on_ctx(_ctx, _kernel.get()); } diff --git a/src/runtime/CL/TracePoint.cpp b/src/runtime/CL/TracePoint.cpp deleted file mode 100644 index 97029f532e..0000000000 --- a/src/runtime/CL/TracePoint.cpp +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Copyright (c) 2020 ARM Limited. 
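Putting the tuner pieces above together, a hedged end-to-end usage sketch for the reworked CLTuner (the file name is illustrative; save_to_file now reports through its bool return whether anything was written):

#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTuner.h"

using namespace arm_compute;

CLTuner tuner(true /* tune_new_kernels */);
tuner.set_tuner_mode(CLTunerMode::NORMAL); // stored in _tuning_info.tuner_mode now
CLScheduler::get().default_init(&tuner);   // every enqueued kernel consults the tuner

// ... configure and run CL functions; unseen config_ids trigger a parameter search ...

if (!tuner.save_to_file("acl_tuning_params.csv"))
{
    // nothing was tuned, tuning is disabled, or the file name is empty
}
// A later run can skip the search entirely:
// tuner.load_from_file("acl_tuning_params.csv");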
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/core/TracePoint.h" -#include "arm_compute/runtime/CL/CLArray.h" -#include "arm_compute/runtime/CL/CLPyramid.h" -#include "arm_compute/runtime/CL/functions/CLLSTMLayer.h" -#include "utils/TypePrinter.h" - -#include <cstdio> -#include <vector> - -namespace arm_compute -{ -TRACE_TO_STRING(CLPyramid) -TRACE_TO_STRING(LSTMParams<ICLTensor>) -TRACE_TO_STRING(CLCoordinates2DArray) -CONST_PTR_CLASS(CLPyramid) -CONST_PTR_CLASS(LSTMParams<ICLTensor>) -CONST_PTR_CLASS(CLCoordinates2DArray) -} // namespace arm_compute diff --git a/src/runtime/CL/Utils.cpp b/src/runtime/CL/Utils.cpp index 5e22dfd4eb..294396c28a 100644 --- a/src/runtime/CL/Utils.cpp +++ b/src/runtime/CL/Utils.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020 ARM Limited. + * Copyright (c) 2020-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,6 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ +#include "arm_compute/runtime/CL/Utils.h" + #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/runtime/CL/CLScheduler.h" @@ -33,20 +35,20 @@ namespace arm_compute void restore_program_cache_from_file(const std::string &filename) { std::ifstream cache_file(filename, std::ios::binary); - if(cache_file.is_open()) + if (cache_file.is_open()) { - if(!CLScheduler::get().is_initialised()) + if (!CLScheduler::get().is_initialised()) { arm_compute::CLScheduler::get().default_init(); } - while(!cache_file.eof()) + while (!cache_file.eof()) { size_t name_len = 0; size_t binary_len = 0; cache_file.read(reinterpret_cast<char *>(&name_len), sizeof(size_t)); cache_file.read(reinterpret_cast<char *>(&binary_len), sizeof(size_t)); - if(name_len == 0 || binary_len == 0) + if (name_len == 0 || binary_len == 0) { break; } @@ -58,7 +60,7 @@ void restore_program_cache_from_file(const std::string &filename) tmp.resize(binary_len); cache_file.read(reinterpret_cast<char *>(binary.data()), binary_len); cl::Context context = arm_compute::CLScheduler::get().context(); - cl::Program::Binaries binaries{ binary }; + cl::Program::Binaries binaries{binary}; std::vector<cl::Device> devices = context.getInfo<CL_CONTEXT_DEVICES>(); cl::Program program(context, devices, binaries); program.build(); @@ -70,12 +72,12 @@ void restore_program_cache_from_file(const std::string &filename) void save_program_cache_to_file(const std::string &filename) { - if(CLScheduler::get().is_initialised()) + if (CLScheduler::get().is_initialised()) { std::ofstream cache_file(filename, std::ios::binary); - if(cache_file.is_open()) + if (cache_file.is_open()) { - for(const auto &it : CLKernelLibrary::get().get_built_programs()) + for (const auto &it : CLKernelLibrary::get().get_built_programs()) { std::vector<std::vector<unsigned char>> binaries = it.second.getInfo<CL_PROGRAM_BINARIES>(); ARM_COMPUTE_ERROR_ON(binaries.size() != 1); diff --git a/src/runtime/CL/functions/CLAbsoluteDifference.cpp b/src/runtime/CL/functions/CLAbsoluteDifference.cpp deleted file mode 100644 index 492c54e4d3..0000000000 --- a/src/runtime/CL/functions/CLAbsoluteDifference.cpp +++ /dev/null @@ -1,43 +0,0 @@ -/* - * Copyright (c) 2016-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
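restore_program_cache_from_file() and save_program_cache_to_file() above give a simple warm-start path for the OpenCL program cache. A hedged usage sketch (the cache file name is illustrative):

#include "arm_compute/runtime/CL/Utils.h"

void run_with_binary_cache()
{
    // Load previously built program binaries, if present; this also
    // default-initialises the scheduler when needed, as the hunk above shows.
    arm_compute::restore_program_cache_from_file("cl_program_cache.bin");

    // ... configure and run CL functions: cached programs skip online compilation ...

    // Persist whatever the kernel library built during this run.
    arm_compute::save_program_cache_to_file("cl_program_cache.bin");
}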
- */ -#include "arm_compute/runtime/CL/functions/CLAbsoluteDifference.h" - -#include "arm_compute/core/CL/kernels/CLAbsoluteDifferenceKernel.h" -#include "support/MemorySupport.h" - -#include <utility> - -using namespace arm_compute; - -void CLAbsoluteDifference::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output) -{ - configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output); -} - -void CLAbsoluteDifference::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output) -{ - auto k = arm_compute::support::cpp14::make_unique<CLAbsoluteDifferenceKernel>(); - k->configure(compile_context, input1, input2, output); - _kernel = std::move(k); -} diff --git a/src/runtime/CL/functions/CLAccumulate.cpp b/src/runtime/CL/functions/CLAccumulate.cpp deleted file mode 100644 index a81d1d042b..0000000000 --- a/src/runtime/CL/functions/CLAccumulate.cpp +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Copyright (c) 2016-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "arm_compute/runtime/CL/functions/CLAccumulate.h" - -#include "arm_compute/core/CL/kernels/CLAccumulateKernel.h" -#include "support/MemorySupport.h" - -#include <utility> - -using namespace arm_compute; - -void CLAccumulate::configure(const ICLTensor *input, ICLTensor *accum) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, accum); -} - -void CLAccumulate::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *accum) -{ - auto k = arm_compute::support::cpp14::make_unique<CLAccumulateKernel>(); - k->configure(compile_context, input, accum); - _kernel = std::move(k); -} - -void CLAccumulateWeighted::configure(const ICLTensor *input, float alpha, ICLTensor *accum) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, alpha, accum); -} - -void CLAccumulateWeighted::configure(const CLCompileContext &compile_context, const ICLTensor *input, float alpha, ICLTensor *accum) -{ - auto k = arm_compute::support::cpp14::make_unique<CLAccumulateWeightedKernel>(); - k->configure(compile_context, input, alpha, accum); - _kernel = std::move(k); -} - -void CLAccumulateSquared::configure(const ICLTensor *input, uint32_t shift, ICLTensor *accum) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, shift, accum); -} - -void CLAccumulateSquared::configure(const CLCompileContext &compile_context, const ICLTensor *input, uint32_t shift, ICLTensor *accum) -{ - auto k = arm_compute::support::cpp14::make_unique<CLAccumulateSquaredKernel>(); - k->configure(compile_context, input, shift, accum); - _kernel = std::move(k); -} diff --git a/src/runtime/CL/functions/CLActivationLayer.cpp b/src/runtime/CL/functions/CLActivationLayer.cpp index 989603a9df..c035644e4a 100644 --- a/src/runtime/CL/functions/CLActivationLayer.cpp +++ b/src/runtime/CL/functions/CLActivationLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020 ARM Limited. + * Copyright (c) 2016-2021, 2023 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -23,32 +23,63 @@ */ #include "arm_compute/runtime/CL/functions/CLActivationLayer.h" -#include "arm_compute/core/CL/kernels/CLActivationLayerKernel.h" +#include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/function_info/ActivationLayerInfo.h" #include "arm_compute/runtime/CL/CLRuntimeContext.h" -#include "support/MemorySupport.h" + +#include "src/core/CL/ICLKernel.h" +#include "src/gpu/cl/operators/ClActivation.h" namespace arm_compute { -CLActivationLayer::CLActivationLayer(CLRuntimeContext *ctx) - : ICLSimpleFunction(ctx) +struct CLActivationLayer::Impl +{ + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + CLRuntimeContext *ctx{nullptr}; + std::unique_ptr<opencl::ClActivation> op{nullptr}; +}; + +CLActivationLayer::CLActivationLayer(CLRuntimeContext *ctx) : _impl(std::make_unique<Impl>()) { + _impl->ctx = ctx; } +CLActivationLayer::CLActivationLayer(CLActivationLayer &&) = default; +CLActivationLayer &CLActivationLayer::operator=(CLActivationLayer &&) = default; +CLActivationLayer::~CLActivationLayer() = default; void CLActivationLayer::configure(ICLTensor *input, ICLTensor *output, ActivationLayerInfo act_info) { configure(CLKernelLibrary::get().get_compile_context(), input, output, act_info); } -void CLActivationLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, ActivationLayerInfo act_info) +void CLActivationLayer::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output, + ActivationLayerInfo act_info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input); + + _impl->src = input; + _impl->dst = output == nullptr ? input : output; + + _impl->op = std::make_unique<opencl::ClActivation>(); + _impl->op->configure(compile_context, _impl->src->info(), _impl->dst->info(), act_info); +} + +Status +CLActivationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info) { - auto k = arm_compute::support::cpp14::make_unique<CLActivationLayerKernel>(); - k->configure(compile_context, input, output, act_info); - _kernel = std::move(k); + return opencl::ClActivation::validate(input, output, act_info); } -Status CLActivationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info) +void CLActivationLayer::run() { - return CLActivationLayerKernel::validate(input, output, act_info); + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC, _impl->src); + pack.add_tensor(TensorType::ACL_DST, _impl->dst); + _impl->op->run(pack); } } // namespace arm_compute diff --git a/src/runtime/CL/functions/CLArgMinMaxLayer.cpp b/src/runtime/CL/functions/CLArgMinMaxLayer.cpp index 5b4c694f33..f9bbd31e8a 100644 --- a/src/runtime/CL/functions/CLArgMinMaxLayer.cpp +++ b/src/runtime/CL/functions/CLArgMinMaxLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 ARM Limited. + * Copyright (c) 2018-2021, 2023 Arm Limited. 
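CLActivationLayer above now forwards to the operator-level opencl::ClActivation through a small Impl struct, but the public API is unchanged. A hedged usage sketch (tensor initialisation and allocation elided):

using namespace arm_compute;

CLTensor src, dst;
// ... initialise src/dst TensorInfo and allocate their backing memory ...

CLActivationLayer act;
act.configure(&src, &dst, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));
act.run(); // packs ACL_SRC/ACL_DST into an ITensorPack and runs ClActivation

// In-place execution is also supported: a nullptr output makes dst alias src,
// per the `output == nullptr ? input : output` line in the configure() hunk.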
* * SPDX-License-Identifier: MIT * @@ -27,80 +27,65 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "arm_compute/runtime/Utils.h" +#include "arm_compute/core/Validate.h" + +#include "src/common/utils/Log.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/CL/kernels/CLArgMinMaxLayerKernel.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/runtime/Utils.h" namespace arm_compute { CLArgMinMaxLayer::CLArgMinMaxLayer(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _results_vector(), _not_reshaped_output(), _reduction_kernels_vector(), _reshape_kernel(), _num_of_stages(), _reduction_axis() + : _memory_group(std::move(memory_manager)), + _not_reshaped_output(), + _arg_min_max_kernel(), + _reshape(), + _reduction_axis() { } -Status CLArgMinMaxLayer::validate(const ITensorInfo *input, int axis, const ITensorInfo *output, const ReductionOperation &op) +CLArgMinMaxLayer::~CLArgMinMaxLayer() = default; + +Status +CLArgMinMaxLayer::validate(const ITensorInfo *input, int axis, const ITensorInfo *output, const ReductionOperation &op) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(op != ReductionOperation::ARG_IDX_MAX && op != ReductionOperation::ARG_IDX_MIN, "Invalid reduction operation"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= static_cast<int>(TensorShape::num_max_dimensions), "Reduction axis greater than max number of dimensions"); + ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::S32, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(op != ReductionOperation::ARG_IDX_MAX && op != ReductionOperation::ARG_IDX_MIN, + "Invalid reduction operation"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= static_cast<int>(TensorShape::num_max_dimensions), + "Reduction axis greater than max number of dimensions"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis"); - const unsigned int num_of_stages = calculate_number_of_stages_only_x_axis(input->dimension(0), axis); DataType output_data_type = DataType::S32; TensorInfo not_reshaped_output; const auto input_num_channles = input->num_channels(); const auto input_qinfo = input->quantization_info(); - if(output->total_size() != 0) + if (output->total_size() != 0) { output_data_type = output->data_type(); - const TensorInfo expected_output_shape = output->clone()->set_tensor_shape(arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis, false)); + const TensorInfo expected_output_shape = output->clone()->set_tensor_shape( + arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis, false)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&expected_output_shape, output); } auto shape_before_reshape = input->tensor_shape(); shape_before_reshape.set(axis, 1); - auto initialize_tensorinfo = [](TensorInfo & ti, TensorShape shape, DataType data_type, int num_channels, QuantizationInfo qinfo) - { + auto initialize_tensorinfo = [](TensorInfo &ti, TensorShape shape, DataType data_type, int num_channels, + QuantizationInfo qinfo) { ti.set_data_type(data_type).set_tensor_shape(shape).set_num_channels(num_channels).set_quantization_info(qinfo); }; initialize_tensorinfo(not_reshaped_output, 
shape_before_reshape, output_data_type, input_num_channles, input_qinfo); - if(num_of_stages == 1) - { - ARM_COMPUTE_RETURN_ON_ERROR(CLArgMinMaxLayerKernel::validate(input, nullptr, &not_reshaped_output, axis, op)); - } - else - { - // Create temporary tensor infos - std::vector<TensorInfo> sums_vector(num_of_stages - 1); - - // Create intermediate tensor info - TensorShape shape{ input->tensor_shape() }; - - for(unsigned int i = 0; i < num_of_stages - 1; i++) - { - shape.set(0, ceil(shape.x() / 128.f)); - sums_vector[i].set_data_type(input->data_type()); - sums_vector[i].set_tensor_shape(shape); - sums_vector[i].set_num_channels(input->num_channels()); - } - - // Validate ReductionOperation only on first kernel - ARM_COMPUTE_RETURN_ON_ERROR(CLArgMinMaxLayerKernel::validate(input, nullptr, &sums_vector[0], axis, op)); - - // Validate ReductionOperation on intermediate stages - for(unsigned int i = 1; i < num_of_stages - 1; ++i) - { - ARM_COMPUTE_RETURN_ON_ERROR(CLArgMinMaxLayerKernel::validate(input, &sums_vector[i - 1], &sums_vector[i], axis, op)); - } - - // Validate ReductionOperation on the last stage - const unsigned int last_stage = num_of_stages - 1; - ARM_COMPUTE_RETURN_ON_ERROR(CLArgMinMaxLayerKernel::validate(input, &sums_vector[last_stage - 1], &not_reshaped_output, axis, op)); - } - ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayerKernel::validate(&not_reshaped_output, output)); + ARM_COMPUTE_RETURN_ON_ERROR(CLArgMinMaxLayerKernel::validate(input, &not_reshaped_output, axis, op)); + ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayer::validate(&not_reshaped_output, output)); return Status{}; } @@ -109,53 +94,43 @@ void CLArgMinMaxLayer::configure(const ICLTensor *input, int axis, ICLTensor *ou configure(CLKernelLibrary::get().get_compile_context(), input, axis, output, op); } -void CLArgMinMaxLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, int axis, ICLTensor *output, const ReductionOperation &op) +void CLArgMinMaxLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + int axis, + ICLTensor *output, + const ReductionOperation &op) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - _num_of_stages = calculate_number_of_stages_only_x_axis(input->info()->dimension(0), axis); - _reduction_axis = axis; + ARM_COMPUTE_LOG_PARAMS(input, axis, output, op); - const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis, false); - DataType output_data_type = (output->info()->data_type() == DataType::UNKNOWN) ? DataType::S32 : output->info()->data_type(); - auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape).set_data_type(output_data_type).reset_padding().set_is_resizable(true)); + _reduction_axis = axis; - // Configure reduction operation kernels - _reduction_kernels_vector.resize(_num_of_stages); + const TensorShape output_shape = + arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis, false); + DataType output_data_type = + (output->info()->data_type() == DataType::UNKNOWN) ?
DataType::S32 : output->info()->data_type(); + auto_init_if_empty(*output->info(), input->info() + ->clone() + ->set_tensor_shape(output_shape) + .set_data_type(output_data_type) + .reset_padding() + .set_is_resizable(true)); + + TensorShape not_reshaped_output_shape{input->info()->tensor_shape()}; + not_reshaped_output_shape.set(axis, 1); + auto_init_if_empty(*_not_reshaped_output.info(), input->info() + ->clone() + ->set_tensor_shape(not_reshaped_output_shape) + .set_data_type(output_data_type) + .reset_padding() + .set_is_resizable(true)); + + _arg_min_max_kernel = std::make_unique<CLArgMinMaxLayerKernel>(); + _arg_min_max_kernel->configure(compile_context, input, &_not_reshaped_output, axis, op); _memory_group.manage(&_not_reshaped_output); - // Create temporary tensors - if(_num_of_stages == 1) - { - _reduction_kernels_vector[0].configure(compile_context, input, nullptr, &_not_reshaped_output, axis, op); - } - else - { - _results_vector.resize(_num_of_stages - 1); - TensorShape shape{ input->info()->tensor_shape() }; - for(unsigned int i = 0; i < _num_of_stages - 1; i++) - { - shape.set(0, ceil(shape.x() / 128.f)); - _results_vector[i].allocator()->init(input->info()->clone()->set_tensor_shape(shape).set_data_type(output_data_type)); - } - - // Apply ReductionOperation only on first kernel - _memory_group.manage(&_results_vector[0]); - _reduction_kernels_vector[0].configure(compile_context, input, nullptr, &_results_vector[0], axis, op); - - // Apply ReductionOperation on intermediate stages - for(unsigned int i = 1; i < _num_of_stages - 1; ++i) - { - _memory_group.manage(&_results_vector[i]); - _reduction_kernels_vector[i].configure(compile_context, input, &_results_vector[i - 1], &_results_vector[i], axis, op); - _results_vector[i - 1].allocator()->allocate(); - } - - // Apply ReductionOperation on the last stage - const unsigned int last_stage = _num_of_stages - 1; - _reduction_kernels_vector[last_stage].configure(compile_context, input, &_results_vector[last_stage - 1], &_not_reshaped_output, axis, op); - _results_vector[last_stage - 1].allocator()->allocate(); - } - _reshape_kernel.configure(compile_context, &_not_reshaped_output, output); + + _reshape.configure(compile_context, &_not_reshaped_output, output); _not_reshaped_output.allocator()->allocate(); } @@ -163,10 +138,7 @@ void CLArgMinMaxLayer::run() { MemoryGroupResourceScope scope_mg(_memory_group); - for(unsigned int i = 0; i < _num_of_stages; ++i) - { - CLScheduler::get().enqueue(_reduction_kernels_vector[i], false); - } - CLScheduler::get().enqueue(_reshape_kernel, false); + CLScheduler::get().enqueue(*_arg_min_max_kernel, false); + _reshape.run(); } -} // namespace arm_compute
\ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp b/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp index 9fc51136b3..0c371c4171 100644 --- a/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp +++ b/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 ARM Limited. + * Copyright (c) 2017-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -30,35 +30,58 @@ #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" -using namespace arm_compute; +#include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLBatchNormalizationLayerKernel.h" +namespace arm_compute +{ CLBatchNormalizationLayer::CLBatchNormalizationLayer() - : _norm_kernel() + : _norm_kernel(std::make_unique<CLBatchNormalizationLayerKernel>()) { } -void CLBatchNormalizationLayer::configure(ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta, const ICLTensor *gamma, float epsilon, +CLBatchNormalizationLayer::~CLBatchNormalizationLayer() = default; + +void CLBatchNormalizationLayer::configure(ICLTensor *input, + ICLTensor *output, + const ICLTensor *mean, + const ICLTensor *var, + const ICLTensor *beta, + const ICLTensor *gamma, + float epsilon, ActivationLayerInfo act_info) { configure(CLKernelLibrary::get().get_compile_context(), input, output, mean, var, beta, gamma, epsilon, act_info); } -void CLBatchNormalizationLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta, - const ICLTensor *gamma, float epsilon, - ActivationLayerInfo act_info) +void CLBatchNormalizationLayer::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output, + const ICLTensor *mean, + const ICLTensor *var, + const ICLTensor *beta, + const ICLTensor *gamma, + float epsilon, + ActivationLayerInfo act_info) { - _norm_kernel.configure(compile_context, input, output, mean, var, beta, gamma, epsilon, act_info); + ARM_COMPUTE_LOG_PARAMS(input, output, mean, var, beta, gamma, epsilon, act_info); + _norm_kernel->configure(compile_context, input, output, mean, var, beta, gamma, epsilon, act_info); } -Status CLBatchNormalizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, - const ITensorInfo *mean, const ITensorInfo *var, - const ITensorInfo *beta, const ITensorInfo *gamma, - float epsilon, ActivationLayerInfo act_info) +Status CLBatchNormalizationLayer::validate(const ITensorInfo *input, + const ITensorInfo *output, + const ITensorInfo *mean, + const ITensorInfo *var, + const ITensorInfo *beta, + const ITensorInfo *gamma, + float epsilon, + ActivationLayerInfo act_info) { return CLBatchNormalizationLayerKernel::validate(input, output, mean, var, beta, gamma, epsilon, act_info); } void CLBatchNormalizationLayer::run() { - CLScheduler::get().enqueue(_norm_kernel, true); + CLScheduler::get().enqueue(*_norm_kernel, true); } +} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLBatchToSpaceLayer.cpp b/src/runtime/CL/functions/CLBatchToSpaceLayer.cpp index 0a2ae2a6e0..a3798daf61 100644 --- a/src/runtime/CL/functions/CLBatchToSpaceLayer.cpp +++ b/src/runtime/CL/functions/CLBatchToSpaceLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 ARM Limited. + * Copyright (c) 2018-2021, 2023 Arm Limited. 
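CLBatchNormalizationLayer above keeps its single fused kernel, now held through a unique_ptr so the kernel type can stay out of the public header. A hedged usage sketch showing the fused-activation parameter (tensor setup elided; the 1e-5f epsilon is an illustrative choice):

using namespace arm_compute;

CLTensor x, y, mean, var, beta, gamma;
// ... initialise and allocate the six tensors ...

CLBatchNormalizationLayer bn;
bn.configure(&x, &y, &mean, &var, &beta, &gamma, 1e-5f,
             ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.f));
bn.run(); // one enqueue: normalization and activation fused in CLBatchNormalizationLayerKernel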
* * SPDX-License-Identifier: MIT * @@ -30,44 +30,66 @@ #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" -using namespace arm_compute; +#include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLBatchToSpaceLayerKernel.h" -CLBatchToSpaceLayer::CLBatchToSpaceLayer() - : _batch_to_space_kernel() +namespace arm_compute +{ +CLBatchToSpaceLayer::CLBatchToSpaceLayer() : _batch_to_space_kernel(std::make_unique<CLBatchToSpaceLayerKernel>()) { } +CLBatchToSpaceLayer::~CLBatchToSpaceLayer() = default; + void CLBatchToSpaceLayer::configure(const ICLTensor *input, const ICLTensor *block_shape, ICLTensor *output) { - configure(CLKernelLibrary::get().get_compile_context(), input, block_shape, output); + ARM_COMPUTE_LOG_PARAMS(input, block_shape, output); + _batch_to_space_kernel->configure(CLKernelLibrary::get().get_compile_context(), input, block_shape, output); } -void CLBatchToSpaceLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *block_shape, ICLTensor *output) +void CLBatchToSpaceLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *block_shape, + ICLTensor *output) { - _batch_to_space_kernel.configure(compile_context, input, block_shape, output); + ARM_COMPUTE_LOG_PARAMS(input, block_shape, output); + _batch_to_space_kernel->configure(compile_context, input, block_shape, output); } -void CLBatchToSpaceLayer::configure(const ICLTensor *input, int32_t block_shape_x, int32_t block_shape_y, ICLTensor *output) +void CLBatchToSpaceLayer::configure( + const ICLTensor *input, int32_t block_shape_x, int32_t block_shape_y, ICLTensor *output, const CropInfo &crop_info) { - configure(CLKernelLibrary::get().get_compile_context(), input, block_shape_x, block_shape_y, output); + configure(CLKernelLibrary::get().get_compile_context(), input, block_shape_x, block_shape_y, output, crop_info); } -void CLBatchToSpaceLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, int32_t block_shape_x, int32_t block_shape_y, ICLTensor *output) +void CLBatchToSpaceLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + int32_t block_shape_x, + int32_t block_shape_y, + ICLTensor *output, + const CropInfo &crop_info) { - _batch_to_space_kernel.configure(compile_context, input, block_shape_x, block_shape_y, output); + ARM_COMPUTE_LOG_PARAMS(input, block_shape_x, block_shape_y, output); + _batch_to_space_kernel->configure(compile_context, input, block_shape_x, block_shape_y, output, crop_info); } -Status CLBatchToSpaceLayer::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *output) +Status +CLBatchToSpaceLayer::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *output) { return CLBatchToSpaceLayerKernel::validate(input, block_shape, output); } -Status CLBatchToSpaceLayer::validate(const ITensorInfo *input, int32_t block_shape_x, int32_t block_shape_y, const ITensorInfo *output) +Status CLBatchToSpaceLayer::validate(const ITensorInfo *input, + int32_t block_shape_x, + int32_t block_shape_y, + const ITensorInfo *output, + const CropInfo &crop_info) { - return CLBatchToSpaceLayerKernel::validate(input, block_shape_x, block_shape_y, output); + return CLBatchToSpaceLayerKernel::validate(input, block_shape_x, block_shape_y, output, crop_info); } void CLBatchToSpaceLayer::run() { - CLScheduler::get().enqueue(_batch_to_space_kernel, true); + 
CLScheduler::get().enqueue(*_batch_to_space_kernel, true); } +} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLBitwiseAnd.cpp b/src/runtime/CL/functions/CLBitwiseAnd.cpp index 1fa80f0a24..7bfd0e3677 100644 --- a/src/runtime/CL/functions/CLBitwiseAnd.cpp +++ b/src/runtime/CL/functions/CLBitwiseAnd.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020 ARM Limited. + * Copyright (c) 2016-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,21 +23,26 @@ */ #include "arm_compute/runtime/CL/functions/CLBitwiseAnd.h" -#include "arm_compute/core/CL/kernels/CLBitwiseAndKernel.h" -#include "support/MemorySupport.h" +#include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLBitwiseKernel.h" #include <utility> -using namespace arm_compute; - +namespace arm_compute +{ void CLBitwiseAnd::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output) { configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output); } -void CLBitwiseAnd::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output) +void CLBitwiseAnd::configure(const CLCompileContext &compile_context, + const ICLTensor *input1, + const ICLTensor *input2, + ICLTensor *output) { - auto k = arm_compute::support::cpp14::make_unique<CLBitwiseAndKernel>(); - k->configure(compile_context, input1, input2, output); + ARM_COMPUTE_LOG_PARAMS(input1, input2, output); + auto k = std::make_unique<CLBitwiseKernel>(); + k->configure(compile_context, input1, input2, output, BitwiseOperation::AND); _kernel = std::move(k); } +} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLBitwiseNot.cpp b/src/runtime/CL/functions/CLBitwiseNot.cpp index 46595191a0..9763915c02 100644 --- a/src/runtime/CL/functions/CLBitwiseNot.cpp +++ b/src/runtime/CL/functions/CLBitwiseNot.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020 ARM Limited. + * Copyright (c) 2016-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,13 +23,13 @@ */ #include "arm_compute/runtime/CL/functions/CLBitwiseNot.h" -#include "arm_compute/core/CL/kernels/CLBitwiseNotKernel.h" -#include "support/MemorySupport.h" +#include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLBitwiseKernel.h" #include <utility> -using namespace arm_compute; - +namespace arm_compute +{ void CLBitwiseNot::configure(const ICLTensor *input, ICLTensor *output) { configure(CLKernelLibrary::get().get_compile_context(), input, output); @@ -37,7 +37,9 @@ void CLBitwiseNot::configure(const ICLTensor *input, ICLTensor *output) void CLBitwiseNot::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output) { - auto k = arm_compute::support::cpp14::make_unique<CLBitwiseNotKernel>(); - k->configure(compile_context, input, output); + ARM_COMPUTE_LOG_PARAMS(input, output); + auto k = std::make_unique<CLBitwiseKernel>(); + k->configure(compile_context, input, nullptr, output, BitwiseOperation::NOT); _kernel = std::move(k); } +} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLBitwiseOr.cpp b/src/runtime/CL/functions/CLBitwiseOr.cpp index 8431140cb8..dd3171b982 100644 --- a/src/runtime/CL/functions/CLBitwiseOr.cpp +++ b/src/runtime/CL/functions/CLBitwiseOr.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020 ARM Limited. + * Copyright (c) 2016-2021 Arm Limited. 
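The four bitwise functions in this patch (AND and NOT above, OR and XOR below) collapse their per-operation kernels into the single CLBitwiseKernel selected by a BitwiseOperation tag. A hypothetical helper, not part of the change, that makes the shared shape explicit:

// Sketch: one kernel class serves all four ops; NOT is unary, so input2 is nullptr.
std::unique_ptr<CLBitwiseKernel> make_bitwise_kernel(const CLCompileContext &ctx,
                                                     const ICLTensor        *input1,
                                                     const ICLTensor        *input2, // nullptr for NOT
                                                     ICLTensor              *output,
                                                     BitwiseOperation        op)
{
    auto k = std::make_unique<CLBitwiseKernel>();
    k->configure(ctx, input1, input2, output, op);
    return k;
}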
* * SPDX-License-Identifier: MIT * @@ -23,21 +23,26 @@ */ #include "arm_compute/runtime/CL/functions/CLBitwiseOr.h" -#include "arm_compute/core/CL/kernels/CLBitwiseOrKernel.h" -#include "support/MemorySupport.h" +#include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLBitwiseKernel.h" #include <utility> -using namespace arm_compute; - +namespace arm_compute +{ void CLBitwiseOr::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output) { configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output); } -void CLBitwiseOr::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output) +void CLBitwiseOr::configure(const CLCompileContext &compile_context, + const ICLTensor *input1, + const ICLTensor *input2, + ICLTensor *output) { - auto k = arm_compute::support::cpp14::make_unique<CLBitwiseOrKernel>(); - k->configure(compile_context, input1, input2, output); + ARM_COMPUTE_LOG_PARAMS(input1, input2, output); + auto k = std::make_unique<CLBitwiseKernel>(); + k->configure(compile_context, input1, input2, output, BitwiseOperation::OR); _kernel = std::move(k); } +} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLBitwiseXor.cpp b/src/runtime/CL/functions/CLBitwiseXor.cpp index 0e0e7f2028..5bee4b37ec 100644 --- a/src/runtime/CL/functions/CLBitwiseXor.cpp +++ b/src/runtime/CL/functions/CLBitwiseXor.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020 ARM Limited. + * Copyright (c) 2016-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,21 +23,26 @@ */ #include "arm_compute/runtime/CL/functions/CLBitwiseXor.h" -#include "arm_compute/core/CL/kernels/CLBitwiseXorKernel.h" -#include "support/MemorySupport.h" +#include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLBitwiseKernel.h" #include <utility> -using namespace arm_compute; - +namespace arm_compute +{ void CLBitwiseXor::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output) { configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output); } -void CLBitwiseXor::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output) +void CLBitwiseXor::configure(const CLCompileContext &compile_context, + const ICLTensor *input1, + const ICLTensor *input2, + ICLTensor *output) { - auto k = arm_compute::support::cpp14::make_unique<CLBitwiseXorKernel>(); - k->configure(compile_context, input1, input2, output); + ARM_COMPUTE_LOG_PARAMS(input1, input2, output); + auto k = std::make_unique<CLBitwiseKernel>(); + k->configure(compile_context, input1, input2, output, BitwiseOperation::XOR); _kernel = std::move(k); } +} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLBoundingBoxTransform.cpp b/src/runtime/CL/functions/CLBoundingBoxTransform.cpp index 55bcde749c..76e626fd75 100644 --- a/src/runtime/CL/functions/CLBoundingBoxTransform.cpp +++ b/src/runtime/CL/functions/CLBoundingBoxTransform.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 ARM Limited. + * Copyright (c) 2018-2021 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -23,25 +23,37 @@ */ #include "arm_compute/runtime/CL/functions/CLBoundingBoxTransform.h" -#include "arm_compute/core/CL/kernels/CLBoundingBoxTransformKernel.h" -#include "support/MemorySupport.h" +#include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLBoundingBoxTransformKernel.h" namespace arm_compute { -void CLBoundingBoxTransform::configure(const ICLTensor *boxes, ICLTensor *pred_boxes, const ICLTensor *deltas, const BoundingBoxTransformInfo &info) +void CLBoundingBoxTransform::configure(const ICLTensor *boxes, + ICLTensor *pred_boxes, + const ICLTensor *deltas, + const BoundingBoxTransformInfo &info) { configure(CLKernelLibrary::get().get_compile_context(), boxes, pred_boxes, deltas, info); } -void CLBoundingBoxTransform::configure(const CLCompileContext &compile_context, const ICLTensor *boxes, ICLTensor *pred_boxes, const ICLTensor *deltas, const BoundingBoxTransformInfo &info) +void CLBoundingBoxTransform::configure(const CLCompileContext &compile_context, + const ICLTensor *boxes, + ICLTensor *pred_boxes, + const ICLTensor *deltas, + const BoundingBoxTransformInfo &info) { + ARM_COMPUTE_LOG_PARAMS(boxes, pred_boxes, deltas, info); + // Configure Bounding Box kernel - auto k = arm_compute::support::cpp14::make_unique<CLBoundingBoxTransformKernel>(); + auto k = std::make_unique<CLBoundingBoxTransformKernel>(); k->configure(compile_context, boxes, pred_boxes, deltas, info); _kernel = std::move(k); } -Status CLBoundingBoxTransform::validate(const ITensorInfo *boxes, const ITensorInfo *pred_boxes, const ITensorInfo *deltas, const BoundingBoxTransformInfo &info) +Status CLBoundingBoxTransform::validate(const ITensorInfo *boxes, + const ITensorInfo *pred_boxes, + const ITensorInfo *deltas, + const BoundingBoxTransformInfo &info) { return CLBoundingBoxTransformKernel::validate(boxes, pred_boxes, deltas, info); } diff --git a/src/runtime/CL/functions/CLBox3x3.cpp b/src/runtime/CL/functions/CLBox3x3.cpp deleted file mode 100644 index 72c822197c..0000000000 --- a/src/runtime/CL/functions/CLBox3x3.cpp +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Copyright (c) 2016-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
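CLBoundingBoxTransform above stays a thin ICLSimpleFunction wrapper, and its validate() can be used as a preflight before configure(). A hedged sketch; treat the BoundingBoxTransformInfo arguments (image width, image height, scale) as an assumption about the constructor rather than a documented recipe:

using namespace arm_compute;

CLTensor boxes, pred_boxes, deltas;
// ... initialise the proposal, delta and output tensors per the kernel's documentation ...

const BoundingBoxTransformInfo info(800.f /* img_width */, 600.f /* img_height */, 1.f /* scale */);
ARM_COMPUTE_ERROR_THROW_ON(
    CLBoundingBoxTransform::validate(boxes.info(), pred_boxes.info(), deltas.info(), info));

CLBoundingBoxTransform transform;
transform.configure(&boxes, &pred_boxes, &deltas, info);
transform.run();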
- */ -#include "arm_compute/runtime/CL/functions/CLBox3x3.h" - -#include "arm_compute/core/CL/kernels/CLBox3x3Kernel.h" -#include "arm_compute/core/PixelValue.h" -#include "support/MemorySupport.h" - -#include <utility> - -using namespace arm_compute; - -void CLBox3x3::configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output, border_mode, constant_border_value); -} - -void CLBox3x3::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value) -{ - auto k = arm_compute::support::cpp14::make_unique<CLBox3x3Kernel>(); - k->configure(compile_context, input, output, border_mode == BorderMode::UNDEFINED); - _kernel = std::move(k); - _border_handler.configure(compile_context, input, BorderSize(1), border_mode, PixelValue(constant_border_value)); -} diff --git a/src/runtime/CL/functions/CLCannyEdge.cpp b/src/runtime/CL/functions/CLCannyEdge.cpp deleted file mode 100644 index 0c8d3532aa..0000000000 --- a/src/runtime/CL/functions/CLCannyEdge.cpp +++ /dev/null @@ -1,209 +0,0 @@ -/* - * Copyright (c) 2017-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "arm_compute/runtime/CL/functions/CLCannyEdge.h" - -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/runtime/CL/CLScheduler.h" -#include "arm_compute/runtime/CL/functions/CLSobel3x3.h" -#include "arm_compute/runtime/CL/functions/CLSobel5x5.h" -#include "arm_compute/runtime/CL/functions/CLSobel7x7.h" -#include "support/MemorySupport.h" - -using namespace arm_compute; - -CLCannyEdge::CLCannyEdge(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT - : _memory_group(std::move(memory_manager)), - _sobel(), - _gradient(), - _border_mag_gradient(), - _non_max_suppr(), - _edge_trace(), - _gx(), - _gy(), - _mag(), - _phase(), - _nonmax(), - _visited(), - _recorded(), - _l1_list_counter(), - _l1_stack(), - _output(nullptr) -{ -} - -void CLCannyEdge::configure(ICLTensor *input, ICLTensor *output, int32_t upper_thr, int32_t lower_thr, int32_t gradient_size, int32_t norm_type, BorderMode border_mode, - uint8_t constant_border_value) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output, upper_thr, lower_thr, gradient_size, norm_type, border_mode, constant_border_value); -} - -void CLCannyEdge::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, int32_t upper_thr, int32_t lower_thr, int32_t gradient_size, int32_t norm_type, - BorderMode border_mode, - uint8_t constant_border_value) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON((1 != norm_type) && (2 != norm_type)); - ARM_COMPUTE_ERROR_ON((gradient_size != 3) && (gradient_size != 5) && (gradient_size != 7)); - ARM_COMPUTE_ERROR_ON((lower_thr < 0) || (lower_thr >= upper_thr)); - - _output = output; - - const unsigned int L1_hysteresis_stack_size = 8; - const TensorShape shape = input->info()->tensor_shape(); - - TensorInfo gradient_info; - TensorInfo info; - - // Initialize images - if(gradient_size < 7) - { - gradient_info.init(shape, 1, arm_compute::DataType::S16); - info.init(shape, 1, arm_compute::DataType::U16); - } - else - { - gradient_info.init(shape, 1, arm_compute::DataType::S32); - info.init(shape, 1, arm_compute::DataType::U32); - } - - _gx.allocator()->init(gradient_info); - _gy.allocator()->init(gradient_info); - _mag.allocator()->init(info); - _nonmax.allocator()->init(info); - - TensorInfo info_u8(shape, 1, arm_compute::DataType::U8); - _phase.allocator()->init(info_u8); - _l1_list_counter.allocator()->init(info_u8); - - TensorInfo info_u32(shape, 1, arm_compute::DataType::U32); - _visited.allocator()->init(info_u32); - _recorded.allocator()->init(info_u32); - - TensorShape shape_l1_stack = input->info()->tensor_shape(); - shape_l1_stack.set(0, input->info()->dimension(0) * L1_hysteresis_stack_size); - TensorInfo info_s32(shape_l1_stack, 1, arm_compute::DataType::S32); - _l1_stack.allocator()->init(info_s32); - - // Manage intermediate buffers - _memory_group.manage(&_gx); - _memory_group.manage(&_gy); - - // Configure/Init sobelNxN - if(gradient_size == 3) - { - auto k = arm_compute::support::cpp14::make_unique<CLSobel3x3>(); - k->configure(compile_context, input, &_gx, &_gy, border_mode, constant_border_value); - _sobel = std::move(k); - } - else if(gradient_size == 5) - { - auto k = arm_compute::support::cpp14::make_unique<CLSobel5x5>(); - 
k->configure(compile_context, input, &_gx, &_gy, border_mode, constant_border_value); - _sobel = std::move(k); - } - else if(gradient_size == 7) - { - auto k = arm_compute::support::cpp14::make_unique<CLSobel7x7>(); - k->configure(compile_context, input, &_gx, &_gy, border_mode, constant_border_value); - _sobel = std::move(k); - } - else - { - ARM_COMPUTE_ERROR_VAR("Gradient size %d not supported", gradient_size); - } - - // Manage intermediate buffers - _memory_group.manage(&_mag); - _memory_group.manage(&_phase); - - // Configure gradient - _gradient.configure(compile_context, &_gx, &_gy, &_mag, &_phase, norm_type); - - // Allocate intermediate buffers - _gx.allocator()->allocate(); - _gy.allocator()->allocate(); - - // Manage intermediate buffers - _memory_group.manage(&_nonmax); - - // Configure non-maxima suppression - _non_max_suppr.configure(compile_context, &_mag, &_phase, &_nonmax, lower_thr, border_mode == BorderMode::UNDEFINED); - - // Allocate intermediate buffers - _phase.allocator()->allocate(); - - // Fill border around magnitude image as non-maxima suppression will access - // it. If border mode is undefined filling the border is a nop. - _border_mag_gradient.configure(compile_context, &_mag, _non_max_suppr.border_size(), border_mode, constant_border_value); - - // Allocate intermediate buffers - _mag.allocator()->allocate(); - - // Manage intermediate buffers - _memory_group.manage(&_visited); - _memory_group.manage(&_recorded); - _memory_group.manage(&_l1_stack); - _memory_group.manage(&_l1_list_counter); - - // Configure edge tracing - _edge_trace.configure(compile_context, &_nonmax, output, upper_thr, lower_thr, &_visited, &_recorded, &_l1_stack, &_l1_list_counter); - - // Allocate intermediate buffers - _visited.allocator()->allocate(); - _recorded.allocator()->allocate(); - _l1_stack.allocator()->allocate(); - _l1_list_counter.allocator()->allocate(); - _nonmax.allocator()->allocate(); -} - -void CLCannyEdge::run() -{ - MemoryGroupResourceScope scope_mg(_memory_group); - - // Run sobel - _sobel->run(); - - // Run phase and magnitude calculation - CLScheduler::get().enqueue(_gradient, false); - - // Fill border before non-maxima suppression. Nop for border mode undefined. - CLScheduler::get().enqueue(_border_mag_gradient, false); - - // Run non max suppresion - _nonmax.clear(CLScheduler::get().queue()); - CLScheduler::get().enqueue(_non_max_suppr, false); - - // Clear temporary structures and run edge trace - _output->clear(CLScheduler::get().queue()); - _visited.clear(CLScheduler::get().queue()); - _recorded.clear(CLScheduler::get().queue()); - _l1_list_counter.clear(CLScheduler::get().queue()); - _l1_stack.clear(CLScheduler::get().queue()); - CLScheduler::get().enqueue(_edge_trace, true); -} diff --git a/src/runtime/CL/functions/CLCast.cpp b/src/runtime/CL/functions/CLCast.cpp index 7048a79bc5..42ec8f7ee0 100644 --- a/src/runtime/CL/functions/CLCast.cpp +++ b/src/runtime/CL/functions/CLCast.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 ARM Limited. + * Copyright (c) 2018-2021 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -23,27 +23,60 @@ */ #include "arm_compute/runtime/CL/functions/CLCast.h" -#include "arm_compute/core/CL/kernels/CLDepthConvertLayerKernel.h" -#include "support/MemorySupport.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/Validate.h" + +#include "src/common/utils/Log.h" +#include "src/core/CL/ICLKernel.h" +#include "src/gpu/cl/operators/ClCast.h" #include <utility> namespace arm_compute { +struct CLCast::Impl +{ + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<opencl::ClCast> op{nullptr}; +}; + +CLCast::CLCast() : _impl(std::make_unique<Impl>()) +{ +} +CLCast::CLCast(CLCast &&) = default; +CLCast &CLCast::operator=(CLCast &&) = default; +CLCast::~CLCast() = default; + void CLCast::configure(const ICLTensor *input, ICLTensor *output, ConvertPolicy policy) { configure(CLKernelLibrary::get().get_compile_context(), input, output, policy); } -void CLCast::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, ConvertPolicy policy) +void CLCast::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + ConvertPolicy policy) { - auto k = arm_compute::support::cpp14::make_unique<CLDepthConvertLayerKernel>(); - k->configure(compile_context, input, output, policy, 0); - _kernel = std::move(k); + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_LOG_PARAMS(input, output, policy); + + _impl->src = input; + _impl->dst = output; + + _impl->op = std::make_unique<opencl::ClCast>(); + _impl->op->configure(compile_context, _impl->src->info(), _impl->dst->info(), policy); } Status CLCast::validate(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy) { - return CLDepthConvertLayerKernel::validate(input, output, policy, 0); + return opencl::ClCast::validate(input, output, policy); +} + +void CLCast::run() +{ + ITensorPack pack = {{ACL_SRC, _impl->src}, {ACL_DST, _impl->dst}}; + _impl->op->run(pack); } } // namespace arm_compute diff --git a/src/runtime/CL/functions/CLChannelCombine.cpp b/src/runtime/CL/functions/CLChannelCombine.cpp deleted file mode 100644 index 249212e03b..0000000000 --- a/src/runtime/CL/functions/CLChannelCombine.cpp +++ /dev/null @@ -1,55 +0,0 @@ -/* - * Copyright (c) 2016-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "arm_compute/runtime/CL/functions/CLChannelCombine.h" - -#include "arm_compute/core/CL/kernels/CLChannelCombineKernel.h" -#include "support/MemorySupport.h" - -#include <utility> - -using namespace arm_compute; - -void CLChannelCombine::configure(const ICLTensor *plane0, const ICLTensor *plane1, const ICLTensor *plane2, const ICLTensor *plane3, ICLTensor *output) -{ - configure(CLKernelLibrary::get().get_compile_context(), plane0, plane1, plane2, plane3, output); -} - -void CLChannelCombine::configure(const CLCompileContext &compile_context, const ICLTensor *plane0, const ICLTensor *plane1, const ICLTensor *plane2, const ICLTensor *plane3, ICLTensor *output) -{ - auto k = arm_compute::support::cpp14::make_unique<CLChannelCombineKernel>(); - k->configure(compile_context, plane0, plane1, plane2, plane3, output); - _kernel = std::move(k); -} - -void CLChannelCombine::configure(const ICLImage *plane0, const ICLImage *plane1, const ICLImage *plane2, ICLMultiImage *output) -{ - configure(CLKernelLibrary::get().get_compile_context(), plane0, plane1, plane2, output); -} - -void CLChannelCombine::configure(const CLCompileContext &compile_context, const ICLImage *plane0, const ICLImage *plane1, const ICLImage *plane2, ICLMultiImage *output) -{ - auto k = arm_compute::support::cpp14::make_unique<CLChannelCombineKernel>(); - k->configure(compile_context, plane0, plane1, plane2, output); - _kernel = std::move(k); -} diff --git a/src/runtime/CL/functions/CLChannelExtract.cpp b/src/runtime/CL/functions/CLChannelExtract.cpp deleted file mode 100644 index 019e0a7a90..0000000000 --- a/src/runtime/CL/functions/CLChannelExtract.cpp +++ /dev/null @@ -1,55 +0,0 @@ -/* - * Copyright (c) 2016-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "arm_compute/runtime/CL/functions/CLChannelExtract.h" - -#include "arm_compute/core/CL/kernels/CLChannelExtractKernel.h" -#include "support/MemorySupport.h" - -#include <utility> - -using namespace arm_compute; - -void CLChannelExtract::configure(const ICLTensor *input, Channel channel, ICLTensor *output) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, channel, output); -} - -void CLChannelExtract::configure(const CLCompileContext &compile_context, const ICLTensor *input, Channel channel, ICLTensor *output) -{ - auto k = arm_compute::support::cpp14::make_unique<CLChannelExtractKernel>(); - k->configure(compile_context, input, channel, output); - _kernel = std::move(k); -} - -void CLChannelExtract::configure(const ICLMultiImage *input, Channel channel, ICLImage *output) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, channel, output); -} - -void CLChannelExtract::configure(const CLCompileContext &compile_context, const ICLMultiImage *input, Channel channel, ICLImage *output) -{ - auto k = arm_compute::support::cpp14::make_unique<CLChannelExtractKernel>(); - k->configure(compile_context, input, channel, output); - _kernel = std::move(k); -} diff --git a/src/runtime/CL/functions/CLChannelShuffleLayer.cpp b/src/runtime/CL/functions/CLChannelShuffleLayer.cpp index 93ab7c7ddf..1ee4789816 100644 --- a/src/runtime/CL/functions/CLChannelShuffleLayer.cpp +++ b/src/runtime/CL/functions/CLChannelShuffleLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 ARM Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,9 +23,10 @@ */ #include "arm_compute/runtime/CL/functions/CLChannelShuffleLayer.h" -#include "arm_compute/core/CL/kernels/CLChannelShuffleLayerKernel.h" #include "arm_compute/core/Types.h" -#include "support/MemorySupport.h" + +#include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLChannelShuffleLayerKernel.h" namespace arm_compute { @@ -34,9 +35,13 @@ void CLChannelShuffleLayer::configure(const ICLTensor *input, ICLTensor *output, configure(CLKernelLibrary::get().get_compile_context(), input, output, num_groups); } -void CLChannelShuffleLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, unsigned int num_groups) +void CLChannelShuffleLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + unsigned int num_groups) { - auto k = arm_compute::support::cpp14::make_unique<CLChannelShuffleLayerKernel>(); + ARM_COMPUTE_LOG_PARAMS(input, output, num_groups); + auto k = std::make_unique<CLChannelShuffleLayerKernel>(); k->configure(compile_context, input, output, num_groups); _kernel = std::move(k); } diff --git a/src/runtime/CL/functions/CLColorConvert.cpp b/src/runtime/CL/functions/CLColorConvert.cpp deleted file mode 100644 index b8e597751b..0000000000 --- a/src/runtime/CL/functions/CLColorConvert.cpp +++ /dev/null @@ -1,79 +0,0 @@ -/* - * Copyright (c) 2016-2020 ARM Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/CL/functions/CLColorConvert.h" - -#include "arm_compute/core/CL/kernels/CLColorConvertKernel.h" -#include "support/MemorySupport.h" - -#include <utility> - -using namespace arm_compute; - -void CLColorConvert::configure(const ICLTensor *input, ICLTensor *output) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output); -} - -void CLColorConvert::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output) -{ - auto k = arm_compute::support::cpp14::make_unique<CLColorConvertKernel>(); - k->configure(compile_context, input, output); - _kernel = std::move(k); -} - -void CLColorConvert::configure(const ICLImage *input, ICLMultiImage *output) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output); -} - -void CLColorConvert::configure(const CLCompileContext &compile_context, const ICLImage *input, ICLMultiImage *output) -{ - auto k = arm_compute::support::cpp14::make_unique<CLColorConvertKernel>(); - k->configure(compile_context, input, output); - _kernel = std::move(k); -} - -void CLColorConvert::configure(const ICLMultiImage *input, ICLImage *output) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output); -} - -void CLColorConvert::configure(const CLCompileContext &compile_context, const ICLMultiImage *input, ICLImage *output) -{ - auto k = arm_compute::support::cpp14::make_unique<CLColorConvertKernel>(); - k->configure(compile_context, input, output); - _kernel = std::move(k); -} - -void CLColorConvert::configure(const ICLMultiImage *input, ICLMultiImage *output) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output); -} - -void CLColorConvert::configure(const CLCompileContext &compile_context, const ICLMultiImage *input, ICLMultiImage *output) -{ - auto k = arm_compute::support::cpp14::make_unique<CLColorConvertKernel>(); - k->configure(compile_context, input, output); - _kernel = std::move(k); -} diff --git a/src/runtime/CL/functions/CLComparison.cpp b/src/runtime/CL/functions/CLComparison.cpp index 8d5ec3571d..2f54371e88 100644 --- a/src/runtime/CL/functions/CLComparison.cpp +++ b/src/runtime/CL/functions/CLComparison.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 ARM Limited. + * Copyright (c) 2018-2021 Arm Limited. 
 *
 * SPDX-License-Identifier: MIT
 *
@@ -24,9 +24,11 @@
 #include "arm_compute/runtime/CL/functions/CLComparison.h"
 
 #include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/kernels/CLComparisonKernel.h"
 #include "arm_compute/core/Types.h"
-#include "support/MemorySupport.h"
+
+#include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLComparisonKernel.h"
+#include "src/core/CL/kernels/CLFillBorderKernel.h"
 
 namespace arm_compute
 {
@@ -35,24 +37,33 @@ void CLComparison::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *ou
     configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, operation);
 }
 
-void CLComparison::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ComparisonOperation operation)
+void CLComparison::configure(const CLCompileContext &compile_context,
+                             ICLTensor              *input1,
+                             ICLTensor              *input2,
+                             ICLTensor              *output,
+                             ComparisonOperation     operation)
 {
-    auto k = arm_compute::support::cpp14::make_unique<CLComparisonKernel>();
+    ARM_COMPUTE_LOG_PARAMS(input1, input2, output, operation);
+    auto k = std::make_unique<CLComparisonKernel>();
     k->configure(compile_context, input1, input2, output, operation);
     _kernel = std::move(k);
 
-    if(output->info()->dimension(0) > 1)
+    if (output->info()->dimension(0) > 1)
     {
         ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2;
 
-        if(broadcasted_info->info()->dimension(0) == 1)
+        if (broadcasted_info->info()->dimension(0) == 1)
         {
-            _border_handler.configure(compile_context, broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
+            _border_handler->configure(compile_context, broadcasted_info, _kernel->border_size(),
+                                       BorderMode::REPLICATE);
         }
     }
 }
 
-Status CLComparison::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ComparisonOperation operation)
+Status CLComparison::validate(const ITensorInfo  *input1,
+                              const ITensorInfo  *input2,
+                              const ITensorInfo  *output,
+                              ComparisonOperation operation)
 {
     return CLComparisonKernel::validate(input1, input2, output, operation);
 }
@@ -64,25 +75,30 @@ void CLComparisonStatic<COP>::configure(ICLTensor *input1, ICLTensor *input2, IC
 }
 
 template <ComparisonOperation COP>
-void CLComparisonStatic<COP>::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output)
+void CLComparisonStatic<COP>::configure(const CLCompileContext &compile_context,
+                                        ICLTensor              *input1,
+                                        ICLTensor              *input2,
+                                        ICLTensor              *output)
 {
-    auto k = arm_compute::support::cpp14::make_unique<CLComparisonKernel>();
+    auto k = std::make_unique<CLComparisonKernel>();
    k->configure(compile_context, input1, input2, output, COP);
     _kernel = std::move(k);
 
-    if(output->info()->dimension(0) > 1)
+    if (output->info()->dimension(0) > 1)
    {
        ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2;
 
-        if(broadcasted_info->info()->dimension(0) == 1)
+        if (broadcasted_info->info()->dimension(0) == 1)
        {
-            _border_handler.configure(compile_context, broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
+            _border_handler->configure(compile_context, broadcasted_info, _kernel->border_size(),
+                                       BorderMode::REPLICATE);
        }
    }
 }
 
 template <ComparisonOperation COP>
-Status CLComparisonStatic<COP>::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+Status
+CLComparisonStatic<COP>::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
 {
     return CLComparisonKernel::validate(input1, input2, output, COP);
 }
diff --git a/src/runtime/CL/functions/CLConcatenateLayer.cpp b/src/runtime/CL/functions/CLConcatenateLayer.cpp
index e97256713f..9df1c34593 100644
--- a/src/runtime/CL/functions/CLConcatenateLayer.cpp
+++ b/src/runtime/CL/functions/CLConcatenateLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -23,238 +23,77 @@
 */
 #include "arm_compute/runtime/CL/functions/CLConcatenateLayer.h"
 
-#include "arm_compute/core/CL/kernels/CLBatchConcatenateLayerKernel.h"
-#include "arm_compute/core/CL/kernels/CLDepthConcatenateLayerKernel.h"
-#include "arm_compute/core/CL/kernels/CLHeightConcatenateLayerKernel.h"
-#include "arm_compute/core/CL/kernels/CLWidthConcatenate2TensorsKernel.h"
-#include "arm_compute/core/CL/kernels/CLWidthConcatenate4TensorsKernel.h"
-#include "arm_compute/core/CL/kernels/CLWidthConcatenateLayerKernel.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-
 #include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "support/MemorySupport.h"
+
+#include "src/common/utils/Log.h"
+#include "src/core/CL/ICLKernel.h"
+#include "src/gpu/cl/operators/ClConcatenate.h"
 
 namespace arm_compute
 {
-CLConcatenateLayer::CLConcatenateLayer()
-    : _concat_kernels(),
-      _num_inputs(0),
-      _axis(Window::DimX)
+struct CLConcatenateLayer::Impl
 {
-}
-
-void CLConcatenateLayer::configure(std::vector<ICLTensor *> &inputs_vector, ICLTensor *output, size_t axis)
+    std::vector<const ICLTensor *>         srcs{};
+    ICLTensor                             *dst{nullptr};
+    unsigned int                           num_inputs{0};
+    unsigned int                           axis{0};
+    std::unique_ptr<opencl::ClConcatenate> op{nullptr};
+};
+
+CLConcatenateLayer::CLConcatenateLayer() : _impl(std::make_unique<Impl>())
 {
-    configure(CLKernelLibrary::get().get_compile_context(), inputs_vector, output, axis);
 }
 
-void CLConcatenateLayer::configure(const CLCompileContext &compile_context, std::vector<ICLTensor *> &inputs_vector, ICLTensor *output, size_t axis)
-{
-    configure_internal(compile_context, std::move(inputs_vector), output, axis);
-}
+CLConcatenateLayer::CLConcatenateLayer(CLConcatenateLayer &&) = default;
 
-void CLConcatenateLayer::configure(std::vector<const ICLTensor *> &inputs_vector, ICLTensor *output, size_t axis)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), inputs_vector, output, axis);
-}
+CLConcatenateLayer &CLConcatenateLayer::operator=(CLConcatenateLayer &&) = default;
 
-void CLConcatenateLayer::configure(const CLCompileContext &compile_context, std::vector<const ICLTensor *> &inputs_vector, ICLTensor *output, size_t axis)
-{
-    configure_internal(compile_context, std::move(inputs_vector), output, axis);
-}
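The hunk above is representative of the whole changeset: the function-level state (the kernel vector, _num_inputs, _axis) moves into a pimpl struct that owns a stateless opencl::ClConcatenate operator, and the actual tensors are only bound at run time through an ITensorPack. A minimal usage sketch against the new public interface follows; the tensor names, shapes, and the default_init() call are illustrative assumptions, not taken from this diff.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Window.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLConcatenateLayer.h"

#include <vector>

using namespace arm_compute;

int main()
{
    // Assumed setup: create a default OpenCL context/queue for the sketch.
    CLScheduler::get().default_init();

    // Two illustrative 8x4 F32 inputs, concatenated along Y into an 8x8 output.
    CLTensor src0, src1, dst;
    src0.allocator()->init(TensorInfo(TensorShape(8U, 4U), 1, DataType::F32));
    src1.allocator()->init(TensorInfo(TensorShape(8U, 4U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(8U, 8U), 1, DataType::F32));

    // configure() now only records tensor infos in the underlying ClConcatenate.
    std::vector<const ICLTensor *> inputs = {&src0, &src1};
    CLConcatenateLayer             concat;
    concat.configure(inputs, &dst, Window::DimY);

    src0.allocator()->allocate();
    src1.allocator()->allocate();
    dst.allocator()->allocate();

    concat.run(); // builds the ITensorPack (ACL_SRC_VEC + i, ACL_DST) and dispatches it

    return 0;
}

The design point of the pack-based run() is that kernel configuration is decoupled from tensor memory, which is what allows the runtime layer to inject or reuse working memory per call rather than baking buffer pointers into the operator.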
+CLConcatenateLayer::~CLConcatenateLayer() = default; -Status CLConcatenateLayer::validate(const std::vector<ITensorInfo *> &inputs_vector, const ITensorInfo *output, size_t axis) -{ - return validate_internal(inputs_vector, output, axis); -} - -Status CLConcatenateLayer::validate(const std::vector<const ITensorInfo *> &inputs_vector, const ITensorInfo *output, size_t axis) +void CLConcatenateLayer::configure(std::vector<const ICLTensor *> &inputs_vector, ICLTensor *output, size_t axis) { - return validate_internal(inputs_vector, output, axis); + configure(CLKernelLibrary::get().get_compile_context(), inputs_vector, output, axis); } -template <typename TensorType> -void CLConcatenateLayer::configure_internal(const CLCompileContext &compile_context, std::vector<TensorType *> &&inputs_vector, ICLTensor *output, size_t axis) +void CLConcatenateLayer::configure(const CLCompileContext &compile_context, + std::vector<const ICLTensor *> &inputs_vector, + ICLTensor *output, + size_t axis) { ARM_COMPUTE_ERROR_ON(output == nullptr); - _axis = axis; - _num_inputs = inputs_vector.size(); + ARM_COMPUTE_LOG_PARAMS(inputs_vector, output, axis); - std::vector<ITensorInfo *> inputs_vector_info(inputs_vector.size()); - std::transform(inputs_vector.begin(), inputs_vector.end(), inputs_vector_info.begin(), [](TensorType * t) - { - ARM_COMPUTE_ERROR_ON_NULLPTR(t); - return t->info(); - }); - TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, _axis); + _impl->srcs = inputs_vector; + _impl->dst = output; + _impl->axis = axis; + _impl->num_inputs = inputs_vector.size(); + _impl->op = std::make_unique<opencl::ClConcatenate>(); - // Output auto inizialitation if not yet initialized - auto_init_if_empty(*output->info(), output_shape, 1, inputs_vector[0]->info()->data_type()); - ARM_COMPUTE_ERROR_THROW_ON(CLConcatenateLayer::validate(inputs_vector_info, output->info(), axis)); - - unsigned int offset = 0; - switch(_axis) + std::vector<ITensorInfo *> inputs_vector_info; + for (unsigned int i = 0; i < inputs_vector.size(); ++i) { - case Window::DimX: - { - switch(_num_inputs) - { - case 2: - { - // Configure WidthConcatenate2Tensors kernel - auto kernel = support::cpp14::make_unique<CLWidthConcatenate2TensorsKernel>(); - kernel->configure(compile_context, inputs_vector.at(0), inputs_vector.at(1), output); - _concat_kernels.emplace_back(std::move(kernel)); - break; - } - case 4: - { - // Configure WidthConcatenate4Tensors kernel - auto kernel = support::cpp14::make_unique<CLWidthConcatenate4TensorsKernel>(); - kernel->configure(compile_context, inputs_vector.at(0), inputs_vector.at(1), inputs_vector.at(2), inputs_vector.at(3), output); - _concat_kernels.emplace_back(std::move(kernel)); - break; - } - default: - { - // Configure generic case WidthConcatenate kernels - for(unsigned int i = 0; i < _num_inputs; ++i) - { - auto kernel = support::cpp14::make_unique<CLWidthConcatenateLayerKernel>(); - kernel->configure(compile_context, inputs_vector.at(i), offset, output); - offset += inputs_vector.at(i)->info()->dimension(_axis); - _concat_kernels.emplace_back(std::move(kernel)); - } - break; - } - } - break; - } - case Window::DimY: - { - for(unsigned int i = 0; i < _num_inputs; ++i) - { - auto kernel = support::cpp14::make_unique<CLHeightConcatenateLayerKernel>(); - kernel->configure(compile_context, inputs_vector.at(i), offset, output); - offset += inputs_vector.at(i)->info()->dimension(_axis); - _concat_kernels.emplace_back(std::move(kernel)); - } - break; - } - case 
Window::DimZ: - { - for(unsigned int i = 0; i < _num_inputs; ++i) - { - auto kernel = support::cpp14::make_unique<CLDepthConcatenateLayerKernel>(); - kernel->configure(compile_context, inputs_vector.at(i), offset, output); - offset += inputs_vector.at(i)->info()->dimension(_axis); - _concat_kernels.emplace_back(std::move(kernel)); - } - break; - } - case 3: - { - for(unsigned int i = 0; i < _num_inputs; ++i) - { - auto kernel = support::cpp14::make_unique<CLBatchConcatenateLayerKernel>(); - kernel->configure(compile_context, inputs_vector.at(i), offset, output); - offset += inputs_vector.at(i)->info()->dimension(_axis); - _concat_kernels.emplace_back(std::move(kernel)); - } - break; - } - default: - ARM_COMPUTE_ERROR("Axis not supported"); + ARM_COMPUTE_ERROR_ON_NULLPTR(inputs_vector.at(i)); + inputs_vector_info.emplace_back(inputs_vector.at(i)->info()); } + _impl->op->configure(compile_context, inputs_vector_info, _impl->dst->info(), axis); } -template <typename TensorInfoType> -Status CLConcatenateLayer::validate_internal(const std::vector<TensorInfoType *> &inputs_vector, const ITensorInfo *output, size_t axis) +Status CLConcatenateLayer::validate(const std::vector<const ITensorInfo *> &inputs_vector, + const ITensorInfo *output, + size_t axis) { - ARM_COMPUTE_RETURN_ERROR_ON(output == nullptr); - const unsigned int num_inputs = inputs_vector.size(); - - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output); - ARM_COMPUTE_RETURN_ERROR_ON(num_inputs < 2); - - unsigned int offset = 0; - switch(axis) - { - case Window::DimX: - { - switch(num_inputs) - { - case 2: - // Validate WidthConcatenate2Tensors kernels if there are 2 inputs - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(inputs_vector[0], inputs_vector[1]); - ARM_COMPUTE_RETURN_ON_ERROR(CLWidthConcatenate2TensorsKernel::validate(inputs_vector[0], inputs_vector[1], output)); - break; - case 4: - // Validate WidthConcatenate4Tensors kernels if there are 4 inputs - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(inputs_vector[0], inputs_vector[1], inputs_vector[2], inputs_vector[3]); - ARM_COMPUTE_RETURN_ON_ERROR(CLWidthConcatenate4TensorsKernel::validate(inputs_vector[0], inputs_vector[1], inputs_vector[2], inputs_vector[3], output)); - break; - default: - // Validate generic case of WidthConcatenate kernel - for(const auto &input : inputs_vector) - { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input); - ARM_COMPUTE_RETURN_ON_ERROR(CLWidthConcatenateLayerKernel::validate(input, offset, output)); - offset += input->dimension(axis); - } - break; - } - break; - } - case Window::DimY: - { - for(const auto &input : inputs_vector) - { - ARM_COMPUTE_RETURN_ON_ERROR(CLHeightConcatenateLayerKernel::validate(input, offset, output)); - offset += input->dimension(axis); - } - break; - } - case Window::DimZ: - { - for(const auto &input : inputs_vector) - { - ARM_COMPUTE_RETURN_ON_ERROR(CLDepthConcatenateLayerKernel::validate(input, offset, output)); - offset += input->dimension(axis); - } - break; - } - case 3: - { - for(const auto &input : inputs_vector) - { - ARM_COMPUTE_RETURN_ON_ERROR(CLBatchConcatenateLayerKernel::validate(input, offset, output)); - offset += input->dimension(axis); - } - break; - } - default: - ARM_COMPUTE_ERROR("Axis not supported"); - } - - if(output->total_size() != 0) - { - TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, axis); - ARM_COMPUTE_RETURN_ERROR_ON(output_shape.total_size() != output->tensor_shape().total_size()); - } - - return Status{}; + return 
opencl::ClConcatenate::validate(inputs_vector, output, axis); } void CLConcatenateLayer::run() { - for(auto &kernel : _concat_kernels) + ITensorPack pack; + for (unsigned i = 0; i < _impl->num_inputs; ++i) { - CLScheduler::get().enqueue(*kernel, true); + pack.add_tensor(TensorType::ACL_SRC_VEC + i, _impl->srcs.at(i)); } + pack.add_tensor(TensorType::ACL_DST, _impl->dst); + + _impl->op->run(pack); } } // namespace arm_compute diff --git a/src/runtime/CL/functions/CLConv3D.cpp b/src/runtime/CL/functions/CLConv3D.cpp new file mode 100644 index 0000000000..9d1b368f72 --- /dev/null +++ b/src/runtime/CL/functions/CLConv3D.cpp @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/CL/functions/CLConv3D.h" + +#include "arm_compute/core/CL/ICLTensor.h" + +#include "src/gpu/cl/operators/ClDirectConv3d.h" + +namespace arm_compute +{ +using namespace arm_compute::experimental; + +struct CLConv3D::Impl +{ + const ICLTensor *src{nullptr}; + const ICLTensor *weights{nullptr}; + const ICLTensor *biases{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<opencl::ClDirectConv3d> op{nullptr}; +}; + +CLConv3D::CLConv3D() : _impl(std::make_unique<Impl>()) +{ +} + +CLConv3D::~CLConv3D() = default; + +void CLConv3D::configure(const ICLTensor *src, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *dst, + const Conv3dInfo &conv3d_info) +{ + configure(CLKernelLibrary::get().get_compile_context(), src, weights, biases, dst, conv3d_info); +} + +void CLConv3D::configure(const CLCompileContext &compile_context, + const ICLTensor *src, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *dst, + const Conv3dInfo &conv3d_info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst); + ARM_COMPUTE_ERROR_THROW_ON(CLConv3D::validate( + src->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), dst->info(), conv3d_info)); + + _impl->src = src; + _impl->weights = weights; + _impl->biases = biases; + _impl->dst = dst; + + _impl->op = std::make_unique<opencl::ClDirectConv3d>(); + _impl->op->configure(compile_context, _impl->src->info(), _impl->weights->info(), + _impl->biases ? 
_impl->biases->info() : nullptr, _impl->dst->info(), conv3d_info); +} + +Status CLConv3D::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const Conv3dInfo &conv3d_info) +{ + return opencl::ClDirectConv3d::validate(src, weights, biases, dst, conv3d_info); +} + +void CLConv3D::run() +{ + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC_0, _impl->src); + pack.add_tensor(TensorType::ACL_SRC_1, _impl->weights); + pack.add_tensor(TensorType::ACL_SRC_2, _impl->biases); + pack.add_tensor(TensorType::ACL_DST, _impl->dst); + _impl->op->run(pack); +} +} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLConvertFullyConnectedWeights.cpp b/src/runtime/CL/functions/CLConvertFullyConnectedWeights.cpp index 68c0fb6ebf..2298f2a669 100644 --- a/src/runtime/CL/functions/CLConvertFullyConnectedWeights.cpp +++ b/src/runtime/CL/functions/CLConvertFullyConnectedWeights.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 ARM Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,25 +23,64 @@ */ #include "arm_compute/runtime/CL/functions/CLConvertFullyConnectedWeights.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" + +#include "src/common/utils/Log.h" +#include "src/core/CL/ICLKernel.h" +#include "src/gpu/cl/operators/ClConvertFullyConnectedWeights.h" + namespace arm_compute { -void CLConvertFullyConnectedWeights::configure(const ICLTensor *input, ICLTensor *output, const TensorShape &original_input_shape, - DataLayout data_layout) +struct CLConvertFullyConnectedWeights::Impl +{ + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<opencl::ClConvertFullyConnectedWeights> op{nullptr}; +}; +CLConvertFullyConnectedWeights::CLConvertFullyConnectedWeights() : _impl(std::make_unique<Impl>()) +{ +} +CLConvertFullyConnectedWeights::~CLConvertFullyConnectedWeights() = default; + +void CLConvertFullyConnectedWeights::configure(const ICLTensor *input, + ICLTensor *output, + const TensorShape &original_input_shape, + DataLayout data_layout) { configure(CLKernelLibrary::get().get_compile_context(), input, output, original_input_shape, data_layout); } -void CLConvertFullyConnectedWeights::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const TensorShape &original_input_shape, - DataLayout data_layout) +void CLConvertFullyConnectedWeights::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const TensorShape &original_input_shape, + DataLayout data_layout) { - auto k = arm_compute::support::cpp14::make_unique<CLConvertFullyConnectedWeightsKernel>(); - k->configure(compile_context, input, output, original_input_shape, data_layout); - _kernel = std::move(k); + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_LOG_PARAMS(input, output, original_input_shape, data_layout); + _impl->src = input; + _impl->dst = output; + _impl->op = std::make_unique<opencl::ClConvertFullyConnectedWeights>(); + _impl->op->configure(compile_context, _impl->src->info(), _impl->dst->info(), original_input_shape, data_layout); } -Status CLConvertFullyConnectedWeights::validate(const ITensorInfo *input, const ITensorInfo *output, const TensorShape &original_input_shape, - DataLayout data_layout) +Status CLConvertFullyConnectedWeights::validate(const ITensorInfo *input, + const 
ITensorInfo *output, + const TensorShape &original_input_shape, + DataLayout data_layout) { - return CLConvertFullyConnectedWeightsKernel::validate(input, output, original_input_shape, data_layout); + return opencl::ClConvertFullyConnectedWeights::validate(input, output, original_input_shape, data_layout); } -} // namespace arm_compute
\ No newline at end of file + +void CLConvertFullyConnectedWeights::run() +{ + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC, _impl->src); + pack.add_tensor(TensorType::ACL_DST, _impl->dst); + _impl->op->run(pack); +} + +} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLConvolution.cpp b/src/runtime/CL/functions/CLConvolution.cpp deleted file mode 100644 index 2b0d7d5e53..0000000000 --- a/src/runtime/CL/functions/CLConvolution.cpp +++ /dev/null @@ -1,139 +0,0 @@ -/* - * Copyright (c) 2016-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/CL/functions/CLConvolution.h" - -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLConvolutionKernel.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/PixelValue.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/runtime/CL/CLScheduler.h" -#include "arm_compute/runtime/ITensorAllocator.h" -#include "support/MemorySupport.h" - -#include <utility> - -using namespace arm_compute; - -void CLConvolution3x3::configure(ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t scale, BorderMode border_mode, uint8_t constant_border_value) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output, conv, scale, border_mode, constant_border_value); -} - -void CLConvolution3x3::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t scale, BorderMode border_mode, - uint8_t constant_border_value) -{ - auto k = arm_compute::support::cpp14::make_unique<CLConvolution3x3Kernel>(); - k->configure(compile_context, input, output, conv, scale, border_mode == BorderMode::UNDEFINED); - _kernel = std::move(k); - _border_handler.configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); -} - -template <unsigned int matrix_size> -CLConvolutionSquare<matrix_size>::CLConvolutionSquare(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _tmp(), _is_separable(false), _kernel_hor(), _kernel_vert(), _kernel(), _border_handler() -{ -} - -template <unsigned int matrix_size> -void CLConvolutionSquare<matrix_size>::configure(ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t scale, BorderMode border_mode, - uint8_t 
constant_border_value) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output, conv, scale, border_mode, constant_border_value); -} - -template <unsigned int matrix_size> -void CLConvolutionSquare<matrix_size>::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t scale, BorderMode border_mode, - uint8_t constant_border_value) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON(conv == nullptr); - std::array<int16_t, matrix_size> conv_col{ 0 }; - std::array<int16_t, matrix_size> conv_row{ 0 }; - _is_separable = separate_matrix(conv, conv_col.data(), conv_row.data(), matrix_size); - - if(_is_separable) - { - std::pair<DataType, DataType> type_pair = data_type_for_convolution(conv_col.data(), conv_row.data(), matrix_size); - _tmp.allocator()->init(TensorInfo(input->info()->tensor_shape(), 1, type_pair.first)); - - // Manage intermediate buffers - _memory_group.manage(&_tmp); - - if(scale == 0) - { - scale = calculate_matrix_scale(conv, matrix_size); - } - - _kernel_hor.configure(compile_context, input, &_tmp, conv_row.data(), border_mode == BorderMode::UNDEFINED); - _kernel_vert.configure(compile_context, &_tmp, output, conv_col.data(), scale, border_mode == BorderMode::UNDEFINED, type_pair.second); - _border_handler.configure(compile_context, input, _kernel_hor.border_size(), border_mode, PixelValue(constant_border_value)); - - // Allocate intermediate buffer - _tmp.allocator()->allocate(); - } - else - { - _kernel.configure(compile_context, input, output, conv, scale, border_mode == BorderMode::UNDEFINED); - _border_handler.configure(compile_context, input, _kernel.border_size(), border_mode, PixelValue(constant_border_value)); - } -} - -template <unsigned int matrix_size> -void CLConvolutionSquare<matrix_size>::run() -{ - CLScheduler::get().enqueue(_border_handler); - - if(_is_separable) - { - MemoryGroupResourceScope scope_mg(_memory_group); - - CLScheduler::get().enqueue(_kernel_hor, false); - CLScheduler::get().enqueue(_kernel_vert); - } - else - { - CLScheduler::get().enqueue(_kernel); - } -} - -template class arm_compute::CLConvolutionSquare<5>; -template class arm_compute::CLConvolutionSquare<7>; -template class arm_compute::CLConvolutionSquare<9>; - -void CLConvolutionRectangle::configure(ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t rows, uint32_t cols, uint32_t scale, BorderMode border_mode, uint8_t constant_border_value) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output, conv, rows, cols, scale, border_mode, constant_border_value); -} - -void CLConvolutionRectangle::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t rows, uint32_t cols, uint32_t scale, - BorderMode border_mode, uint8_t constant_border_value) -{ - auto k = arm_compute::support::cpp14::make_unique<CLConvolutionRectangleKernel>(); - k->configure(compile_context, input, output, conv, rows, cols, scale, border_mode == BorderMode::UNDEFINED); - _kernel = std::move(k); - _border_handler.configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); -} diff --git a/src/runtime/CL/functions/CLConvolutionLayer.cpp b/src/runtime/CL/functions/CLConvolutionLayer.cpp index b6e1413f7a..7767b45a01 100644 --- a/src/runtime/CL/functions/CLConvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLConvolutionLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright 
(c) 2017-2020 ARM Limited. + * Copyright (c) 2017-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,113 +23,149 @@ */ #include "arm_compute/runtime/CL/functions/CLConvolutionLayer.h" -#include "arm_compute/core/PixelValue.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/KernelDescriptors.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "arm_compute/core/utils/quantization/AsymmHelpers.h" -#include "arm_compute/runtime/CL/CLScheduler.h" +#include "arm_compute/runtime/CL/functions/CLFFTConvolutionLayer.h" -#include <cmath> -#include <memory> -#include <tuple> +#include "src/common/utils/Log.h" +#include "src/core/CL/ICLKernel.h" +#include "src/core/helpers/MemoryHelpers.h" +#include "src/gpu/cl/operators/ClConv2d.h" +#include "support/Cast.h" namespace arm_compute { using namespace arm_compute::misc::shape_calculator; - -CLConvolutionLayer::CLConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_manager(std::move(memory_manager)), _function() +using namespace arm_compute::experimental; +struct CLConvolutionLayer::Impl +{ + MemoryGroup memory_group{}; + std::shared_ptr<IMemoryManager> memory_manager{}; + std::unique_ptr<opencl::IClOperator> op{nullptr}; + ITensorPack run_pack{}; + ITensorPack prep_pack{}; + WorkspaceData<CLTensor> workspace{}; + experimental::MemoryRequirements aux_mem_req{}; + std::unique_ptr<IFunction> func{nullptr}; +}; + +CLConvolutionLayer::CLConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager) : _impl(std::make_unique<Impl>()) { + _impl->memory_manager = std::move(memory_manager); } -void CLConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info, - const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups) +CLConvolutionLayer::~CLConvolutionLayer() = default; + +void CLConvolutionLayer::configure(ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info, + bool enable_fast_math, + unsigned int num_groups) { - configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, weights_info, dilation, act_info, enable_fast_math, num_groups); + configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, weights_info, + dilation, act_info, enable_fast_math, num_groups); } -void CLConvolutionLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, - const WeightsInfo &weights_info, - const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups) +void CLConvolutionLayer::configure(const CLCompileContext &compile_context, + ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info, + bool enable_fast_math, + unsigned int num_groups) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); - 
ARM_COMPUTE_ERROR_THROW_ON(CLConvolutionLayer::validate(input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info, weights_info, dilation, act_info,
-                                                            enable_fast_math, num_groups));
+    ARM_COMPUTE_ERROR_THROW_ON(CLConvolutionLayer::validate(
+        input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info,
+        weights_info, dilation, act_info, enable_fast_math, num_groups));
+    ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, weights_info, dilation, act_info,
+                           enable_fast_math, num_groups);
+
+    const Conv2dInfo conv2d_info = Conv2dInfo(conv_info, dilation, act_info, enable_fast_math, num_groups);
-    switch(CLConvolutionLayer::get_convolution_method(input->info(), weights->info(), output->info(), conv_info,
-                                                      weights_info, act_info, CLScheduler::get().target(), dilation, enable_fast_math))
+    switch (opencl::ClConv2d::get_convolution_method(input->info(), weights->info(), output->info(), conv2d_info,
+                                                     weights_info, CLScheduler::get().target()))
     {
         case ConvolutionMethod::WINOGRAD:
-        {
-            ARM_COMPUTE_ERROR_ON(num_groups != 1);
-            auto f = arm_compute::support::cpp14::make_unique<CLWinogradConvolutionLayer>(_memory_manager);
-            f->configure(compile_context, input, weights, biases, output, conv_info, act_info, enable_fast_math);
-            _function = std::move(f);
-            break;
-        }
         case ConvolutionMethod::DIRECT:
-        {
-            ARM_COMPUTE_ERROR_ON(num_groups != 1);
-            auto f = arm_compute::support::cpp14::make_unique<CLDirectConvolutionLayer>();
-            f->configure(compile_context, input, weights, biases, output, conv_info, act_info);
-            _function = std::move(f);
-            break;
-        }
+        case ConvolutionMethod::INDIRECT:
         case ConvolutionMethod::GEMM:
         {
-            auto f = arm_compute::support::cpp14::make_unique<CLGEMMConvolutionLayer>(_memory_manager);
-            f->configure(compile_context, input, weights, biases, output, conv_info, weights_info, dilation, act_info, num_groups);
-            _function = std::move(f);
+            auto f = std::make_unique<opencl::ClConv2d>();
+            f->configure(compile_context, input->info(), weights->info(),
+                         ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv2d_info, weights_info);
+            _impl->op = std::move(f);
             break;
         }
         case ConvolutionMethod::FFT:
         {
-            auto f = arm_compute::support::cpp14::make_unique<CLFFTConvolutionLayer>(_memory_manager);
-            f->configure(compile_context, input, weights, biases, output, conv_info, act_info);
-            _function = std::move(f);
+            auto f = std::make_unique<CLFFTConvolutionLayer>(_impl->memory_manager);
+            f->configure(compile_context, input, weights, biases, output, conv_info, act_info, enable_fast_math);
+            _impl->func = std::move(f);
             break;
         }
         default:
             ARM_COMPUTE_ERROR("Not supported.");
             break;
     }
+
+    if (_impl->op)
+    {
+        _impl->memory_group = MemoryGroup(std::move(_impl->memory_manager));
+        _impl->aux_mem_req  = _impl->op->workspace();
+        _impl->run_pack     = {{ACL_SRC_0, input}, {ACL_SRC_1, weights}, {ACL_SRC_2, biases}, {ACL_DST, output}};
+        _impl->prep_pack    = {{ACL_SRC_1, weights}, {ACL_SRC_2, biases}};
+        _impl->workspace =
+            manage_workspace<CLTensor>(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack);
+    }
 }
-Status CLConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
-                                    const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups)
+Status CLConvolutionLayer::validate(const ITensorInfo         *input,
+                                    const ITensorInfo         *weights,
+                                    const ITensorInfo         *biases,
+                                    const ITensorInfo         *output,
+                                    const PadStrideInfo       &conv_info,
+                                    const WeightsInfo         &weights_info,
+                                    const Size2D              &dilation,
+                                    const ActivationLayerInfo &act_info,
+                                    bool                       enable_fast_math,
+                                    unsigned int               num_groups)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG((num_groups != 1) && (input->data_layout() != DataLayout::NCHW), "Grouping (num_groups != 1) with NHWC data layout is not supported");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(!weights->are_values_constant(), "Dynamic weights are not supported");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((num_groups != 1) && (input->data_layout() != DataLayout::NCHW),
+                                    "Grouping (num_groups != 1) with NHWC data layout is not supported");
-    const GPUTarget gpu_target = CLScheduler::get().target();
+    const GPUTarget  gpu_target  = CLScheduler::get().target();
+    const Conv2dInfo conv2d_info = Conv2dInfo(conv_info, dilation, act_info, enable_fast_math, num_groups);
-    switch(CLConvolutionLayer::get_convolution_method(input, weights, output, conv_info, weights_info, act_info, gpu_target, dilation, enable_fast_math))
+    switch (opencl::ClConv2d::get_convolution_method(input, weights, output, conv2d_info, weights_info, gpu_target))
     {
         case ConvolutionMethod::WINOGRAD:
-        {
-            //Validate Winograd
-            ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_groups != 1, "Grouping (num_groups != 1) with CLWinogradConvolutionLayer is not supported");
-            ARM_COMPUTE_RETURN_ON_ERROR(CLWinogradConvolutionLayer::validate(input, weights, biases, output, conv_info, act_info, enable_fast_math));
-            break;
-        }
         case ConvolutionMethod::DIRECT:
-        {
-            // Validate direct convolution layer
-            ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_groups != 1, "Grouping (num_groups != 1) with CLDirectConvolutionLayer is not supported");
-            ARM_COMPUTE_RETURN_ON_ERROR(CLDirectConvolutionLayer::validate(input, weights, biases, output, conv_info, act_info));
-            break;
-        }
+        case ConvolutionMethod::INDIRECT:
         case ConvolutionMethod::GEMM:
         {
-            // Validate gemm-based convolution layer
-            ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMConvolutionLayer::validate(input, weights, biases, output, conv_info, weights_info, dilation, act_info, num_groups));
+            ARM_COMPUTE_RETURN_ON_ERROR(
+                opencl::ClConv2d::validate(input, weights, biases, output, conv2d_info, weights_info));
             break;
         }
         case ConvolutionMethod::FFT:
         {
             // Validate FFT-based convolution layer
-            ARM_COMPUTE_RETURN_ON_ERROR(CLFFTConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info));
+            ARM_COMPUTE_RETURN_ON_ERROR(CLFFTConvolutionLayer::validate(input, weights, nullptr, output, conv_info,
+                                                                        act_info, enable_fast_math));
             break;
         }
         default:
@@ -140,88 +176,48 @@ Status CLConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo
     return Status{};
 }
-ConvolutionMethod CLConvolutionLayer::get_convolution_method(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, const PadStrideInfo &conv_info,
-                                                             const WeightsInfo &weights_info, const ActivationLayerInfo &act_info, const GPUTarget gpu_target, const Size2D &dilation, bool enable_fast_math)
+ConvolutionMethod CLConvolutionLayer::get_convolution_method(const ITensorInfo         *input,
+                                                             const ITensorInfo         *weights,
+                                                             const ITensorInfo         *output,
+                                                             const PadStrideInfo       &conv_info,
+                                                             const WeightsInfo         &weights_info,
+                                                             const ActivationLayerInfo &act_info,
+                                                             const GPUTarget            gpu_target,
+                                                             const Size2D              &dilation,
+                                                             bool                       enable_fast_math)
 {
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input);
-    ARM_COMPUTE_ERROR_ON_NULLPTR(output);
-    ARM_COMPUTE_ERROR_ON_NULLPTR(weights);
-    ARM_COMPUTE_UNUSED(weights_info);
-    ARM_COMPUTE_UNUSED(gpu_target);
-
-    const size_t idx_w = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
-    const size_t idx_h = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
-    const size_t idx_c = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL);
-
-    /* Input spatial dims, kernel size, IFM/OFM, conv info*/
-    using ConvolutionConfiguration = std::tuple<Size2D, Size2D, Size2D, PadStrideInfo, DataLayout>;
-    using ConfigurationMethod      = std::pair<ConvolutionConfiguration, ConvolutionMethod>;
-
-    const std::vector<ConfigurationMethod> known_configs =
-    {
-        // Alexnet
-        ConfigurationMethod(ConvolutionConfiguration(Size2D(27U, 27U), Size2D(5U, 5U), Size2D(48U, 128U), PadStrideInfo(1U, 1U, 2U, 2U), DataLayout::NCHW), ConvolutionMethod::DIRECT),
-        // VGG16 / VGG19
-        ConfigurationMethod(ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 64U), PadStrideInfo(1U, 1U, 1U, 1U), DataLayout::NCHW), ConvolutionMethod::DIRECT),
-        // Mobilenet 224
-        ConfigurationMethod(ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 32U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NCHW), ConvolutionMethod::GEMM),
-        // Mobilenet 160
-        ConfigurationMethod(ConvolutionConfiguration(Size2D(160U, 160U), Size2D(3U, 3U), Size2D(3U, 24U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NCHW), ConvolutionMethod::GEMM),
-        // Mobilenet 224
-        ConfigurationMethod(ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 32U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NHWC), ConvolutionMethod::GEMM),
-        // Mobilenet 160
-        ConfigurationMethod(ConvolutionConfiguration(Size2D(160U, 160U), Size2D(3U, 3U), Size2D(3U, 24U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NHWC), ConvolutionMethod::GEMM),
-    };
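The deleted table above, together with the find_config matcher that follows, formed the old model-specific selection heuristic; after this patch the decision is delegated per call to opencl::ClConv2d. A minimal sketch of querying the new path (the helper function is hypothetical, while Conv2dInfo, ClConv2d and CLScheduler are the APIs shown in this diff):

// Hypothetical helper, not library API: ask the operator-level heuristic
// which convolution method it would pick for the given tensor infos.
ConvolutionMethod query_method(const ITensorInfo *src, const ITensorInfo *wei, const ITensorInfo *dst,
                               const PadStrideInfo &conv_info, const WeightsInfo &weights_info,
                               const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math)
{
    // Conv2dInfo now bundles dilation, activation and fast-math, which is why
    // get_convolution_method() lost those as separate parameters.
    const Conv2dInfo conv2d_info(conv_info, dilation, act_info, enable_fast_math, 1 /* num_groups */);
    return opencl::ClConv2d::get_convolution_method(src, wei, dst, conv2d_info, weights_info,
                                                    CLScheduler::get().target());
}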
- - const auto find_config = [&](ConfigurationMethod c) - { - const ConvolutionConfiguration config = c.first; - const PadStrideInfo info = std::get<3>(config); - const DataLayout data_layout = std::get<4>(config); + const Conv2dInfo conv2d_info = Conv2dInfo(conv_info, dilation, act_info, enable_fast_math, 1); + return opencl::ClConv2d::get_convolution_method(input, weights, output, conv2d_info, weights_info, gpu_target); +} - return std::get<0>(config) == Size2D(input->dimension(idx_w), input->dimension(idx_h)) && std::get<1>(config) == Size2D(weights->dimension(idx_w), weights->dimension(idx_h)) - && std::get<2>(config) == Size2D(weights->dimension(idx_c), weights->dimension(3)) && info.pad_top() == conv_info.pad_top() && info.pad_right() == conv_info.pad_right() - && info.pad_bottom() == conv_info.pad_bottom() && info.pad_left() == conv_info.pad_left() && info.stride() == conv_info.stride() && (data_layout == input->data_layout()); - }; +void CLConvolutionLayer::run() +{ + prepare(); - std::vector<ConfigurationMethod>::const_iterator found; - if((found = std::find_if(known_configs.begin(), known_configs.end(), find_config)) != known_configs.end()) - { - return (*found).second; - } + MemoryGroupResourceScope scope_mg(_impl->memory_group); - if(dilation != Size2D(1U, 1U)) + if (_impl->func) { - return ConvolutionMethod::GEMM; + _impl->func->run(); } else { - // SRGAN - if((input->dimension(idx_h) > 720U) && (output->dimension(idx_h) > 720U) && (weights->dimension(idx_h) == 9) && (conv_info.pad_top() < 3) - && (CLDirectConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info))) - { - return ConvolutionMethod::DIRECT; - } - if((weights->dimension(idx_h) > 7) && (input->dimension(idx_c) > output->dimension(idx_c)) && (CLFFTConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info))) - { - return ConvolutionMethod::FFT; - } - if(input->dimension(idx_c) < 16) - { - return ConvolutionMethod::GEMM; - } - return bool(CLWinogradConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info, enable_fast_math)) ? ConvolutionMethod::WINOGRAD : ConvolutionMethod::GEMM; + _impl->op->run(_impl->run_pack); } } -void CLConvolutionLayer::run() -{ - prepare(); - _function->run(); -} - void CLConvolutionLayer::prepare() { - _function->prepare(); + if (_impl->func) + { + _impl->func->prepare(); + } + else + { + _impl->op->prepare(_impl->prep_pack); + + // Release temporary tensors that are only used in prepare stage + release_temporaries(_impl->aux_mem_req, _impl->workspace); + } } } // namespace arm_compute diff --git a/src/runtime/CL/functions/CLCopy.cpp b/src/runtime/CL/functions/CLCopy.cpp index 4c5d62a82c..a4f2b0634f 100644 --- a/src/runtime/CL/functions/CLCopy.cpp +++ b/src/runtime/CL/functions/CLCopy.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 ARM Limited. + * Copyright (c) 2018-2021 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -23,31 +23,60 @@ */ #include "arm_compute/runtime/CL/functions/CLCopy.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLCopyKernel.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/PixelValue.h" -#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" -#include "support/MemorySupport.h" + +#include "src/common/utils/Log.h" +#include "src/core/CL/ICLKernel.h" +#include "src/gpu/cl/operators/ClCopy.h" #include <utility> -using namespace arm_compute; +namespace arm_compute +{ +struct CLCopy::Impl +{ + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<opencl::ClCopy> op{nullptr}; +}; + +CLCopy::CLCopy() : _impl(std::make_unique<Impl>()) +{ +} +CLCopy::CLCopy(CLCopy &&) = default; +CLCopy &CLCopy::operator=(CLCopy &&) = default; +CLCopy::~CLCopy() = default; -void CLCopy::configure(ICLTensor *input, ICLTensor *output) +void CLCopy::configure(ICLTensor *input, ICLTensor *output, Window *dst_window) { - configure(CLKernelLibrary::get().get_compile_context(), input, output); + configure(CLKernelLibrary::get().get_compile_context(), input, output, dst_window); +} + +void CLCopy::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, Window *dst_window) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input); + ARM_COMPUTE_LOG_PARAMS(input, output, dst_window); + + _impl->src = input; + _impl->dst = output; + + _impl->op = std::make_unique<opencl::ClCopy>(); + _impl->op->configure(compile_context, _impl->src->info(), _impl->dst->info(), dst_window); } -void CLCopy::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output) +Status CLCopy::validate(const ITensorInfo *input, const ITensorInfo *output, Window *dst_window) { - auto k = arm_compute::support::cpp14::make_unique<CLCopyKernel>(); - k->configure(compile_context, input, output); - _kernel = std::move(k); + return opencl::ClCopy::validate(input, output, dst_window); } -Status CLCopy::validate(const arm_compute::ITensorInfo *input, const arm_compute::ITensorInfo *output) +void CLCopy::run() { - return CLCopyKernel::validate(input, output); + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC, _impl->src); + pack.add_tensor(TensorType::ACL_DST, _impl->dst); + _impl->op->run(pack); } +} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLCrop.cpp b/src/runtime/CL/functions/CLCrop.cpp new file mode 100644 index 0000000000..fc29c43827 --- /dev/null +++ b/src/runtime/CL/functions/CLCrop.cpp @@ -0,0 +1,103 @@ +/* + * Copyright (c) 2018-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/CL/functions/CLCrop.h" + +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" + +#include "src/common/utils/Log.h" +#include "src/core/CL/ICLKernel.h" +#include "src/gpu/cl/operators/ClCrop.h" + +#include <utility> + +namespace arm_compute +{ +struct CLCrop::Impl +{ + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<opencl::ClCrop> op{nullptr}; +}; + +CLCrop::CLCrop() : _impl(std::make_unique<Impl>()) +{ +} +CLCrop::CLCrop(CLCrop &&) = default; +CLCrop &CLCrop::operator=(CLCrop &&) = default; +CLCrop::~CLCrop() = default; + +void CLCrop::configure(const ICLTensor *src, + ICLTensor *dst, + Coordinates2D start, + Coordinates2D end, + uint32_t batch_index, + float extrapolation_value, + Window *dst_window) +{ + configure(CLKernelLibrary::get().get_compile_context(), src, dst, start, end, batch_index, extrapolation_value, + dst_window); +} + +void CLCrop::configure(const CLCompileContext &compile_context, + const ICLTensor *src, + ICLTensor *dst, + Coordinates2D start, + Coordinates2D end, + uint32_t batch_index, + float extrapolation_value, + Window *dst_window) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_LOG_PARAMS(src, dst, start, end, batch_index, extrapolation_value, dst_window); + + _impl->src = src; + _impl->dst = dst; + + _impl->op = std::make_unique<opencl::ClCrop>(); + _impl->op->configure(compile_context, _impl->src->info(), _impl->dst->info(), start, end, batch_index, + extrapolation_value, dst_window); +} + +Status CLCrop::validate(const ITensorInfo *input, + const ITensorInfo *output, + Coordinates2D start, + Coordinates2D end, + uint32_t batch_index, + float extrapolation_value, + Window *dst_window) +{ + return opencl::ClCrop::validate(input, output, start, end, batch_index, extrapolation_value, dst_window); +} + +void CLCrop::run() +{ + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC, _impl->src); + pack.add_tensor(TensorType::ACL_DST, _impl->dst); + _impl->op->run(pack); +} +} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLCropResize.cpp b/src/runtime/CL/functions/CLCropResize.cpp index 17fc80e146..821412b149 100644 --- a/src/runtime/CL/functions/CLCropResize.cpp +++ b/src/runtime/CL/functions/CLCropResize.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 ARM Limited. + * Copyright (c) 2019-2021 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -26,13 +26,25 @@ #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" + #include <cstddef> namespace arm_compute { namespace { -inline void configure_crop(const ICLTensor *input, ICLTensor *crop_boxes, ICLTensor *box_ind, ICLTensor *output, uint32_t crop_box_ind, Coordinates &start, Coordinates &end, uint32_t &batch_index) +inline void configure_crop(const ICLTensor *input, + ICLTensor *crop_boxes, + ICLTensor *box_ind, + ICLTensor *output, + uint32_t crop_box_ind, + Coordinates &start, + Coordinates &end, + uint32_t &batch_index) { batch_index = *(reinterpret_cast<int32_t *>(box_ind->ptr_to_element(Coordinates(crop_box_ind)))); @@ -45,28 +57,48 @@ inline void configure_crop(const ICLTensor *input, ICLTensor *crop_boxes, ICLTen // The normalized coordinates are scaled to retrieve the floating point image coordinates which are rounded to integers. start = Coordinates(std::floor(x0 * (input->info()->tensor_shape()[1] - 1) + 0.5f), std::floor(y0 * (input->info()->tensor_shape()[2] - 1) + 0.5f)); - end = Coordinates(std::floor(x1 * (input->info()->tensor_shape()[1] - 1) + 0.5f), - std::floor(y1 * (input->info()->tensor_shape()[2] - 1) + 0.5f)); - const TensorShape out_shape(input->info()->tensor_shape()[0], static_cast<uint32_t>(abs(end[0] - start[0])) + 1, static_cast<uint32_t>(abs(end[1] - start[1])) + 1); + end = Coordinates(std::floor(x1 * (input->info()->tensor_shape()[1] - 1) + 0.5f), + std::floor(y1 * (input->info()->tensor_shape()[2] - 1) + 0.5f)); + const TensorShape out_shape(input->info()->tensor_shape()[0], static_cast<uint32_t>(abs(end[0] - start[0])) + 1, + static_cast<uint32_t>(abs(end[1] - start[1])) + 1); output->info()->set_tensor_shape(out_shape); } } // namespace CLCropResize::CLCropResize() - : _input(nullptr), _boxes(nullptr), _box_ind(nullptr), _output(nullptr), _num_boxes(0), _method(), _extrapolation_value(0), _scale(), _copy(), _crop_results(), _scaled_results(), _internal_kernels() + : _input(nullptr), + _boxes(nullptr), + _box_ind(nullptr), + _output(nullptr), + _num_boxes(0), + _method(), + _extrapolation_value(0), + _scale(), + _copy(), + _crop_results(), + _scaled_results(), + _internal_functions() { } -Status CLCropResize::validate(const ITensorInfo *input, ITensorInfo *boxes, ITensorInfo *box_ind, const ITensorInfo *output, - Coordinates2D crop_size, InterpolationPolicy method, float extrapolation_value) +CLCropResize::~CLCropResize() = default; + +Status CLCropResize::validate(const ITensorInfo *input, + ITensorInfo *boxes, + ITensorInfo *box_ind, + const ITensorInfo *output, + Coordinates2D crop_size, + InterpolationPolicy method, + float extrapolation_value) { ARM_COMPUTE_RETURN_ERROR_ON(crop_size.x <= 0 || crop_size.y <= 0); ARM_COMPUTE_RETURN_ERROR_ON(method == InterpolationPolicy::AREA); ARM_COMPUTE_RETURN_ERROR_ON(boxes->tensor_shape()[0] != 4); ARM_COMPUTE_RETURN_ERROR_ON(boxes->tensor_shape()[1] != box_ind->tensor_shape()[0]); TensorInfo temp_info; - ARM_COMPUTE_RETURN_ON_ERROR(CLCropKernel::validate(input->clone().get(), &temp_info, { 0, 0 }, { 1, 1 }, input->dimension(3) - 1, extrapolation_value)); - if(output->total_size() > 0) + ARM_COMPUTE_RETURN_ON_ERROR(CLCrop::validate(input->clone().get(), &temp_info, {0, 0}, {1, 1}, + input->dimension(3) - 1, extrapolation_value)); + if 
(output->total_size() > 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(output, DataType::F32);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
@@ -76,19 +108,34 @@ Status CLCropResize::validate(const ITensorInfo *input, ITensorInfo *boxes, ITen
     return Status{};
 }
-void CLCropResize::configure(const ICLTensor *input, ICLTensor *boxes, ICLTensor *box_ind, ICLTensor *output, Coordinates2D crop_size,
-                             InterpolationPolicy method, float extrapolation_value)
+void CLCropResize::configure(const ICLTensor    *input,
+                             ICLTensor          *boxes,
+                             ICLTensor          *box_ind,
+                             ICLTensor          *output,
+                             Coordinates2D       crop_size,
+                             InterpolationPolicy method,
+                             float               extrapolation_value)
 {
-    configure(CLKernelLibrary::get().get_compile_context(), input, boxes, box_ind, output, crop_size, method, extrapolation_value);
+    configure(CLKernelLibrary::get().get_compile_context(), input, boxes, box_ind, output, crop_size, method,
+              extrapolation_value);
 }
-void CLCropResize::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *boxes, ICLTensor *box_ind, ICLTensor *output, Coordinates2D crop_size,
-                             InterpolationPolicy method, float extrapolation_value)
+void CLCropResize::configure(const CLCompileContext &compile_context,
+                             const ICLTensor        *input,
+                             ICLTensor              *boxes,
+                             ICLTensor              *box_ind,
+                             ICLTensor              *output,
+                             Coordinates2D           crop_size,
+                             InterpolationPolicy     method,
+                             float                   extrapolation_value)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, boxes, box_ind);
-    ARM_COMPUTE_ERROR_THROW_ON(CLCropResize::validate(input->info(), boxes->info(), box_ind->info(), output->info(), crop_size, method, extrapolation_value));
+    ARM_COMPUTE_ERROR_THROW_ON(CLCropResize::validate(input->info(), boxes->info(), box_ind->info(), output->info(),
+                                                      crop_size, method, extrapolation_value));
+    ARM_COMPUTE_LOG_PARAMS(input, boxes, box_ind, output, crop_size, method, extrapolation_value);
-    TensorShape output_shape = TensorShape(input->info()->tensor_shape()[0], crop_size.x, crop_size.y, boxes->info()->tensor_shape()[1]);
+    TensorShape output_shape =
+        TensorShape(input->info()->tensor_shape()[0], crop_size.x, crop_size.y, boxes->info()->tensor_shape()[1]);
     auto_init_if_empty(*output->info(), output_shape, 1, DataType::F32);
     _num_boxes = boxes->info()->tensor_shape()[1];
@@ -103,26 +150,26 @@ void CLCropResize::configure(const CLCompileContext &compile_context, const ICLT
     // For each crop box:
     // - The initial cropped image is produced as specified by boxes[i] from the 3D image input[box_ind[i]].
-    //   Possibly using a CLCropKernel and up to four CLMemsetKernels.
+    //   Possibly using a CLCrop and up to four CLFills.
     // - A tensor is required to hold this initial cropped image.
     // - A scale function is used to resize the cropped image to the size specified by crop_size.
     // - A tensor is required to hold the final scaled image before it is copied into the 4D output
-    //   that will hold all final cropped and scaled 3D images using CLCopyKernel.
+    //   that will hold all final cropped and scaled 3D images using CLCopy.
     // The contents of _boxes and _box_ind are required to calculate the shape
     // of the initial cropped image and thus are required to configure the
     // kernels used for cropping and scaling.
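In outline, each box is served by three of the function-level objects this patch switches to; a condensed sketch of the per-box wiring (crop_result, scaled_result, win and slice_in are illustrative placeholders, while the configure() signatures are those used in this file):

// Per-box pipeline (sketch): crop with out-of-bounds padding, resize to
// crop_size, then copy the result into slice num_box of the 4D output.
auto crop = std::make_unique<CLCrop>();
crop->configure(compile_context, input, crop_result, start_in, end_in, batch_index, extrapolation_value, &slice_in);

auto scale = std::make_unique<CLScale>();
scale->configure(compile_context, crop_result, scaled_result,
                 ScaleKernelInfo{method, BorderMode::CONSTANT, PixelValue(extrapolation_value),
                                 SamplingPolicy::TOP_LEFT});

auto copy = std::make_unique<CLCopy>();
copy->configure(compile_context, scaled_result, output, &win); // win restricts the copy to slice num_box

Since these are functions rather than kernels, the run() method further down can call run() on each stage directly instead of enqueuing kernels on the CLScheduler.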
_boxes->map(CLScheduler::get().queue()); _box_ind->map(CLScheduler::get().queue()); - for(unsigned int num_box = 0; num_box < _num_boxes; ++num_box) + for (unsigned int num_box = 0; num_box < _num_boxes; ++num_box) { - auto crop_tensor = support::cpp14::make_unique<CLTensor>(); + auto crop_tensor = std::make_unique<CLTensor>(); TensorInfo crop_result_info(1, DataType::F32); crop_result_info.set_data_layout(DataLayout::NHWC); crop_tensor->allocator()->init(crop_result_info); _crop_results.emplace_back(std::move(crop_tensor)); - auto scale_tensor = support::cpp14::make_unique<CLTensor>(); + auto scale_tensor = std::make_unique<CLTensor>(); TensorInfo scaled_result_info(out_shape, 1, DataType::F32); scaled_result_info.set_data_layout(DataLayout::NHWC); scale_tensor->allocator()->init(scaled_result_info); @@ -134,15 +181,17 @@ void CLCropResize::configure(const CLCompileContext &compile_context, const ICLT Coordinates end{}; configure_crop(_input, _boxes, _box_ind, _crop_results[num_box].get(), num_box, start, end, batch_index); - auto scale_kernel = support::cpp14::make_unique<CLScale>(); - scale_kernel->configure(compile_context, _crop_results[num_box].get(), _scaled_results[num_box].get(), _method, BorderMode::CONSTANT, PixelValue(_extrapolation_value), SamplingPolicy::TOP_LEFT); + auto scale_kernel = std::make_unique<CLScale>(); + scale_kernel->configure( + compile_context, _crop_results[num_box].get(), _scaled_results[num_box].get(), + ScaleKernelInfo{_method, BorderMode::CONSTANT, PixelValue(_extrapolation_value), SamplingPolicy::TOP_LEFT}); _scale.emplace_back(std::move(scale_kernel)); Window win = calculate_max_window(*_output->info()); win.set(3, Window::Dimension(num_box, num_box + 1, 1)); - auto copy_kernel = support::cpp14::make_unique<CLCopyKernel>(); - copy_kernel->configure(compile_context, _scaled_results[num_box].get(), _output, PaddingList(), &win); + auto copy_kernel = std::make_unique<CLCopy>(); + copy_kernel->configure(compile_context, _scaled_results[num_box].get(), _output, &win); _copy.emplace_back(std::move(copy_kernel)); _crop_results[num_box]->allocator()->allocate(); @@ -151,28 +200,50 @@ void CLCropResize::configure(const CLCompileContext &compile_context, const ICLT bool is_width_flipped = end[0] < start[0]; bool is_height_flipped = end[1] < start[1]; /** The number of rows out of bounds at the start and end of _crop_results[num_box].get(). */ - std::array<int32_t, 2> rows_out_of_bounds{ 0 }; + std::array<int32_t, 2> rows_out_of_bounds{0}; /** The number of columns out of bounds at the start and end of _crop_results[num_box].get(). */ - std::array<int32_t, 2> cols_out_of_bounds{ 0 }; - if(is_height_flipped) + std::array<int32_t, 2> cols_out_of_bounds{0}; + if (is_height_flipped) { - rows_out_of_bounds[0] = start[1] >= static_cast<int32_t>(_input->info()->dimension(2)) ? std::min(start[1] - _input->info()->dimension(2) + 1, _crop_results[num_box].get()->info()->dimension(2)) : 0; - rows_out_of_bounds[1] = end[1] < 0 ? std::min(-end[1], static_cast<int32_t>(_crop_results[num_box].get()->info()->dimension(2))) : 0; + rows_out_of_bounds[0] = start[1] >= static_cast<int32_t>(_input->info()->dimension(2)) + ? std::min(start[1] - _input->info()->dimension(2) + 1, + _crop_results[num_box].get()->info()->dimension(2)) + : 0; + rows_out_of_bounds[1] = + end[1] < 0 ? std::min(-end[1], static_cast<int32_t>(_crop_results[num_box].get()->info()->dimension(2))) + : 0; } else { - rows_out_of_bounds[0] = start[1] < 0 ? 
std::min(-start[1], static_cast<int32_t>(_crop_results[num_box].get()->info()->dimension(2))) : 0; - rows_out_of_bounds[1] = end[1] >= static_cast<int32_t>(_input->info()->dimension(2)) ? std::min(end[1] - _input->info()->dimension(2) + 1, _crop_results[num_box].get()->info()->dimension(2)) : 0; + rows_out_of_bounds[0] = + start[1] < 0 + ? std::min(-start[1], static_cast<int32_t>(_crop_results[num_box].get()->info()->dimension(2))) + : 0; + rows_out_of_bounds[1] = end[1] >= static_cast<int32_t>(_input->info()->dimension(2)) + ? std::min(end[1] - _input->info()->dimension(2) + 1, + _crop_results[num_box].get()->info()->dimension(2)) + : 0; } - if(is_width_flipped) + if (is_width_flipped) { - cols_out_of_bounds[0] = start[0] >= static_cast<int32_t>(_input->info()->dimension(1)) ? std::min(start[0] - _input->info()->dimension(1) + 1, _crop_results[num_box].get()->info()->dimension(1)) : 0; - cols_out_of_bounds[1] = end[0] < 0 ? std::min(-end[0], static_cast<int32_t>(_crop_results[num_box].get()->info()->dimension(1))) : 0; + cols_out_of_bounds[0] = start[0] >= static_cast<int32_t>(_input->info()->dimension(1)) + ? std::min(start[0] - _input->info()->dimension(1) + 1, + _crop_results[num_box].get()->info()->dimension(1)) + : 0; + cols_out_of_bounds[1] = + end[0] < 0 ? std::min(-end[0], static_cast<int32_t>(_crop_results[num_box].get()->info()->dimension(1))) + : 0; } else { - cols_out_of_bounds[0] = start[0] < 0 ? std::min(-start[0], static_cast<int32_t>(_crop_results[num_box].get()->info()->dimension(1))) : 0; - cols_out_of_bounds[1] = end[0] >= static_cast<int32_t>(_input->info()->dimension(1)) ? std::min(end[0] - _input->info()->dimension(1) + 1, _crop_results[num_box].get()->info()->dimension(1)) : 0; + cols_out_of_bounds[0] = + start[0] < 0 + ? std::min(-start[0], static_cast<int32_t>(_crop_results[num_box].get()->info()->dimension(1))) + : 0; + cols_out_of_bounds[1] = end[0] >= static_cast<int32_t>(_input->info()->dimension(1)) + ? std::min(end[0] - _input->info()->dimension(1) + 1, + _crop_results[num_box].get()->info()->dimension(1)) + : 0; } Window full_window = calculate_max_window(*_crop_results[num_box].get()->info()); @@ -195,64 +266,86 @@ void CLCropResize::configure(const CLCompileContext &compile_context, const ICLT // Fill all _crop_results[num_box].get() rows that have no elements that are within the input bounds // with the extrapolation value using memset. // First for the rows before the in bounds rows. 
- if(rows_out_of_bounds[0] > 0) + if (rows_out_of_bounds[0] > 0) { Window slice_fill_rows_before(full_window); slice_fill_rows_before.set(2, Window::Dimension(0, rows_out_of_bounds[0], 1)); - auto kernel = arm_compute::support::cpp14::make_unique<CLMemsetKernel>(); - kernel->configure(compile_context, _crop_results[num_box].get(), extrapolation_value, &slice_fill_rows_before); - _internal_kernels.push_back(std::move(kernel)); + auto kernel = std::make_unique<CLFill>(); + kernel->configure(compile_context, _crop_results[num_box].get(), extrapolation_value, + &slice_fill_rows_before); + //_internal_functions.emplace_back(std::move(kernel)); + _internal_functions.push_back(std::move(kernel)); } Window slice_in(full_window); - slice_in.set(2, Window::Dimension(rows_out_of_bounds[0], _crop_results[num_box].get()->info()->dimension(2) - rows_out_of_bounds[1], 1)); - slice_in.set(1, Window::Dimension(cols_out_of_bounds[0], _crop_results[num_box].get()->info()->dimension(1) - cols_out_of_bounds[1], 1)); + slice_in.set(2, + Window::Dimension(rows_out_of_bounds[0], + _crop_results[num_box].get()->info()->dimension(2) - rows_out_of_bounds[1], 1)); + slice_in.set(1, + Window::Dimension(cols_out_of_bounds[0], + _crop_results[num_box].get()->info()->dimension(1) - cols_out_of_bounds[1], 1)); - int rows_in_bounds = static_cast<int32_t>(_crop_results[num_box].get()->info()->dimension(2)) - rows_out_of_bounds[0] - rows_out_of_bounds[1]; - if(rows_in_bounds > 0) + int rows_in_bounds = static_cast<int32_t>(_crop_results[num_box].get()->info()->dimension(2)) - + rows_out_of_bounds[0] - rows_out_of_bounds[1]; + if (rows_in_bounds > 0) { // Fill all elements that share a row with an in bounds element with the extrapolation value. - if(cols_out_of_bounds[0] > 0) + if (cols_out_of_bounds[0] > 0) { Window slice_fill_cols_before(slice_in); slice_fill_cols_before.set(1, Window::Dimension(0, cols_out_of_bounds[0], 1)); - auto kernel = arm_compute::support::cpp14::make_unique<CLMemsetKernel>(); - kernel->configure(compile_context, _crop_results[num_box].get(), extrapolation_value, &slice_fill_cols_before); - _internal_kernels.push_back(std::move(kernel)); + auto kernel = std::make_unique<CLFill>(); + kernel->configure(compile_context, _crop_results[num_box].get(), extrapolation_value, + &slice_fill_cols_before); + //_internal_functions.emplace_back(std::move(kernel)); + _internal_functions.push_back(std::move(kernel)); } - if(cols_out_of_bounds[1] > 0) + if (cols_out_of_bounds[1] > 0) { Window slice_fill_cols_after(slice_in); - slice_fill_cols_after.set(1, Window::Dimension(_crop_results[num_box].get()->info()->dimension(1) - cols_out_of_bounds[1], _crop_results[num_box].get()->info()->dimension(1), 1)); - auto kernel = arm_compute::support::cpp14::make_unique<CLMemsetKernel>(); - kernel->configure(compile_context, _crop_results[num_box].get(), extrapolation_value, &slice_fill_cols_after); - _internal_kernels.push_back(std::move(kernel)); + slice_fill_cols_after.set( + 1, Window::Dimension(_crop_results[num_box].get()->info()->dimension(1) - cols_out_of_bounds[1], + _crop_results[num_box].get()->info()->dimension(1), 1)); + auto kernel = std::make_unique<CLFill>(); + kernel->configure(compile_context, _crop_results[num_box].get(), extrapolation_value, + &slice_fill_cols_after); + //_internal_functions.emplace_back(std::move(kernel)); + _internal_functions.push_back(std::move(kernel)); } // Copy all elements within the input bounds from the input tensor. 
- int cols_in_bounds = static_cast<int32_t>(_crop_results[num_box].get()->info()->dimension(1)) - cols_out_of_bounds[0] - cols_out_of_bounds[1]; - if(cols_in_bounds > 0) + int cols_in_bounds = static_cast<int32_t>(_crop_results[num_box].get()->info()->dimension(1)) - + cols_out_of_bounds[0] - cols_out_of_bounds[1]; + if (cols_in_bounds > 0) { - Coordinates2D start_in{ is_width_flipped ? start[0] - cols_out_of_bounds[0] : start[0] + cols_out_of_bounds[0], - is_height_flipped ? start[1] - rows_out_of_bounds[0] : start[1] + rows_out_of_bounds[0] }; - Coordinates2D end_in{ is_width_flipped ? start_in.x - cols_in_bounds + 1 : start_in.x + cols_in_bounds - 1, - is_height_flipped ? start_in.y - rows_in_bounds + 1 : start_in.y + rows_in_bounds - 1 }; - auto kernel = arm_compute::support::cpp14::make_unique<CLCropKernel>(); - - kernel->configure(compile_context, _input, _crop_results[num_box].get(), start_in, end_in, batch_index, extrapolation_value, &slice_in); - _internal_kernels.push_back(std::move(kernel)); + Coordinates2D start_in{ + is_width_flipped ? start[0] - cols_out_of_bounds[0] : start[0] + cols_out_of_bounds[0], + is_height_flipped ? start[1] - rows_out_of_bounds[0] : start[1] + rows_out_of_bounds[0]}; + Coordinates2D end_in{ + is_width_flipped ? start_in.x - cols_in_bounds + 1 : start_in.x + cols_in_bounds - 1, + is_height_flipped ? start_in.y - rows_in_bounds + 1 : start_in.y + rows_in_bounds - 1}; + auto kernel = std::make_unique<CLCrop>(); + + kernel->configure(compile_context, _input, _crop_results[num_box].get(), start_in, end_in, batch_index, + extrapolation_value, &slice_in); + //_internal_functions.emplace_back(std::move(kernel)); + _internal_functions.push_back(std::move(kernel)); } } // Fill all rows after the in bounds elements with the extrapolation value. - if(rows_out_of_bounds[1] > 0) + if (rows_out_of_bounds[1] > 0) { Window slice_fill_rows_after(full_window); - slice_fill_rows_after.set(2, Window::Dimension(_crop_results[num_box].get()->info()->dimension(2) - rows_out_of_bounds[1], _crop_results[num_box].get()->info()->dimension(2), 1)); - auto kernel = arm_compute::support::cpp14::make_unique<CLMemsetKernel>(); - kernel->configure(compile_context, _crop_results[num_box].get(), extrapolation_value, &slice_fill_rows_after); - _internal_kernels.push_back(std::move(kernel)); + slice_fill_rows_after.set( + 2, Window::Dimension(_crop_results[num_box].get()->info()->dimension(2) - rows_out_of_bounds[1], + _crop_results[num_box].get()->info()->dimension(2), 1)); + auto kernel = std::make_unique<CLFill>(); + kernel->configure(compile_context, _crop_results[num_box].get(), extrapolation_value, + &slice_fill_rows_after); + //_internal_functions.emplace_back(std::move(kernel)); + _internal_functions.push_back(std::move(kernel)); } } _boxes->unmap(CLScheduler::get().queue()); @@ -264,21 +357,21 @@ void CLCropResize::run() { ARM_COMPUTE_ERROR_ON_MSG(_output == nullptr, "Unconfigured function"); - for(unsigned int i = 0; i < _internal_kernels.size(); ++i) + for (unsigned int i = 0; i < _internal_functions.size(); ++i) { - CLScheduler::get().enqueue(*(_internal_kernels[i])); + _internal_functions[i]->run(); } CLScheduler::get().sync(); - for(auto &kernel : _scale) + for (auto &kernel : _scale) { kernel->run(); } CLScheduler::get().sync(); - for(auto &kernel : _copy) + for (auto &kernel : _copy) { - CLScheduler::get().enqueue(*kernel, true); + kernel->run(); } CLScheduler::get().sync(); } -} // namespace arm_compute
\ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLDeconvolutionLayer.cpp b/src/runtime/CL/functions/CLDeconvolutionLayer.cpp index 62e7d9a582..4e0d1501ba 100644 --- a/src/runtime/CL/functions/CLDeconvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLDeconvolutionLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 ARM Limited. + * Copyright (c) 2017-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,12 +23,18 @@ */ #include "arm_compute/runtime/CL/functions/CLDeconvolutionLayer.h" +#include "arm_compute/core/Types.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/common/utils/Log.h" +#include "src/core/CL/ICLKernel.h" +#include "src/gpu/cl/IClOperator.h" +#include "src/gpu/cl/operators/ClTransposedConvolution.h" + #include <cmath> #include <memory> #include <tuple> @@ -36,34 +42,70 @@ using namespace arm_compute; using namespace arm_compute::misc::shape_calculator; +struct CLDeconvolutionLayer::Impl +{ + const ICLTensor *src{nullptr}; + const ICLTensor *weights{nullptr}; + const ICLTensor *biases{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<opencl::IClOperator> op{nullptr}; +}; + +CLDeconvolutionLayer::~CLDeconvolutionLayer() = default; + CLDeconvolutionLayer::CLDeconvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_manager(std::move(memory_manager)), _function() + : _memory_manager(std::move(memory_manager)), _function(), _impl(std::make_unique<Impl>()) { } -void CLDeconvolutionLayer::configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &deconv_info, - const WeightsInfo &weights_info) +void CLDeconvolutionLayer::configure(ICLTensor *input, + ICLTensor *weights, + const ICLTensor *bias, + ICLTensor *output, + const PadStrideInfo &deconv_info, + const WeightsInfo &weights_info) { configure(CLKernelLibrary::get().get_compile_context(), input, weights, bias, output, deconv_info, weights_info); } -void CLDeconvolutionLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &deconv_info, - const WeightsInfo &weights_info) +void CLDeconvolutionLayer::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *weights, + const ICLTensor *bias, + ICLTensor *output, + const PadStrideInfo &deconv_info, + const WeightsInfo &weights_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); + ARM_COMPUTE_LOG_PARAMS(input, weights, bias, output, deconv_info, weights_info); - switch(CLDeconvolutionLayer::get_deconvolution_method(input->info(), weights->info(), nullptr, output->info(), deconv_info, weights_info)) + switch (CLDeconvolutionLayer::get_deconvolution_method(input->info(), weights->info(), nullptr, output->info(), + deconv_info, weights_info)) { case DeconvolutionMethod::DIRECT: { - auto f = arm_compute::support::cpp14::make_unique<CLDirectDeconvolutionLayer>(); + auto op = std::make_unique<opencl::ClTransposedConvolution>(); + op->configure(compile_context, input->info(), weights->info(), bias != nullptr ? 
bias->info() : nullptr, + output->info(), deconv_info); + + _impl->src = input; + _impl->weights = weights; + _impl->biases = bias; + _impl->dst = output; + + _impl->op = std::move(op); + break; + } + case DeconvolutionMethod::UPSCALE_CONV2D: + { + auto f = std::make_unique<CLDirectDeconvolutionLayer>(); f->configure(compile_context, input, weights, bias, output, deconv_info, weights_info); _function = std::move(f); break; } case DeconvolutionMethod::GEMM: { - auto f = arm_compute::support::cpp14::make_unique<CLGEMMDeconvolutionLayer>(_memory_manager); + auto f = std::make_unique<CLGEMMDeconvolutionLayer>(_memory_manager); f->configure(compile_context, input, weights, bias, output, deconv_info); _function = std::move(f); break; @@ -74,16 +116,28 @@ void CLDeconvolutionLayer::configure(const CLCompileContext &compile_context, IC } } -Status CLDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *output, const PadStrideInfo &deconv_info, - const WeightsInfo &weights_info) +Status CLDeconvolutionLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *bias, + ITensorInfo *output, + const PadStrideInfo &deconv_info, + const WeightsInfo &weights_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); - switch(CLDeconvolutionLayer::get_deconvolution_method(input, weights, bias, output, deconv_info, weights_info)) + switch (CLDeconvolutionLayer::get_deconvolution_method(input, weights, bias, output, deconv_info, weights_info)) { case DeconvolutionMethod::DIRECT: { + // Validate transposed convolution operator + ARM_COMPUTE_RETURN_ON_ERROR( + opencl::ClTransposedConvolution::validate(input, weights, bias, output, deconv_info)); + break; + } + case DeconvolutionMethod::UPSCALE_CONV2D: + { // Validate direct convolution layer - ARM_COMPUTE_RETURN_ON_ERROR(CLDirectDeconvolutionLayer::validate(input, weights, bias, output, deconv_info, weights_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLDirectDeconvolutionLayer::validate(input, weights, bias, output, deconv_info, weights_info)); break; } case DeconvolutionMethod::GEMM: @@ -100,19 +154,40 @@ Status CLDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInf return Status{}; } -DeconvolutionMethod CLDeconvolutionLayer::get_deconvolution_method(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *output, const PadStrideInfo &deconv_info, - const WeightsInfo &weights_info) +DeconvolutionMethod CLDeconvolutionLayer::get_deconvolution_method(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *bias, + ITensorInfo *output, + const PadStrideInfo &deconv_info, + const WeightsInfo &weights_info) { ARM_COMPUTE_UNUSED(output, bias, weights_info); + if (is_data_type_quantized_per_channel(weights->data_type())) + { + return DeconvolutionMethod::UPSCALE_CONV2D; + } + const DataLayout data_layout = input->data_layout(); const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + const size_t idx_n = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES); + const size_t ofm = weights->tensor_shape()[idx_n]; - if(weights->dimension(idx_w) != deconv_info.stride().first || weights->dimension(idx_h) != deconv_info.stride().second) + if (weights->dimension(idx_w) != deconv_info.stride().first || + weights->dimension(idx_h) != 
deconv_info.stride().second) { - return DeconvolutionMethod::DIRECT; + // We observe better performance for FP32 types only when ofm <= 16, and for FP16 only when ofm <= 32. + if (input->data_layout() == DataLayout::NHWC && !((input->data_type() == DataType::F32) && (ofm > 16)) && + !((input->data_type() == DataType::F16) && (ofm > 32))) + { + return DeconvolutionMethod::DIRECT; + } + else + { + return DeconvolutionMethod::UPSCALE_CONV2D; + } } return DeconvolutionMethod::GEMM; @@ -121,10 +196,29 @@ DeconvolutionMethod CLDeconvolutionLayer::get_deconvolution_method(const ITensor void CLDeconvolutionLayer::run() { prepare(); - _function->run(); + + if (_impl->op != nullptr) + { + // Optimized Operator will be used + ITensorPack pack; + + pack.add_tensor(TensorType::ACL_SRC_0, _impl->src); + pack.add_tensor(TensorType::ACL_SRC_1, _impl->weights); + pack.add_tensor(TensorType::ACL_SRC_2, _impl->biases); + pack.add_tensor(TensorType::ACL_DST, _impl->dst); + + _impl->op->run(pack); + } + else + { + _function->run(); + } } void CLDeconvolutionLayer::prepare() { - _function->prepare(); + if (_impl->op == nullptr) + { + _function->prepare(); + } } diff --git a/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp b/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp index be2d120dcd..b92bf903a6 100644 --- a/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp +++ b/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 ARM Limited. + * Copyright (c) 2017-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -28,16 +28,20 @@ #include "arm_compute/runtime/CL/CLScheduler.h" #include "arm_compute/runtime/CL/CLTensor.h" +#include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h" + namespace arm_compute { CLDeconvolutionLayerUpsample::CLDeconvolutionLayerUpsample() // NOLINT - : _upsample(), - _memset(), - _output(nullptr) + : _upsample(std::make_unique<CLDeconvolutionLayerUpsampleKernel>()), _fill(), _output(nullptr) { } -Status CLDeconvolutionLayerUpsample::validate(const ITensorInfo *input, const ITensorInfo *output, const PadStrideInfo &info) +CLDeconvolutionLayerUpsample::~CLDeconvolutionLayerUpsample() = default; + +Status +CLDeconvolutionLayerUpsample::validate(const ITensorInfo *input, const ITensorInfo *output, const PadStrideInfo &info) { return CLDeconvolutionLayerUpsampleKernel::validate(input, output, info); } @@ -47,18 +51,23 @@ void CLDeconvolutionLayerUpsample::configure(ICLTensor *input, ICLTensor *output configure(CLKernelLibrary::get().get_compile_context(), input, output, info); } -void CLDeconvolutionLayerUpsample::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const PadStrideInfo &info) +void CLDeconvolutionLayerUpsample::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output, + const PadStrideInfo &info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_LOG_PARAMS(input, output, info); _output = output; - _memset.configure(compile_context, _output, PixelValue(0, _output->info()->data_type(), _output->info()->quantization_info())); - _upsample.configure(compile_context, input, _output, info); + _fill.configure(compile_context, _output, + PixelValue(0, _output->info()->data_type(), _output->info()->quantization_info())); + _upsample->configure(compile_context, input, _output, info); } void CLDeconvolutionLayerUpsample::run() { - CLScheduler::get().enqueue(_memset, false); - 
CLScheduler::get().enqueue(_upsample, true); + _fill.run(); + CLScheduler::get().enqueue(*_upsample, true); } } // namespace arm_compute diff --git a/src/runtime/CL/functions/CLDepthConvertLayer.cpp b/src/runtime/CL/functions/CLDepthConvertLayer.cpp index b848f989e6..6d2fea974e 100644 --- a/src/runtime/CL/functions/CLDepthConvertLayer.cpp +++ b/src/runtime/CL/functions/CLDepthConvertLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020 ARM Limited. + * Copyright (c) 2016-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,27 +23,66 @@ */ #include "arm_compute/runtime/CL/functions/CLDepthConvertLayer.h" -#include "arm_compute/core/CL/kernels/CLDepthConvertLayerKernel.h" -#include "support/MemorySupport.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/Validate.h" + +#include "src/common/utils/Log.h" +#include "src/core/CL/ICLKernel.h" +#include "src/gpu/cl/operators/ClCast.h" #include <utility> namespace arm_compute { +struct CLDepthConvertLayer::Impl +{ + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<opencl::ClCast> op{nullptr}; +}; + +CLDepthConvertLayer::CLDepthConvertLayer() : _impl(std::make_unique<Impl>()) +{ +} +CLDepthConvertLayer::CLDepthConvertLayer(CLDepthConvertLayer &&) = default; +CLDepthConvertLayer &CLDepthConvertLayer::operator=(CLDepthConvertLayer &&) = default; +CLDepthConvertLayer::~CLDepthConvertLayer() = default; + void CLDepthConvertLayer::configure(const ICLTensor *input, ICLTensor *output, ConvertPolicy policy, uint32_t shift) { configure(CLKernelLibrary::get().get_compile_context(), input, output, policy, shift); } -void CLDepthConvertLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, ConvertPolicy policy, uint32_t shift) +void CLDepthConvertLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + ConvertPolicy policy, + uint32_t shift) +{ + ARM_COMPUTE_UNUSED(shift); + ARM_COMPUTE_LOG_PARAMS(input, output, policy, shift); + + _impl->src = input; + _impl->dst = output; + + ARM_COMPUTE_ERROR_ON_NULLPTR(_impl->src, _impl->dst); + ARM_COMPUTE_ERROR_ON(shift != 0); + + _impl->op = std::make_unique<opencl::ClCast>(); + _impl->op->configure(compile_context, _impl->src->info(), _impl->dst->info(), policy); +} + +Status +CLDepthConvertLayer::validate(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy, uint32_t shift) { - auto k = arm_compute::support::cpp14::make_unique<CLDepthConvertLayerKernel>(); - k->configure(compile_context, input, output, policy, shift); - _kernel = std::move(k); + ARM_COMPUTE_RETURN_ERROR_ON(shift != 0); + return opencl::ClCast::validate(input, output, policy); } -Status CLDepthConvertLayer::validate(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy, uint32_t shift) +void CLDepthConvertLayer::run() { - return CLDepthConvertLayerKernel::validate(input, output, policy, shift); + ITensorPack pack = {{ACL_SRC, _impl->src}, {ACL_DST, _impl->dst}}; + _impl->op->run(pack); } } // namespace arm_compute diff --git a/src/runtime/CL/functions/CLDepthToSpaceLayer.cpp b/src/runtime/CL/functions/CLDepthToSpaceLayer.cpp index 89e5faa4d5..9477c7f81d 100644 --- a/src/runtime/CL/functions/CLDepthToSpaceLayer.cpp +++ b/src/runtime/CL/functions/CLDepthToSpaceLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 ARM Limited. + * Copyright (c) 2019-2021 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -23,8 +23,8 @@ */ #include "arm_compute/runtime/CL/functions/CLDepthToSpaceLayer.h" -#include "arm_compute/core/CL/kernels/CLDepthToSpaceLayerKernel.h" -#include "support/MemorySupport.h" +#include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLDepthToSpaceLayerKernel.h" #include <utility> @@ -35,9 +35,13 @@ void CLDepthToSpaceLayer::configure(const ICLTensor *input, ICLTensor *output, i configure(CLKernelLibrary::get().get_compile_context(), input, output, block_shape); } -void CLDepthToSpaceLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t block_shape) +void CLDepthToSpaceLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + int32_t block_shape) { - auto k = arm_compute::support::cpp14::make_unique<CLDepthToSpaceLayerKernel>(); + ARM_COMPUTE_LOG_PARAMS(input, output, block_shape); + auto k = std::make_unique<CLDepthToSpaceLayerKernel>(); k->configure(compile_context, input, output, block_shape); _kernel = std::move(k); } diff --git a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp index b1e9fe77d4..873601bb11 100644 --- a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 ARM Limited. + * Copyright (c) 2017-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,102 +24,26 @@ #include "arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.h" -#include "arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.h" #include "arm_compute/core/Helpers.h" -#include "arm_compute/core/PixelValue.h" +#include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "support/MemorySupport.h" + +#include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h" +#include "src/runtime/heuristics/dwc_native/ClDWCNativeKernelConfig.h" +#include "src/runtime/heuristics/dwc_native/IClDWCNativeKernelConfig.h" namespace arm_compute { using namespace arm_compute::misc; using namespace arm_compute::misc::shape_calculator; +using namespace arm_compute::cl_dwc; -namespace -{ -Status validate_arguments_3x3(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - unsigned int depth_multiplier, ActivationLayerInfo act_info, GPUTarget gpu_target, const Size2D &dilation) -{ - // This function should be removed and incorporated inside CLDepthwiseConvolutionLayerInternal3x3 once CLDepthwiseConvolutionLayer3x3 is properly removed - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN); - - const bool is_quantized = is_data_type_quantized_asymmetric(input->data_type()); - const bool is_nhwc = input->data_layout() == DataLayout::NHWC; - const bool needs_permute = is_nhwc && (depth_multiplier > 1); - const bool needs_weights_reshape = is_nhwc && (depth_multiplier == 1) && 
is_quantized; - const bool is_stride_1 = ((conv_info.stride().first == conv_info.stride().second) && (conv_info.stride().first == 1)); - const bool is_stride_1_dilation_1 = (is_stride_1 && dilation.x() == 1 && dilation.y() == 1); - const bool is_dot8_supported = dot8_supported(CLKernelLibrary::get().get_device()); - DepthwiseConvolutionReshapeInfo info; - info.c0 = 4; - info.transpose = is_stride_1_dilation_1 && is_dot8_supported; - - TensorInfo output_multipliers_shifts_info(TensorInfo(TensorShape(1U), 1, DataType::S32)); - if(is_quantized) - { - if(is_data_type_quantized_per_channel(weights->data_type())) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QSYMM8_PER_CHANNEL); - - const size_t idx_c = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::CHANNEL); - output_multipliers_shifts_info.set_tensor_shape(TensorShape(weights->dimension(idx_c))); - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights); - } - } - - if(needs_permute) - { - TensorShape permuted_input_shape = input->tensor_shape(); - TensorShape permuted_weights_shape = weights->tensor_shape(); - TensorShape permuted_output_shape = shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier, dilation); - - permute(permuted_input_shape, PermutationVector(1U, 2U, 0U)); - permute(permuted_weights_shape, PermutationVector(1U, 2U, 0U)); - permute(permuted_output_shape, PermutationVector(1U, 2U, 0U)); - - const TensorInfo permuted_input = input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_input_shape).set_data_layout(DataLayout::NCHW); - const TensorInfo permuted_weights = weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_weights_shape).set_data_layout(DataLayout::NCHW); - const TensorInfo permuted_output = output->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_output_shape).set_data_layout(DataLayout::NCHW); - - ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3NCHWKernel::validate(&permuted_input, &permuted_weights, biases, &permuted_output, - conv_info, depth_multiplier, act_info, gpu_target, - dilation, &output_multipliers_shifts_info, &output_multipliers_shifts_info)); - } - else if(is_nhwc) - { - if(needs_weights_reshape) - { - auto reshaped_weights_shape = arm_compute::misc::shape_calculator::compute_reshaped_depthwise_weights_shape(*weights, info); - ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3NHWCKernel::validate(input, &weights->clone()->set_tensor_shape(reshaped_weights_shape), biases, - output, conv_info, depth_multiplier, act_info, - dilation, &output_multipliers_shifts_info, &output_multipliers_shifts_info)); - } - else - { - ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3NHWCKernel::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, - dilation, &output_multipliers_shifts_info, &output_multipliers_shifts_info)); - } - } - else - { - ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3NCHWKernel::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, gpu_target, - dilation, &output_multipliers_shifts_info, &output_multipliers_shifts_info)); - } - return Status{}; -} -} // namespace - -CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::CLDepthwiseConvolutionLayerGeneric(std::shared_ptr<IMemoryManager> memory_manager) 
+CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(std::move(memory_manager)),
-      _dwc_native_kernel(),
+      _dwc_native_kernel(std::make_unique<CLDepthwiseConvolutionLayerNativeKernel>()),
       _permute_input_to_nhwc(),
       _permute_weights_to_nhwc(),
       _permute_output_to_nchw(),
@@ -137,25 +61,36 @@ CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::CLDepthwiseConv
 {
 }
 
-void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
-                                                                                unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation)
+CLDepthwiseConvolutionLayer::~CLDepthwiseConvolutionLayer() = default;
+
+void CLDepthwiseConvolutionLayer::configure(ICLTensor *input,
+                                            const ICLTensor *weights,
+                                            const ICLTensor *biases,
+                                            ICLTensor *output,
+                                            const PadStrideInfo &conv_info,
+                                            unsigned int depth_multiplier,
+                                            ActivationLayerInfo act_info,
+                                            const Size2D &dilation)
 {
-    configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
+    configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, depth_multiplier,
+              act_info, dilation);
 }
 
-void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases,
-                                                                                ICLTensor *output, const PadStrideInfo &conv_info,
-                                                                                unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation)
+void CLDepthwiseConvolutionLayer::configure(const CLCompileContext &compile_context,
+                                            ICLTensor *input,
+                                            const ICLTensor *weights,
+                                            const ICLTensor *biases,
+                                            ICLTensor *output,
+                                            const PadStrideInfo &conv_info,
+                                            unsigned int depth_multiplier,
+                                            ActivationLayerInfo act_info,
+                                            const Size2D &dilation)
 {
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
-    ARM_COMPUTE_ERROR_THROW_ON(CLDepthwiseConvolutionLayer::validate(input->info(),
-                                                                     weights->info(),
-                                                                     biases != nullptr ? biases->info() : nullptr,
-                                                                     output->info(),
-                                                                     conv_info,
-                                                                     depth_multiplier,
-                                                                     act_info,
-                                                                     dilation));
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights);
+    ARM_COMPUTE_ERROR_THROW_ON(CLDepthwiseConvolutionLayer::validate(
+        input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr,
+        output != nullptr ? output->info() : input->info(), conv_info, depth_multiplier, act_info, dilation));
+    ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
 
     _is_quantized = is_data_type_quantized(input->info()->data_type());
     _is_prepared  = false;
@@ -164,10 +99,12 @@ void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::configure(
     _output            = output;
     _needs_permute     = input->info()->data_layout() == DataLayout::NCHW;
 
+    const GPUTarget gpu_target = CLScheduler::get().target();
+
     ICLTensor       *input_to_use   = input;
     const ICLTensor *weights_to_use = weights;
     ICLTensor       *output_to_use  = output;
-    if(_needs_permute)
+    if (_needs_permute)
     {
         _memory_group.manage(&_permuted_input);
         _memory_group.manage(&_permuted_output);
@@ -190,10 +127,12 @@ void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::configure(
     CLTensor *output_multipliers_to_use = nullptr;
     CLTensor *output_shifts_to_use      = nullptr;
-    if(_is_quantized)
+    if (_is_quantized)
     {
-        const size_t idx_c       = get_data_layout_dimension_index(weights->info()->data_layout(), DataLayoutDimension::CHANNEL);
-        const size_t num_filters = (is_data_type_quantized_per_channel(weights->info()->data_type())) ? weights->info()->dimension(idx_c) : 1;
+        const size_t idx_c =
+            get_data_layout_dimension_index(weights->info()->data_layout(), DataLayoutDimension::CHANNEL);
+        const size_t num_filters =
+            (is_data_type_quantized_per_channel(weights->info()->data_type())) ? weights->info()->dimension(idx_c) : 1;
 
         _output_multipliers.allocator()->init(TensorInfo(TensorShape(num_filters), 1, DataType::S32));
         _output_shifts.allocator()->init(TensorInfo(TensorShape(num_filters), 1, DataType::S32));
@@ -202,15 +141,19 @@ void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::configure(
         output_shifts_to_use      = &_output_shifts;
     }
 
-    DWCWeightsKernelInfo dwc_weights_info;
-    dwc_weights_info.n0 = (depth_multiplier == 1) ? 8 : 1;
-    DWCKernelInfo dwc_info;
-    dwc_info.activation_info = act_info;
-    _dwc_native_kernel.configure(compile_context, input_to_use, weights_to_use, biases, output_to_use,
-                                 dwc_weights_info, dwc_info, conv_info, depth_multiplier, dilation,
-                                 output_multipliers_to_use, output_shifts_to_use);
+    // Get the depthwise convolution compute parameters
+    auto t = ClDWCNativeKernelConfigurationFactory::create(gpu_target);
+    const DWCComputeKernelInfo dwc_native_compute_info =
+        t->configure(input_to_use->info(), weights_to_use->info(), conv_info, dilation, depth_multiplier);
+
+    const ConvolutionInfo conv_kernel_info{conv_info, depth_multiplier, act_info, dilation};
 
-    if(_needs_permute)
+    _dwc_native_kernel->set_target(gpu_target);
+    _dwc_native_kernel->configure(compile_context, input_to_use, weights_to_use, biases, output_to_use,
+                                  dwc_native_compute_info, conv_kernel_info, output_multipliers_to_use,
+                                  output_shifts_to_use);
+
+    if (_needs_permute)
     {
         _permuted_input.allocator()->allocate();
@@ -220,37 +163,51 @@ void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::configure(
         _permuted_output.allocator()->allocate();
     }
 
-    if(_is_quantized)
+    if (_is_quantized)
    {
         _output_multipliers.allocator()->allocate();
         _output_shifts.allocator()->allocate();
     }
 }
 
-Status CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
-                                                                                 const PadStrideInfo &conv_info,
-                                                                                 unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation)
+Status CLDepthwiseConvolutionLayer::validate(const ITensorInfo *input,
+                                             const ITensorInfo *weights,
+                                             const ITensorInfo *biases,
+                                             const ITensorInfo *output,
+                                             const PadStrideInfo &conv_info,
+                                             unsigned int depth_multiplier,
+                                             ActivationLayerInfo act_info,
+                                             const Size2D &dilation)
 {
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(!weights->are_values_constant(), "Dynamic weights are not supported");
+
+    const bool in_place = input == output || output == nullptr;
+    if (in_place)
+    {
+        output = input;
+    }
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
 
     const size_t idx_w = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
     const size_t idx_h = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
 
-    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) + (weights->dimension(idx_w) - 1) * (dilation.x() - 1) > input->dimension(idx_w) + conv_info.pad_left() + conv_info.pad_right());
-    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_h) + (weights->dimension(idx_h) - 1) * (dilation.y() - 1) > input->dimension(idx_h) + conv_info.pad_top() + conv_info.pad_bottom());
+    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) + (weights->dimension(idx_w) - 1) * (dilation.x() - 1) >
+                                input->dimension(idx_w) + conv_info.pad_left() + conv_info.pad_right());
+    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_h) + (weights->dimension(idx_h) - 1) * (dilation.y() - 1) >
+                                input->dimension(idx_h) + conv_info.pad_top() + conv_info.pad_bottom());
 
-    DWCWeightsKernelInfo dwc_weights_info;
-    dwc_weights_info.n0 = (depth_multiplier == 1) ? 8 : 1;
-    DWCKernelInfo dwc_info;
-    dwc_info.activation_info = act_info;
+    const GPUTarget gpu_target = CLScheduler::get().target();
+
+    const ConvolutionInfo conv_kernel_info{conv_info, depth_multiplier, act_info, dilation};
 
     const bool needs_permute = input->data_layout() == DataLayout::NCHW;
     const bool is_quantized  = is_data_type_quantized(input->data_type());
 
     TensorInfo output_multipliers_shifts_info(TensorInfo(TensorShape(1U), 1, DataType::S32));
-    if(is_quantized)
+    if (is_quantized)
     {
-        if(is_data_type_quantized_per_channel(weights->data_type()))
+        if (is_data_type_quantized_per_channel(weights->data_type()))
         {
             ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QSYMM8_PER_CHANNEL);
@@ -263,72 +220,95 @@ Status CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::validate
         }
     }
 
-    if(needs_permute)
+    if (needs_permute)
     {
-        TensorShape permuted_input_shape   = input->tensor_shape();
-        TensorShape permuted_weights_shape = weights->tensor_shape();
-        TensorShape permuted_output_shape  = shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier, dilation);
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(in_place, "In-place is supported only with NHWC data layout");
+        TensorShape permuted_input_shape   = input->tensor_shape();
+        TensorShape permuted_weights_shape = weights->tensor_shape();
+        const ConvolutionInfo info{conv_info, depth_multiplier, ActivationLayerInfo(), dilation};
+        TensorShape permuted_output_shape =
+            shape_calculator::compute_depthwise_convolution_shape(*input, *weights, info);
 
         permute(permuted_input_shape, PermutationVector(2U, 0U, 1U));
         permute(permuted_weights_shape, PermutationVector(2U, 0U, 1U));
         permute(permuted_output_shape, PermutationVector(2U, 0U, 1U));
 
-        const TensorInfo permuted_input   = input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_input_shape).set_data_layout(DataLayout::NHWC);
-        const TensorInfo permuted_weights = weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_weights_shape).set_data_layout(DataLayout::NHWC);
-        const TensorInfo permuted_output  = output->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_output_shape).set_data_layout(DataLayout::NHWC);
+        const TensorInfo permuted_input   = input->clone()
+                                                ->set_is_resizable(true)
+                                                .reset_padding()
+                                                .set_tensor_shape(permuted_input_shape)
+                                                .set_data_layout(DataLayout::NHWC);
+        const TensorInfo permuted_weights = weights->clone()
+                                                ->set_is_resizable(true)
+                                                .reset_padding()
+                                                .set_tensor_shape(permuted_weights_shape)
+                                                .set_data_layout(DataLayout::NHWC);
+        const TensorInfo permuted_output  = output->clone()
+                                                ->set_is_resizable(true)
+                                                .reset_padding()
+                                                .set_tensor_shape(permuted_output_shape)
+                                                .set_data_layout(DataLayout::NHWC);
 
         ARM_COMPUTE_RETURN_ON_ERROR(CLPermute::validate(input, &permuted_input, PermutationVector(2U, 0U, 1U)));
         ARM_COMPUTE_RETURN_ON_ERROR(CLPermute::validate(weights, &permuted_weights, PermutationVector(2U, 0U, 1U)));
-        ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayerNativeKernel::validate(&permuted_input, &permuted_weights, biases, &permuted_output, dwc_weights_info,
-                                                                                      dwc_info, conv_info, depth_multiplier, dilation,
-                                                                                      &output_multipliers_shifts_info, &output_multipliers_shifts_info));
+
+        // Get the depthwise convolution compute parameters
+        auto t = ClDWCNativeKernelConfigurationFactory::create(gpu_target);
+        const DWCComputeKernelInfo dwc_native_compute_info =
+            t->configure(&permuted_input, &permuted_weights, conv_info, dilation, depth_multiplier);
+
+        ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayerNativeKernel::validate(
+            &permuted_input, &permuted_weights, biases, &permuted_output, dwc_native_compute_info, conv_kernel_info,
+            &output_multipliers_shifts_info, &output_multipliers_shifts_info));
         ARM_COMPUTE_RETURN_ON_ERROR(CLPermute::validate(&permuted_output, output, PermutationVector(1U, 2U, 0U)));
     }
     else
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayerNativeKernel::validate(input, weights, biases, output, dwc_weights_info, dwc_info, conv_info, depth_multiplier,
-                                                                                      dilation, &output_multipliers_shifts_info, &output_multipliers_shifts_info));
+        // Get the depthwise convolution compute parameters
+        auto t = ClDWCNativeKernelConfigurationFactory::create(gpu_target);
+        const DWCComputeKernelInfo dwc_native_compute_info =
+            t->configure(input, weights, conv_info, dilation, depth_multiplier);
+        ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayerNativeKernel::validate(
+            input, weights, biases, output, dwc_native_compute_info, conv_kernel_info, &output_multipliers_shifts_info,
+            &output_multipliers_shifts_info));
     }
     return Status{};
 }
 
-void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::run()
+void CLDepthwiseConvolutionLayer::run()
 {
     prepare();
 
     MemoryGroupResourceScope scope_mg(_memory_group);
 
-    if(_needs_permute)
+    if (_needs_permute)
     {
         _permute_input_to_nhwc.run();
     }
-    CLScheduler::get().enqueue(_dwc_native_kernel);
-    if(_needs_permute)
+    CLScheduler::get().enqueue(*_dwc_native_kernel);
+    if (_needs_permute)
    {
         _permute_output_to_nchw.run();
     }
 }
 
-void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::prepare()
+void CLDepthwiseConvolutionLayer::prepare()
 {
-    if(!_is_prepared)
+    if (!_is_prepared)
     {
-        if(_is_quantized)
+        if (_is_quantized)
         {
             _output_multipliers.map();
             _output_shifts.map();
-            const unsigned int idx_ofms = get_data_layout_dimension_index(_output->info()->data_layout(), DataLayoutDimension::CHANNEL);
-            quantization::compute_quantized_multipliers_and_shifts(_input->info(),
-                                                                   _original_weights->info(),
-                                                                   _output->info(),
-                                                                   idx_ofms,
-                                                                   reinterpret_cast<int32_t *>(_output_multipliers.ptr_to_element(Coordinates(0))),
-                                                                   reinterpret_cast<int32_t *>(_output_shifts.ptr_to_element(Coordinates(0))));
+            quantization::compute_quantized_multipliers_and_shifts(
+                _input->info(), _original_weights->info(), _output != nullptr ? _output->info() : _input->info(),
+                reinterpret_cast<int32_t *>(_output_multipliers.ptr_to_element(Coordinates(0))),
+                reinterpret_cast<int32_t *>(_output_shifts.ptr_to_element(Coordinates(0))));
             _output_multipliers.unmap();
             _output_shifts.unmap();
         }
 
-        if(_needs_permute)
+        if (_needs_permute)
        {
             ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
 
@@ -339,315 +319,4 @@ void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::prepare()
         _is_prepared = true;
     }
 }
-
-CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerInternal3x3::CLDepthwiseConvolutionLayerInternal3x3(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)),
-      _kernel(nullptr),
-      _border_handler(),
-      _permute_input_to_nchw(),
-      _permute_weights_to_nchw(),
-      _permute_output_to_nhwc(),
-      _reshape_weights(),
-      _permuted_input(),
-      _permuted_weights(),
-      _permuted_output(),
-      _output_multipliers(),
-      _output_shifts(),
-      _original_weights(nullptr),
-      _input(nullptr),
-      _output(nullptr),
-      _needs_permute(false),
-      _needs_weights_reshape(false),
-      _is_prepared(false),
-      _is_quantized(false)
-{
-}
-
-void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerInternal3x3::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output,
-                                                                                    const PadStrideInfo &conv_info, unsigned int depth_multiplier, ActivationLayerInfo act_info, const Size2D &dilation)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
-}
-
-void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerInternal3x3::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases,
-                                                                                    ICLTensor *output,
-                                                                                    const PadStrideInfo &conv_info, unsigned int depth_multiplier, ActivationLayerInfo act_info, const Size2D &dilation)
-{
-    const GPUTarget gpu_target = CLScheduler::get().target();
-
-    // Perform validation step
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
-    ARM_COMPUTE_ERROR_THROW_ON(CLDepthwiseConvolutionLayerInternal3x3::validate(input->info(),
-                                                                                weights->info(),
-                                                                                biases != nullptr ? biases->info() : nullptr,
-                                                                                output->info(),
-                                                                                conv_info,
-                                                                                depth_multiplier,
-                                                                                act_info,
-                                                                                gpu_target,
-                                                                                dilation));
-
-    const bool is_nhwc     = input->info()->data_layout() == DataLayout::NHWC;
-    _is_quantized          = is_data_type_quantized_asymmetric(input->info()->data_type());
-    _needs_permute         = is_nhwc && (depth_multiplier > 1);
-    _needs_weights_reshape = is_nhwc && (depth_multiplier == 1) && _is_quantized;
-
-    _is_prepared      = false;
-    _original_weights = weights;
-    _input            = input;
-    _output           = output;
-
-    ICLTensor       *input_to_use   = input;
-    const ICLTensor *weights_to_use = weights;
-    ICLTensor       *output_to_use  = output;
-
-    const bool is_quantized_per_channel = is_data_type_quantized_per_channel(weights->info()->data_type());
-    const bool is_stride_1              = ((conv_info.stride().first == conv_info.stride().second) && (conv_info.stride().first == 1));
-    const bool is_dot8_supported        = dot8_supported(CLKernelLibrary::get().get_device()) && !is_quantized_per_channel;
-    const bool is_stride_1_dilation_1   = (is_stride_1 && dilation.x() == 1 && dilation.y() == 1);
-
-    DepthwiseConvolutionReshapeInfo info;
-    info.c0        = 4;
-    info.transpose = is_stride_1_dilation_1 && is_dot8_supported;
-
-    if(_needs_permute)
-    {
-        _memory_group.manage(&_permuted_input);
-        _memory_group.manage(&_permuted_output);
-
-        // Configure the function to transform the input tensor from NHWC -> NCHW
-        _permute_input_to_nchw.configure(compile_context, input, &_permuted_input, PermutationVector(1U, 2U, 0U));
-        _permuted_input.info()->set_data_layout(DataLayout::NCHW);
-
-        // Configure the function to transform the weights tensor from HWI -> IHW
-        _permute_weights_to_nchw.configure(compile_context, weights, &_permuted_weights, PermutationVector(1U, 2U, 0U));
-        _permuted_weights.info()->set_data_layout(DataLayout::NCHW);
-        _permuted_output.info()->set_quantization_info(output->info()->quantization_info());
-
-        input_to_use   = &_permuted_input;
-        weights_to_use = &_permuted_weights;
-        output_to_use  = &_permuted_output;
-
-        _kernel = arm_compute::support::cpp14::make_unique<CLDepthwiseConvolutionLayer3x3NCHWKernel>();
-    }
-    else if(is_nhwc)
-    {
-        if(_needs_weights_reshape)
-        {
-            _reshape_weights.configure(compile_context, weights, &_permuted_weights, info);
-            weights_to_use = &_permuted_weights;
-        }
-        _kernel = arm_compute::support::cpp14::make_unique<CLDepthwiseConvolutionLayer3x3NHWCKernel>();
-    }
-    else
-    {
-        _kernel = arm_compute::support::cpp14::make_unique<CLDepthwiseConvolutionLayer3x3NCHWKernel>();
-    }
-
-    CLTensor *output_multipliers_to_use = nullptr;
-    CLTensor *output_shifts_to_use      = nullptr;
-    if(_is_quantized)
-    {
-        const size_t idx_c       = get_data_layout_dimension_index(weights->info()->data_layout(), DataLayoutDimension::CHANNEL);
-        const size_t num_filters = (is_quantized_per_channel) ? weights->info()->dimension(idx_c) : 1;
-
-        _output_multipliers.allocator()->init(TensorInfo(TensorShape(num_filters), 1, DataType::S32));
-        _output_shifts.allocator()->init(TensorInfo(TensorShape(num_filters), 1, DataType::S32));
-
-        output_multipliers_to_use = &_output_multipliers;
-        output_shifts_to_use      = &_output_shifts;
-    }
-
-    // Configure kernel
-    _kernel->set_target(gpu_target);
-    _kernel->configure(compile_context, input_to_use, weights_to_use, biases, output_to_use, conv_info, depth_multiplier,
-                       act_info, dilation, output_multipliers_to_use, output_shifts_to_use);
-
-    if(_is_quantized)
-    {
-        _output_multipliers.allocator()->allocate();
-        _output_shifts.allocator()->allocate();
-    }
-
-    // Permute output if needed
-    if(_needs_permute)
-    {
-        // Configure the function to transform the convoluted output to ACL's native ordering format NCHW
-        _permuted_output.info()->set_data_layout(DataLayout::NCHW);
-        _permute_output_to_nhwc.configure(compile_context, &_permuted_output, output, PermutationVector(2U, 0U, 1U));
-
-        // Allocate tensors
-        _permuted_input.allocator()->allocate();
-        _permuted_output.allocator()->allocate();
-    }
-    // Configure border handler
-    PixelValue &&zero_value(0.f);
-    if(is_data_type_quantized_asymmetric(input->info()->data_type()))
-    {
-        zero_value = PixelValue(static_cast<uint8_t>(input->info()->quantization_info().uniform().offset));
-    }
-    _border_handler.configure(compile_context, input_to_use, _kernel->border_size(), BorderMode::CONSTANT, zero_value);
-}
-
-Status CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerInternal3x3::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
-                                                                                     const PadStrideInfo &conv_info, unsigned int depth_multiplier, ActivationLayerInfo act_info, GPUTarget gpu_target, const Size2D &dilation)
-{
-    return validate_arguments_3x3(input, weights, biases, output, conv_info, depth_multiplier, act_info, gpu_target, dilation);
-}
-
-void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerInternal3x3::run()
-{
-    prepare();
-
-    MemoryGroupResourceScope scope_mg(_memory_group);
-
-    if(_needs_permute)
-    {
-        _permute_input_to_nchw.run();
-    }
-    CLScheduler::get().enqueue(_border_handler);
-    CLScheduler::get().enqueue(*_kernel);
-
-    if(_needs_permute)
-    {
-        _permute_output_to_nhwc.run();
-    }
-}
-
-void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerInternal3x3::prepare()
-{
-    if(!_is_prepared)
-    {
-        if(_is_quantized)
-        {
-            _output_multipliers.map();
-            _output_shifts.map();
-            const unsigned int idx_ofms = get_data_layout_dimension_index(_output->info()->data_layout(), DataLayoutDimension::CHANNEL);
-            quantization::compute_quantized_multipliers_and_shifts(_input->info(),
-                                                                   _original_weights->info(),
-                                                                   _output->info(),
-                                                                   idx_ofms,
-                                                                   reinterpret_cast<int32_t *>(_output_multipliers.ptr_to_element(Coordinates(0))),
-                                                                   reinterpret_cast<int32_t *>(_output_shifts.ptr_to_element(Coordinates(0))));
-            _output_multipliers.unmap();
-            _output_shifts.unmap();
-        }
-
-        if(_needs_permute)
-        {
-            ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
-
-            _permuted_weights.allocator()->allocate();
-            _permute_weights_to_nchw.run();
-            _original_weights->mark_as_unused();
-        }
-
-        if(_needs_weights_reshape)
-        {
-            ARM_COMPUTE_ERROR_ON(_needs_permute);
-            ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
-            _permuted_weights.allocator()->allocate();
-            CLScheduler::get().enqueue(_reshape_weights);
-            _original_weights->mark_as_unused();
-        }
-        _is_prepared = true;
-    }
-}
-
-CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_manager(std::move(memory_manager)), _depth_conv_func(DepthwiseConvolutionFunction::GENERIC), _func_3x3(), _func_generic()
-{
-}
-
-void CLDepthwiseConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier,
-                                            ActivationLayerInfo act_info, const Size2D &dilation)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
-}
-
-void CLDepthwiseConvolutionLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output,
-                                            const PadStrideInfo &conv_info,
-                                            unsigned int depth_multiplier,
-                                            ActivationLayerInfo act_info, const Size2D &dilation)
-{
-    const GPUTarget gpu_target = CLScheduler::get().target();
-    _depth_conv_func           = get_depthwiseconvolution_function(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), conv_info, depth_multiplier, act_info,
-                                                                   dilation, gpu_target);
-    switch(_depth_conv_func)
-    {
-        case DepthwiseConvolutionFunction::OPTIMIZED:
-            _func_3x3.set_memory_group(_memory_manager);
-            _func_3x3.configure(compile_context, input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
-            break;
-        case DepthwiseConvolutionFunction::GENERIC:
-        {
-            _func_generic.set_memory_group(_memory_manager);
-            _func_generic.configure(compile_context, input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
-        }
-        break;
-        default:
-            ARM_COMPUTE_ERROR("Unsupported DepthwiseConvolutionFunction");
-    }
-}
-
-Status CLDepthwiseConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
-                                             unsigned int depth_multiplier, ActivationLayerInfo act_info, const Size2D &dilation)
-{
-    const GPUTarget              gpu_target      = CLScheduler::get().target();
-    DepthwiseConvolutionFunction depth_conv_func = get_depthwiseconvolution_function(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation, gpu_target);
-    switch(depth_conv_func)
-    {
-        case DepthwiseConvolutionFunction::OPTIMIZED:
-            return CLDepthwiseConvolutionLayerInternal3x3::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, gpu_target, dilation);
-        case DepthwiseConvolutionFunction::GENERIC:
-            return CLDepthwiseConvolutionLayerGeneric::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
-        default:
-            ARM_COMPUTE_ERROR("Unsupported DepthwiseConvolutionFunction");
-    }
-}
-
-DepthwiseConvolutionFunction CLDepthwiseConvolutionLayer::get_depthwiseconvolution_function(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
-                                                                                            const PadStrideInfo &conv_info,
-                                                                                            unsigned int depth_multiplier, ActivationLayerInfo act_info, const Size2D &dilation, GPUTarget gpu_target)
-{
-    if(bool(CLDepthwiseConvolutionLayerInternal3x3::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, gpu_target, dilation)) && (is_data_type_float(input->data_type())
-                                                                                                                                                               || get_arch_from_target(gpu_target) == GPUTarget::MIDGARD))
-    {
-        return DepthwiseConvolutionFunction::OPTIMIZED;
-    }
-    else
-    {
-        return DepthwiseConvolutionFunction::GENERIC;
-    }
-}
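The hunks above fold the old Generic/Internal3x3 dispatch into a single native-kernel path, pick tile parameters through ClDWCNativeKernelConfigurationFactory, and let validate() accept a null (or aliased) output for in-place NHWC execution. A minimal driver for the refactored function is sketched below; the shapes, the F32 data type and the pad/stride values are illustrative assumptions, not part of the diff.

    // Hedged usage sketch for the refactored CLDepthwiseConvolutionLayer.
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h"

    using namespace arm_compute;

    void run_depthwise_example()
    {
        CLScheduler::get().default_init();

        // NHWC tensors: dimension 0 is the channel, so a 28x28 image with
        // 32 channels is (C, W, H) = (32, 28, 28).
        CLTensor   src, weights, dst;
        TensorInfo src_info(TensorShape(32U, 28U, 28U), 1, DataType::F32);
        src_info.set_data_layout(DataLayout::NHWC);
        src.allocator()->init(src_info);

        TensorInfo w_info(TensorShape(32U, 3U, 3U), 1, DataType::F32); // 3x3 depthwise weights
        w_info.set_data_layout(DataLayout::NHWC);
        weights.allocator()->init(w_info);

        CLDepthwiseConvolutionLayer dwc;
        // Stride 1, pad 1 (SAME for 3x3), depth_multiplier 1, no fused activation.
        // dst's info is assumed to be auto-initialised here, as is usual for
        // ACL functions when the output info is empty.
        dwc.configure(&src, &weights, nullptr /* biases */, &dst, PadStrideInfo(1, 1, 1, 1),
                      1, ActivationLayerInfo(), Size2D(1, 1));

        src.allocator()->allocate();
        weights.allocator()->allocate();
        dst.allocator()->allocate();
        // ... fill src and weights, then:
        dwc.run(); // run() calls prepare() internally on first use
    }

Per the validate() logic above, passing the input again (or a null output) selects the in-place path, which is accepted only for NHWC data.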
-
-void CLDepthwiseConvolutionLayer::run()
-{
-    switch(_depth_conv_func)
-    {
-        case DepthwiseConvolutionFunction::OPTIMIZED:
-            _func_3x3.run();
-            break;
-        case DepthwiseConvolutionFunction::GENERIC:
-            _func_generic.run();
-            break;
-        default:
-            ARM_COMPUTE_ERROR("DepthwiseConvolutionFunction not properly configured");
-    }
-}
-
-void CLDepthwiseConvolutionLayer::prepare()
-{
-    switch(_depth_conv_func)
-    {
-        case DepthwiseConvolutionFunction::OPTIMIZED:
-            _func_3x3.prepare();
-            break;
-        case DepthwiseConvolutionFunction::GENERIC:
-            _func_generic.prepare();
-            break;
-        default:
-            ARM_COMPUTE_ERROR("DepthwiseConvolutionFunction not properly configured");
-    }
-}
 } // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLDequantizationLayer.cpp b/src/runtime/CL/functions/CLDequantizationLayer.cpp
index 362b36cc95..20162a03db 100644
--- a/src/runtime/CL/functions/CLDequantizationLayer.cpp
+++ b/src/runtime/CL/functions/CLDequantizationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -23,25 +23,55 @@
 */
 #include "arm_compute/runtime/CL/functions/CLDequantizationLayer.h"
 
-#include "arm_compute/core/CL/kernels/CLDequantizationLayerKernel.h"
-#include "support/MemorySupport.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/KernelDescriptors.h"
+
+#include "src/common/utils/Log.h"
+#include "src/core/CL/ICLKernel.h"
+#include "src/gpu/cl/operators/ClDequantize.h"
 
 namespace arm_compute
 {
+struct CLDequantizationLayer::Impl
+{
+    const ICLTensor                      *src{nullptr};
+    ICLTensor                            *dst{nullptr};
+    std::unique_ptr<opencl::ClDequantize> op{nullptr};
+};
+
+CLDequantizationLayer::CLDequantizationLayer() : _impl(std::make_unique<Impl>())
+{
+}
+CLDequantizationLayer::~CLDequantizationLayer() = default;
+
 void CLDequantizationLayer::configure(const ICLTensor *input, ICLTensor *output)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input, output);
 }
 
-void CLDequantizationLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output)
+void CLDequantizationLayer::configure(const CLCompileContext &compile_context,
+                                      const ICLTensor *input,
+                                      ICLTensor *output)
 {
-    auto k = arm_compute::support::cpp14::make_unique<CLDequantizationLayerKernel>();
-    k->configure(compile_context, input, output);
-    _kernel = std::move(k);
+    ARM_COMPUTE_LOG_PARAMS(input, output);
+    _impl->src = input;
+    _impl->dst = output;
+
+    _impl->op = std::make_unique<opencl::ClDequantize>();
+    _impl->op->configure(compile_context, input->info(), output->info());
 }
 
 Status CLDequantizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
 {
-    return CLDequantizationLayerKernel::validate(input, output);
+    return opencl::ClDequantize::validate(input, output);
+}
+
+void CLDequantizationLayer::run()
+{
+    ITensorPack pack;
+    pack.add_tensor(TensorType::ACL_SRC, _impl->src);
+    pack.add_tensor(TensorType::ACL_DST, _impl->dst);
+    _impl->op->run(pack);
 }
 } // namespace arm_compute
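CLDequantizationLayer is the simplest instance of the porting pattern this changeset applies file after file: the public function keeps its legacy signature, owns a pimpl struct, configures a src/gpu/cl operator on ITensorInfo only, and binds the actual tensors at run() time through an ITensorPack. A distilled sketch of the pattern follows; MyFunction and opencl::ClOp are placeholder names standing in for the per-function operator types (ClDequantize, ClDirectConv2d, ClAdd, ...), not library API.

    // Distilled wrapper pattern (placeholder names, not library API).
    struct MyFunction::Impl
    {
        const ICLTensor              *src{nullptr};
        ICLTensor                    *dst{nullptr};
        std::unique_ptr<opencl::ClOp> op{nullptr};
    };

    void MyFunction::configure(const CLCompileContext &ctx, const ICLTensor *input, ICLTensor *output)
    {
        _impl->src = input;
        _impl->dst = output;
        _impl->op  = std::make_unique<opencl::ClOp>();
        // Operators see only metadata here; no device buffers are captured yet.
        _impl->op->configure(ctx, input->info(), output->info());
    }

    void MyFunction::run()
    {
        // Tensors are bound late, per call, so one configured operator can be
        // reused with different backing memory.
        ITensorPack pack;
        pack.add_tensor(TensorType::ACL_SRC, _impl->src);
        pack.add_tensor(TensorType::ACL_DST, _impl->dst);
        _impl->op->run(pack);
    }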
diff --git a/src/runtime/CL/functions/CLDerivative.cpp b/src/runtime/CL/functions/CLDerivative.cpp
deleted file mode 100644
index 68d3752463..0000000000
--- a/src/runtime/CL/functions/CLDerivative.cpp
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLDerivative.h"
-
-#include "arm_compute/core/CL/kernels/CLDerivativeKernel.h"
-#include "arm_compute/core/PixelValue.h"
-#include "support/MemorySupport.h"
-
-#include <utility>
-
-using namespace arm_compute;
-
-void CLDerivative::configure(ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output_x, output_y, border_mode, constant_border_value);
-}
-
-void CLDerivative::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value)
-{
-    auto k = arm_compute::support::cpp14::make_unique<CLDerivativeKernel>();
-    k->configure(compile_context, input, output_x, output_y, border_mode == BorderMode::UNDEFINED);
-    _kernel = std::move(k);
-    _border_handler.configure(compile_context, input, BorderSize(1), border_mode, PixelValue(constant_border_value));
-}
diff --git a/src/runtime/CL/functions/CLDilate.cpp b/src/runtime/CL/functions/CLDilate.cpp
deleted file mode 100644
index 05351a9de3..0000000000
--- a/src/runtime/CL/functions/CLDilate.cpp
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLDilate.h"
-
-#include "arm_compute/core/CL/kernels/CLDilateKernel.h"
-#include "arm_compute/core/PixelValue.h"
-#include "support/MemorySupport.h"
-
-#include <utility>
-
-using namespace arm_compute;
-
-void CLDilate::configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output, border_mode, constant_border_value);
-}
-
-void CLDilate::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value)
-{
-    auto k = arm_compute::support::cpp14::make_unique<CLDilateKernel>();
-    k->configure(compile_context, input, output, border_mode == BorderMode::UNDEFINED);
-    _kernel = std::move(k);
-    _border_handler.configure(compile_context, input, BorderSize(1), border_mode, PixelValue(constant_border_value));
-}
diff --git a/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp b/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp
index 6e9782f77a..d6dae0d732 100644
--- a/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -24,76 +24,81 @@
 #include "arm_compute/runtime/CL/functions/CLDirectConvolutionLayer.h"
 
 #include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/kernels/CLDirectConvolutionLayerKernel.h"
 #include "arm_compute/core/PixelValue.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
 
-using namespace arm_compute;
+#include "src/common/utils/Log.h"
+#include "src/gpu/cl/operators/ClActivation.h"
+#include "src/gpu/cl/operators/ClDirectConv2d.h"
 
-CLDirectConvolutionLayer::CLDirectConvolutionLayer()
-    : _direct_conv_kernel(), _input_border_handler(), _activationlayer_function(), _is_activationlayer_enabled(false)
+namespace arm_compute
+{
+struct CLDirectConvolutionLayer::Impl
+{
+    const ICLTensor                        *src{nullptr};
+    const ICLTensor                        *weights{nullptr};
+    const ICLTensor                        *biases{nullptr};
+    ICLTensor                              *dst{nullptr};
+    std::unique_ptr<opencl::ClDirectConv2d> op{nullptr};
+};
+
+CLDirectConvolutionLayer::CLDirectConvolutionLayer() : _impl(std::make_unique<Impl>())
 {
 }
+CLDirectConvolutionLayer::CLDirectConvolutionLayer(CLDirectConvolutionLayer &&) = default;
+CLDirectConvolutionLayer &CLDirectConvolutionLayer::operator=(CLDirectConvolutionLayer &&) = default;
+CLDirectConvolutionLayer::~CLDirectConvolutionLayer() = default;
 
-void CLDirectConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info)
+void CLDirectConvolutionLayer::configure(ICLTensor *input,
+                                         const ICLTensor *weights,
+                                         const ICLTensor *biases,
+                                         ICLTensor *output,
+                                         const PadStrideInfo &conv_info,
+                                         const ActivationLayerInfo &act_info)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, act_info);
 }
 
-void CLDirectConvolutionLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output,
-                                         const PadStrideInfo &conv_info,
+void CLDirectConvolutionLayer::configure(const CLCompileContext &compile_context,
+                                         ICLTensor *input,
+                                         const ICLTensor *weights,
+                                         const ICLTensor *biases,
+                                         ICLTensor *output,
+                                         const PadStrideInfo &conv_info,
                                          const ActivationLayerInfo &act_info)
 {
-    // Set GPU target
-    _direct_conv_kernel.set_target(CLScheduler::get().target());
-
-    // Configure direct convolution
-    _direct_conv_kernel.configure(compile_context, input, weights, biases, output, conv_info);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+    ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, act_info);
 
-    // Configure border handler
-    PixelValue &&zero_value(0.f);
-    if(is_data_type_quantized_asymmetric(input->info()->data_type()))
-    {
-        zero_value = PixelValue(0, input->info()->data_type(), input->info()->quantization_info());
-    }
-    _input_border_handler.configure(compile_context, input, _direct_conv_kernel.border_size(), BorderMode::CONSTANT, zero_value);
+    _impl->src     = input;
+    _impl->weights = weights;
+    _impl->biases  = biases;
+    _impl->dst     = output;
 
-    // Tune kernels
-    CLScheduler::get().tune_kernel_static(_direct_conv_kernel);
-
-    _is_activationlayer_enabled = act_info.enabled();
-
-    //Configure Activation Layer
-    if(_is_activationlayer_enabled)
-    {
-        _activationlayer_function.configure(compile_context, output, nullptr, act_info);
-    }
+    _impl->op = std::make_unique<opencl::ClDirectConv2d>();
+    _impl->op->configure(compile_context, input->info(), weights->info(),
+                         (biases != nullptr) ? biases->info() : nullptr, output->info(), conv_info, act_info);
 }
 
-Status CLDirectConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
+Status CLDirectConvolutionLayer::validate(const ITensorInfo *input,
+                                          const ITensorInfo *weights,
+                                          const ITensorInfo *biases,
+                                          const ITensorInfo *output,
+                                          const PadStrideInfo &conv_info,
                                           const ActivationLayerInfo &act_info)
 {
-    ARM_COMPUTE_RETURN_ON_ERROR(CLDirectConvolutionLayerKernel::validate(input, weights, biases, output, conv_info, CLScheduler::get().target()));
-    if(act_info.enabled())
-    {
-        ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(output, nullptr, act_info));
-    }
-    return Status{};
+    return opencl::ClDirectConv2d::validate(input, weights, biases, output, conv_info, act_info);
 }
 
 void CLDirectConvolutionLayer::run()
 {
-    // Run border handler
-    CLScheduler::get().enqueue(_input_border_handler, false);
-
-    // Run direct convolution
-    CLScheduler::get().enqueue(_direct_conv_kernel);
-
-    //Run Activation Layer
-    if(_is_activationlayer_enabled)
-    {
-        _activationlayer_function.run();
-    }
+    ITensorPack pack;
+    pack.add_tensor(TensorType::ACL_SRC, _impl->src);
+    pack.add_tensor(TensorType::ACL_SRC_1, _impl->weights);
+    pack.add_tensor(TensorType::ACL_SRC_2, _impl->biases);
+    pack.add_tensor(TensorType::ACL_DST, _impl->dst);
+    _impl->op->run(pack);
 }
+} // namespace arm_compute
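With border handling and the optional activation now fused inside opencl::ClDirectConv2d, the function body reduces to the same pack dispatch, with weights and biases in the ACL_SRC_1/ACL_SRC_2 slots. Callers can still probe support up front through the static validate() that configure() invokes internally; a hedged sketch, assuming src, weights and dst are CLTensors with initialised infos and using illustrative conv parameters:

    const PadStrideInfo conv_info(1, 1, 0, 0); // stride 1, no padding (illustrative)
    Status s = CLDirectConvolutionLayer::validate(src.info(), weights.info(), nullptr,
                                                  dst.info(), conv_info, ActivationLayerInfo());
    if (bool(s)) // Status converts to bool: true means the configuration is supported
    {
        CLDirectConvolutionLayer conv;
        conv.configure(&src, &weights, nullptr, &dst, conv_info, ActivationLayerInfo());
        conv.run(); // the activation, when enabled, runs fused inside the operator
    }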
diff --git a/src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp b/src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp
index da16bed3e0..7cd268ab0b 100644
--- a/src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -23,12 +23,18 @@
 */
 #include "arm_compute/runtime/CL/functions/CLDirectDeconvolutionLayer.h"
 
+#include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
 
+#include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h"
+#include "src/core/CL/kernels/CLFillBorderKernel.h"
+#include "src/core/helpers/AutoConfiguration.h"
+
 #include <memory>
 #include <tuple>
 
@@ -49,11 +55,16 @@ CLDirectDeconvolutionLayer::CLDirectDeconvolutionLayer(std::shared_ptr<IMemoryMa
 {
 }
 
-Status CLDirectDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *output, const PadStrideInfo &info,
-                                            const WeightsInfo &weights_info)
+Status CLDirectDeconvolutionLayer::validate(const ITensorInfo *input,
+                                            const ITensorInfo *weights,
+                                            const ITensorInfo *bias,
+                                            ITensorInfo *output,
+                                            const PadStrideInfo &info,
+                                            const WeightsInfo &weights_info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8,
+                                                         DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights);
 
     const DataLayout data_layout = input->data_layout();
@@ -61,18 +72,25 @@ Status CLDirectDeconvolutionLayer::validate(const ITensorInfo *input, const ITen
     const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
     const size_t idx_c = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
 
-    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) != weights->dimension(idx_h));
     ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) < 1);
+    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_h) < 1);
 
-    auto out_dims = deconvolution_output_dimensions(input->dimension(idx_w), input->dimension(idx_h), weights->dimension(idx_w), weights->dimension(idx_h), info);
+    auto out_dims = deconvolution_output_dimensions(input->dimension(idx_w), input->dimension(idx_h),
+                                                    weights->dimension(idx_w), weights->dimension(idx_h), info);
 
     const TensorShape output_shape = compute_deconvolution_output_shape(out_dims, *input, *weights);
 
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, weights);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
 
-    if(bias != nullptr)
+    if (input->data_type() != weights->data_type())
     {
-        if(is_data_type_quantized_asymmetric(input->data_type()))
+        ARM_COMPUTE_RETURN_ERROR_ON(weights->data_type() != DataType::QSYMM8_PER_CHANNEL ||
+                                    !is_data_type_quantized_asymmetric(input->data_type()));
+    }
+
+    if (bias != nullptr)
+    {
+        if (is_data_type_quantized_asymmetric(input->data_type()))
         {
             ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
         }
@@ -91,26 +109,42 @@ Status CLDirectDeconvolutionLayer::validate(const ITensorInfo *input, const ITen
     unsigned int       deconv_pad_y = 0;
     const unsigned int stride_x     = info.stride().first;
     const unsigned int stride_y     = info.stride().second;
-    const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(*input, *weights, stride_x, stride_y, out_dims, deconv_pad_x, deconv_pad_y);
-    TensorInfo scale_out_info(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(scale_out_shape).set_data_layout(data_layout));
+    const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(*input, *weights, stride_x, stride_y,
+                                                                              out_dims, deconv_pad_x, deconv_pad_y);
+    TensorInfo scale_out_info(input->clone()
+                                  ->set_is_resizable(true)
+                                  .reset_padding()
+                                  .set_tensor_shape(scale_out_shape)
+                                  .set_data_layout(data_layout));
     const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
 
     ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionLayerUpsample::validate(input, &scale_out_info, info));
-    ARM_COMPUTE_RETURN_ON_ERROR(CLConvolutionLayer::validate(&scale_out_info, weights, bias, output, conv_info, weights_info));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        CLConvolutionLayer::validate(&scale_out_info, weights, bias, output, conv_info, weights_info));
 
     return Status{};
 }
 
-void CLDirectDeconvolutionLayer::configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &info,
-                                           const WeightsInfo &weights_info)
+void CLDirectDeconvolutionLayer::configure(ICLTensor *input,
+                                           ICLTensor *weights,
+                                           const ICLTensor *bias,
+                                           ICLTensor *output,
+                                           const PadStrideInfo &info,
+                                           const WeightsInfo &weights_info)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input, weights, bias, output, info, weights_info);
 }
 
-void CLDirectDeconvolutionLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &info,
-                                           const WeightsInfo &weights_info)
+void CLDirectDeconvolutionLayer::configure(const CLCompileContext &compile_context,
+                                           ICLTensor *input,
+                                           ICLTensor *weights,
+                                           const ICLTensor *bias,
+                                           ICLTensor *output,
+                                           const PadStrideInfo &info,
+                                           const WeightsInfo &weights_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+    ARM_COMPUTE_LOG_PARAMS(input, weights, bias, output, info, weights_info);
 
     const unsigned int pad_left  = info.pad_left();
     const unsigned int pad_right = info.pad_right();
@@ -127,17 +161,21 @@ void CLDirectDeconvolutionLayer::configure(const CLCompileContext &compile_conte
     _original_weights = weights;
     _flip_axis.allocator()->init(TensorInfo(TensorShape(2U), 1, DataType::U32));
     _weights_flipped.allocator()->init(weights->info()->clone()->set_data_layout(data_layout));
-    _flip_weights.configure(compile_context, weights, &_weights_flipped, &_flip_axis);
+    _flip_weights.configure(compile_context, weights, &_weights_flipped, &_flip_axis, /* use_inverted_axis */ false);
 
-    auto out_dims = deconvolution_output_dimensions(input->info()->dimension(idx_w), input->info()->dimension(idx_h), weights->info()->dimension(idx_w), weights->info()->dimension(idx_h), info);
+    auto out_dims =
+        deconvolution_output_dimensions(input->info()->dimension(idx_w), input->info()->dimension(idx_h),
+                                        weights->info()->dimension(idx_w), weights->info()->dimension(idx_h), info);
 
     const TensorShape output_shape = compute_deconvolution_output_shape(out_dims, *input->info(), *weights->info());
 
     // Output auto initialization if not yet initialized
-    auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape).set_data_layout(data_layout));
+    auto_init_if_empty(*output->info(),
                       input->info()->clone()->set_tensor_shape(output_shape).set_data_layout(data_layout));
 
     // Perform validation step
-    ARM_COMPUTE_ERROR_THROW_ON(CLDirectDeconvolutionLayer::validate(input->info(), weights->info(), bias == nullptr ? nullptr : bias->info(), output->info(), info));
+    ARM_COMPUTE_ERROR_THROW_ON(CLDirectDeconvolutionLayer::validate(
+        input->info(), weights->info(), bias == nullptr ? nullptr : bias->info(), output->info(), info));
 
     _is_prepared = weights_info.retain_internal_weights();
 
@@ -146,7 +184,8 @@ void CLDirectDeconvolutionLayer::configure(const CLCompileContext &compile_conte
     // Find the upsampled dimensions and the padding needed for the convolution with stride 1 in order to match output shape
     unsigned int deconv_pad_x = 0;
     unsigned int deconv_pad_y = 0;
-    const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(*input->info(), *weights->info(), stride_x, stride_y, out_dims, deconv_pad_x, deconv_pad_y);
+    const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(
+        *input->info(), *weights->info(), stride_x, stride_y, out_dims, deconv_pad_x, deconv_pad_y);
 
     unsigned int deconv_pad_left  = pad_right > pad_left ? pad_right - pad_left : 0;
     unsigned int deconv_pad_right = pad_left > pad_right ? pad_left - pad_right : 0;
@@ -167,7 +206,8 @@ void CLDirectDeconvolutionLayer::configure(const CLCompileContext &compile_conte
     _scaled_output.allocator()->init(scale_out_info);
 
     // configure scale function
-    const PadStrideInfo upsample_info(stride_x, stride_y, deconv_pad_left, deconv_pad_right, deconv_pad_top, deconv_pad_bottom, DimensionRoundingType::FLOOR);
+    const PadStrideInfo upsample_info(stride_x, stride_y, deconv_pad_left, deconv_pad_right, deconv_pad_top,
+                                      deconv_pad_bottom, DimensionRoundingType::FLOOR);
     _scale_f.configure(compile_context, input, &_scaled_output, upsample_info);
 
     // Setup the function to convolve the upscaled output
@@ -179,7 +219,7 @@ void CLDirectDeconvolutionLayer::configure(const CLCompileContext &compile_conte
     _flip_axis.allocator()->allocate();
     _flip_axis.map(true);
     auto axis_data = reinterpret_cast<uint32_t *>(_flip_axis.buffer());
-    if(weights->info()->data_layout() == DataLayout::NHWC)
+    if (weights->info()->data_layout() == DataLayout::NHWC)
     {
         axis_data[0] = 1;
         axis_data[1] = 2;
@@ -204,7 +244,7 @@ void CLDirectDeconvolutionLayer::run()
 
 void CLDirectDeconvolutionLayer::prepare()
 {
-    if(!_is_prepared)
+    if (!_is_prepared)
     {
         ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
 
@@ -217,11 +257,10 @@ void CLDirectDeconvolutionLayer::prepare()
         _conv_f.prepare();
 
         // Free flipped weights
-        if(!_weights_flipped.is_used())
+        if (!_weights_flipped.is_used())
        {
             _weights_flipped.allocator()->free();
         }
-
         _is_prepared = true;
     }
 }
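The decomposition this file preserves (and the diff only reflows) is easiest to see with a worked example; the numbers below are illustrative, not from the source. For a 4x4 input, a 3x3 kernel, stride 2 and no padding, the usual transposed-convolution formula gives out = (in - 1) * stride + kernel = (4 - 1) * 2 + 3 = 9. deconvolution_output_dimensions() computes those target dimensions, compute_deconvolution_upsampled_shape() derives the zero-inserted intermediate together with the deconv_pad_* corrections, and the 9x9 result is then produced by CLConvolutionLayer at stride 1 over the upsampled tensor, using the weights flipped via the CLReverse-style _flip_weights function configured above.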
diff --git a/src/runtime/CL/functions/CLElementWiseUnaryLayer.cpp b/src/runtime/CL/functions/CLElementWiseUnaryLayer.cpp
deleted file mode 100644
index ce615327a9..0000000000
--- a/src/runtime/CL/functions/CLElementWiseUnaryLayer.cpp
+++ /dev/null
@@ -1,144 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLElementWiseUnaryLayer.h"
-
-#include "arm_compute/core/CL/kernels/CLElementWiseUnaryLayerKernel.h"
-#include "support/MemorySupport.h"
-
-#include <utility>
-
-namespace arm_compute
-{
-void CLRsqrtLayer::configure(const ICLTensor *input, ICLTensor *output)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output);
-}
-
-void CLRsqrtLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output)
-{
-    auto k = arm_compute::support::cpp14::make_unique<CLElementWiseUnaryLayerKernel>();
-    k->configure(compile_context, input, output, ElementWiseUnary::RSQRT);
-    _kernel = std::move(k);
-}
-Status CLRsqrtLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
-{
-    return CLElementWiseUnaryLayerKernel::validate(input, output, ElementWiseUnary::RSQRT);
-}
-
-void CLExpLayer::configure(const ICLTensor *input, ICLTensor *output)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output);
-}
-
-void CLExpLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output)
-{
-    auto k = arm_compute::support::cpp14::make_unique<CLElementWiseUnaryLayerKernel>();
-    k->configure(compile_context, input, output, ElementWiseUnary::EXP);
-    _kernel = std::move(k);
-}
-Status CLExpLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
-{
-    return CLElementWiseUnaryLayerKernel::validate(input, output, ElementWiseUnary::EXP);
-}
-
-void CLNegLayer::configure(const ICLTensor *input, ICLTensor *output)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output);
-}
-
-void CLNegLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output)
-{
-    auto k = arm_compute::support::cpp14::make_unique<CLElementWiseUnaryLayerKernel>();
-    k->configure(compile_context, input, output, ElementWiseUnary::NEG);
-    _kernel = std::move(k);
-}
-Status CLNegLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
-{
-    return CLElementWiseUnaryLayerKernel::validate(input, output, ElementWiseUnary::NEG);
-}
-
-void CLSinLayer::configure(const ICLTensor *input, ICLTensor *output)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output);
-}
-
-void CLSinLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output)
-{
-    auto k = arm_compute::support::cpp14::make_unique<CLElementWiseUnaryLayerKernel>();
-    k->configure(compile_context, input, output, ElementWiseUnary::SIN);
-    _kernel = std::move(k);
-}
-Status CLSinLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
-{
-    return CLElementWiseUnaryLayerKernel::validate(input, output, ElementWiseUnary::SIN);
-}
-
-void CLAbsLayer::configure(const ICLTensor *input, ICLTensor *output)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output);
-}
-
-void CLAbsLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output)
-{
-    auto k = arm_compute::support::cpp14::make_unique<CLElementWiseUnaryLayerKernel>();
-    k->configure(compile_context, input, output, ElementWiseUnary::ABS);
-    _kernel = std::move(k);
-}
-Status CLAbsLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
-{
-    return CLElementWiseUnaryLayerKernel::validate(input, output, ElementWiseUnary::ABS);
-}
-void CLLogLayer::configure(const ICLTensor *input, ICLTensor *output)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output);
-}
-
-void CLLogLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output)
-{
-    auto k = arm_compute::support::cpp14::make_unique<CLElementWiseUnaryLayerKernel>();
-    k->configure(compile_context, input, output, ElementWiseUnary::LOG);
-    _kernel = std::move(k);
-}
-Status CLLogLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
-{
-    return CLElementWiseUnaryLayerKernel::validate(input, output, ElementWiseUnary::LOG);
-}
-
-void CLRoundLayer::configure(const ICLTensor *input, ICLTensor *output)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output);
-}
-
-void CLRoundLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output)
-{
-    auto k = arm_compute::support::cpp14::make_unique<CLElementWiseUnaryLayerKernel>();
-    k->configure(compile_context, input, output, ElementWiseUnary::ROUND);
-    _kernel = std::move(k);
-}
-Status CLRoundLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
-{
-    return CLElementWiseUnaryLayerKernel::validate(input, output, ElementWiseUnary::ROUND);
-}
-
-} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLElementwiseOperations.cpp b/src/runtime/CL/functions/CLElementwiseOperations.cpp
index 20e9545b61..d9529f0b7f 100644
--- a/src/runtime/CL/functions/CLElementwiseOperations.cpp
+++ b/src/runtime/CL/functions/CLElementwiseOperations.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -23,155 +23,395 @@
 */
 #include "arm_compute/runtime/CL/functions/CLElementwiseOperations.h"
 
+#include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/kernels/CLElementwiseOperationKernel.h"
-#include "support/MemorySupport.h"
+#include "arm_compute/core/Types.h"
 
-#include <utility>
+#include "src/core/CL/ICLKernel.h"
+#include "src/gpu/cl/operators/ClAdd.h"
+#include "src/gpu/cl/operators/ClElementwiseOperations.h"
+#include "src/gpu/cl/operators/ClSub.h"
 
 namespace arm_compute
 {
-namespace
+struct CLArithmeticAddition::Impl
 {
-void configure_border_handler(const CLCompileContext &compile_context, CLFillBorderKernel &border_handler, BorderSize border_size, ICLTensor *input1, ICLTensor *input2, const ICLTensor *output)
-{
-    if(output->info()->dimension(0) > 1)
-    {
-        ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2;
+    const ICLTensor               *src_0{nullptr};
+    const ICLTensor               *src_1{nullptr};
+    ICLTensor                     *dst{nullptr};
+    std::unique_ptr<opencl::ClAdd> op{nullptr};
+};
 
-        if(broadcasted_info->info()->dimension(0) == 1)
-        {
-            border_handler.configure(compile_context, broadcasted_info, border_size, BorderMode::REPLICATE);
-        }
-    }
+CLArithmeticAddition::CLArithmeticAddition() : _impl(std::make_unique<Impl>())
+{
 }
-} // namespace
+CLArithmeticAddition::CLArithmeticAddition(CLArithmeticAddition &&) = default;
+CLArithmeticAddition &CLArithmeticAddition::operator=(CLArithmeticAddition &&) = default;
+CLArithmeticAddition::~CLArithmeticAddition() = default;
 
-void CLArithmeticAddition::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info)
+void CLArithmeticAddition::configure(
+    ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, policy, act_info);
 }
 
-void CLArithmeticAddition::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info)
+void CLArithmeticAddition::configure(const CLCompileContext &compile_context,
+                                     const ICLTensor *input1,
+                                     const ICLTensor *input2,
+                                     ICLTensor *output,
+                                     ConvertPolicy policy,
+                                     const ActivationLayerInfo &act_info)
+{
+    _impl->src_0 = input1;
+    _impl->src_1 = input2;
+    _impl->dst   = output;
+    _impl->op    = std::make_unique<opencl::ClAdd>();
+    _impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), policy, act_info);
+}
+
+Status CLArithmeticAddition::validate(const ITensorInfo *input1,
+                                      const ITensorInfo *input2,
+                                      const ITensorInfo *output,
+                                      ConvertPolicy policy,
+                                      const ActivationLayerInfo &act_info)
 {
-    auto k = arm_compute::support::cpp14::make_unique<CLSaturatedArithmeticOperationKernel>();
-    k->configure(compile_context, ArithmeticOperation::ADD, input1, input2, output, policy, act_info);
-    _kernel = std::move(k);
-    configure_border_handler(compile_context, _border_handler, _kernel->border_size(), input1, input2, output);
+    return opencl::ClAdd::validate(input1, input2, output, policy, act_info);
 }
 
-Status CLArithmeticAddition::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info)
+void CLArithmeticAddition::run()
 {
-    return CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::ADD, input1, input2, output, policy, act_info);
+    ITensorPack pack;
+    pack.add_tensor(TensorType::ACL_SRC_0, _impl->src_0);
+    pack.add_tensor(TensorType::ACL_SRC_1, _impl->src_1);
+    pack.add_tensor(TensorType::ACL_DST, _impl->dst);
+
+    _impl->op->run(pack);
 }
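Note what disappears along with the kernel classes: the anonymous-namespace configure_border_handler() helper that patched up x-dimension broadcasts with a REPLICATE border. The ClAdd/ClSub operator path absorbs that case, so broadcast arithmetic is driven just like the plain elementwise case. A short usage sketch, with shapes and data type as illustrative assumptions:

    // Adding a per-row vector to a 2D tensor via x-dimension broadcast.
    CLTensor a, b, sum;
    a.allocator()->init(TensorInfo(TensorShape(16U, 8U), 1, DataType::F32));
    b.allocator()->init(TensorInfo(TensorShape(1U, 8U), 1, DataType::F32)); // dim 0 == 1 -> broadcast
    sum.allocator()->init(TensorInfo(TensorShape(16U, 8U), 1, DataType::F32));

    CLArithmeticAddition add;
    add.configure(&a, &b, &sum, ConvertPolicy::SATURATE, ActivationLayerInfo());
    a.allocator()->allocate();
    b.allocator()->allocate();
    sum.allocator()->allocate();
    // ... fill a and b, then:
    add.run();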
-void CLArithmeticSubtraction::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info)
+struct CLArithmeticSubtraction::Impl
+{
+    const ICLTensor               *src_0{nullptr};
+    const ICLTensor               *src_1{nullptr};
+    ICLTensor                     *dst{nullptr};
+    std::unique_ptr<opencl::ClSub> op{nullptr};
+};
+
+CLArithmeticSubtraction::CLArithmeticSubtraction() : _impl(std::make_unique<Impl>())
+{
+}
+CLArithmeticSubtraction::CLArithmeticSubtraction(CLArithmeticSubtraction &&) = default;
+CLArithmeticSubtraction &CLArithmeticSubtraction::operator=(CLArithmeticSubtraction &&) = default;
+CLArithmeticSubtraction::~CLArithmeticSubtraction() = default;
+
+void CLArithmeticSubtraction::configure(const ICLTensor *input1,
+                                        const ICLTensor *input2,
+                                        ICLTensor *output,
+                                        ConvertPolicy policy,
+                                        const ActivationLayerInfo &act_info)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, policy, act_info);
 }
 
-void CLArithmeticSubtraction::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info)
+void CLArithmeticSubtraction::configure(const CLCompileContext &compile_context,
+                                        const ICLTensor *input1,
+                                        const ICLTensor *input2,
+                                        ICLTensor *output,
+                                        ConvertPolicy policy,
+                                        const ActivationLayerInfo &act_info)
+{
+    _impl->src_0 = input1;
+    _impl->src_1 = input2;
+    _impl->dst   = output;
+    _impl->op    = std::make_unique<opencl::ClSub>();
+    _impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), policy, act_info);
+}
+
+Status CLArithmeticSubtraction::validate(const ITensorInfo *input1,
+                                         const ITensorInfo *input2,
+                                         const ITensorInfo *output,
+                                         ConvertPolicy policy,
+                                         const ActivationLayerInfo &act_info)
+{
+    return opencl::ClSub::validate(input1, input2, output, policy, act_info);
+}
+
+void CLArithmeticSubtraction::run()
 {
-    auto k = arm_compute::support::cpp14::make_unique<CLSaturatedArithmeticOperationKernel>();
-    k->configure(compile_context, ArithmeticOperation::SUB, input1, input2, output, policy, act_info);
-    _kernel = std::move(k);
-    configure_border_handler(compile_context, _border_handler, _kernel->border_size(), input1, input2, output);
+    ITensorPack pack;
+    pack.add_tensor(TensorType::ACL_SRC_0, _impl->src_0);
+    pack.add_tensor(TensorType::ACL_SRC_1, _impl->src_1);
+    pack.add_tensor(TensorType::ACL_DST, _impl->dst);
+
+    _impl->op->run(pack);
 }
 
-Status CLArithmeticSubtraction::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info)
+struct CLArithmeticDivision::Impl
 {
-    ARM_COMPUTE_UNUSED(policy);
-    return CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::SUB, input1, input2, output, policy, act_info);
+    const ICLTensor                              *src_0{nullptr};
+    const ICLTensor                              *src_1{nullptr};
+    ICLTensor                                    *dst{nullptr};
+    std::unique_ptr<opencl::ClElementwiseDivision> op{nullptr};
+};
+
+CLArithmeticDivision::CLArithmeticDivision() : _impl(std::make_unique<Impl>())
+{
 }
+CLArithmeticDivision::CLArithmeticDivision(CLArithmeticDivision &&) = default;
+CLArithmeticDivision &CLArithmeticDivision::operator=(CLArithmeticDivision &&) = default;
+CLArithmeticDivision::~CLArithmeticDivision() = default;
 
-void CLArithmeticDivision::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info)
+void CLArithmeticDivision::configure(ICLTensor *input1,
+                                     ICLTensor *input2,
+                                     ICLTensor *output,
+                                     const ActivationLayerInfo &act_info)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, act_info);
 }
 
-void CLArithmeticDivision::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info)
+void CLArithmeticDivision::configure(const CLCompileContext &compile_context,
+                                     const ICLTensor *input1,
+                                     const ICLTensor *input2,
+                                     ICLTensor *output,
+                                     const ActivationLayerInfo &act_info)
 {
-    auto k = arm_compute::support::cpp14::make_unique<CLArithmeticOperationKernel>();
-    k->configure(compile_context, ArithmeticOperation::DIV, input1, input2, output, act_info);
-    _kernel = std::move(k);
-    configure_border_handler(compile_context, _border_handler, _kernel->border_size(), input1, input2, output);
+    _impl->src_0 = input1;
+    _impl->src_1 = input2;
+    _impl->dst   = output;
+    _impl->op    = std::make_unique<opencl::ClElementwiseDivision>();
+    _impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), act_info);
 }
 
-Status CLArithmeticDivision::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info)
+Status CLArithmeticDivision::validate(const ITensorInfo *input1,
+                                      const ITensorInfo *input2,
+                                      const ITensorInfo *output,
+                                      const ActivationLayerInfo &act_info)
 {
-    return CLArithmeticOperationKernel::validate(ArithmeticOperation::DIV, input1, input2, output, act_info);
+    return opencl::ClElementwiseDivision::validate(input1, input2, output, act_info);
 }
 
-void CLElementwiseMax::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info)
+void CLArithmeticDivision::run()
+{
+    ITensorPack pack;
+    pack.add_tensor(TensorType::ACL_SRC_0, _impl->src_0);
+    pack.add_tensor(TensorType::ACL_SRC_1, _impl->src_1);
+    pack.add_tensor(TensorType::ACL_DST, _impl->dst);
+
+    _impl->op->run(pack);
+}
+
+struct CLElementwiseMax::Impl
+{
+    const ICLTensor                          *src_0{nullptr};
+    const ICLTensor                          *src_1{nullptr};
+    ICLTensor                                *dst{nullptr};
+    std::unique_ptr<opencl::ClElementwiseMax> op{nullptr};
+};
+
+CLElementwiseMax::CLElementwiseMax() : _impl(std::make_unique<Impl>())
+{
+}
+CLElementwiseMax::CLElementwiseMax(CLElementwiseMax &&) = default;
+CLElementwiseMax &CLElementwiseMax::operator=(CLElementwiseMax &&) = default;
+CLElementwiseMax::~CLElementwiseMax() = default;
+
+void CLElementwiseMax::configure(ICLTensor *input1,
+                                 ICLTensor *input2,
+                                 ICLTensor *output,
+                                 const ActivationLayerInfo &act_info)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, act_info);
 }
 
-void CLElementwiseMax::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info)
+void CLElementwiseMax::configure(const CLCompileContext &compile_context,
+                                 ICLTensor *input1,
+                                 ICLTensor *input2,
+                                 ICLTensor *output,
+                                 const ActivationLayerInfo &act_info)
 {
-    auto k = arm_compute::support::cpp14::make_unique<CLArithmeticOperationKernel>();
-    k->configure(compile_context, ArithmeticOperation::MAX, input1, input2, output, act_info);
-    _kernel = std::move(k);
-    configure_border_handler(compile_context, _border_handler, _kernel->border_size(), input1, input2, output);
+    _impl->src_0 = input1;
+    _impl->src_1 = input2;
+    _impl->dst   = output;
+    _impl->op    = std::make_unique<opencl::ClElementwiseMax>();
+    _impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), act_info);
 }
 
-Status CLElementwiseMax::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info)
+Status CLElementwiseMax::validate(const ITensorInfo *input1,
+                                  const ITensorInfo *input2,
+                                  const ITensorInfo *output,
+                                  const ActivationLayerInfo &act_info)
 {
-    return CLArithmeticOperationKernel::validate(ArithmeticOperation::MAX, input1, input2, output, act_info);
+    return opencl::ClElementwiseMax::validate(input1, input2, output, act_info);
 }
 
-void CLElementwiseMin::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info)
+void CLElementwiseMax::run()
+{
+    ITensorPack
pack; + pack.add_tensor(TensorType::ACL_SRC_0, _impl->src_0); + pack.add_tensor(TensorType::ACL_SRC_1, _impl->src_1); + pack.add_tensor(TensorType::ACL_DST, _impl->dst); + + _impl->op->run(pack); +} + +struct CLElementwiseMin::Impl +{ + const ICLTensor *src_0{nullptr}; + const ICLTensor *src_1{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<opencl::ClElementwiseMin> op{nullptr}; +}; + +CLElementwiseMin::CLElementwiseMin() : _impl(std::make_unique<Impl>()) +{ +} +CLElementwiseMin::CLElementwiseMin(CLElementwiseMin &&) = default; +CLElementwiseMin &CLElementwiseMin::operator=(CLElementwiseMin &&) = default; +CLElementwiseMin::~CLElementwiseMin() = default; + +void CLElementwiseMin::configure(ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output, + const ActivationLayerInfo &act_info) { configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, act_info); } -void CLElementwiseMin::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info) +void CLElementwiseMin::configure(const CLCompileContext &compile_context, + ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output, + const ActivationLayerInfo &act_info) { - auto k = arm_compute::support::cpp14::make_unique<CLArithmeticOperationKernel>(); - k->configure(compile_context, ArithmeticOperation::MIN, input1, input2, output, act_info); - _kernel = std::move(k); - configure_border_handler(compile_context, _border_handler, _kernel->border_size(), input1, input2, output); + _impl->src_0 = input1; + _impl->src_1 = input2; + _impl->dst = output; + _impl->op = std::make_unique<opencl::ClElementwiseMin>(); + _impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), act_info); } -Status CLElementwiseMin::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info) +Status CLElementwiseMin::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const ActivationLayerInfo &act_info) { - return CLArithmeticOperationKernel::validate(ArithmeticOperation::MIN, input1, input2, output, act_info); + return opencl::ClElementwiseMin::validate(input1, input2, output, act_info); } -void CLElementwiseSquaredDiff::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info) +void CLElementwiseMin::run() +{ + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC_0, _impl->src_0); + pack.add_tensor(TensorType::ACL_SRC_1, _impl->src_1); + pack.add_tensor(TensorType::ACL_DST, _impl->dst); + + _impl->op->run(pack); +} + +struct CLElementwiseSquaredDiff::Impl +{ + const ICLTensor *src_0{nullptr}; + const ICLTensor *src_1{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<opencl::ClElementwiseSquaredDiff> op{nullptr}; +}; + +CLElementwiseSquaredDiff::CLElementwiseSquaredDiff() : _impl(std::make_unique<Impl>()) +{ +} +CLElementwiseSquaredDiff::CLElementwiseSquaredDiff(CLElementwiseSquaredDiff &&) = default; +CLElementwiseSquaredDiff &CLElementwiseSquaredDiff::operator=(CLElementwiseSquaredDiff &&) = default; +CLElementwiseSquaredDiff::~CLElementwiseSquaredDiff() = default; + +void CLElementwiseSquaredDiff::configure(ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output, + const ActivationLayerInfo &act_info) { configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, act_info); } -void CLElementwiseSquaredDiff::configure(const CLCompileContext 
&compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info) +void CLElementwiseSquaredDiff::configure(const CLCompileContext &compile_context, + ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output, + const ActivationLayerInfo &act_info) +{ + _impl->src_0 = input1; + _impl->src_1 = input2; + _impl->dst = output; + _impl->op = std::make_unique<opencl::ClElementwiseSquaredDiff>(); + _impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), act_info); +} + +Status CLElementwiseSquaredDiff::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const ActivationLayerInfo &act_info) +{ + return opencl::ClElementwiseSquaredDiff::validate(input1, input2, output, act_info); +} + +void CLElementwiseSquaredDiff::run() { - auto k = arm_compute::support::cpp14::make_unique<CLArithmeticOperationKernel>(); - k->configure(compile_context, ArithmeticOperation::SQUARED_DIFF, input1, input2, output, act_info); - _kernel = std::move(k); - configure_border_handler(compile_context, _border_handler, _kernel->border_size(), input1, input2, output); + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC_0, _impl->src_0); + pack.add_tensor(TensorType::ACL_SRC_1, _impl->src_1); + pack.add_tensor(TensorType::ACL_DST, _impl->dst); + + _impl->op->run(pack); } -Status CLElementwiseSquaredDiff::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info) +struct CLElementwisePower::Impl +{ + const ICLTensor *src_0{nullptr}; + const ICLTensor *src_1{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<opencl::ClElementwisePower> op{nullptr}; +}; + +CLElementwisePower::CLElementwisePower() : _impl(std::make_unique<Impl>()) { - return CLArithmeticOperationKernel::validate(ArithmeticOperation::SQUARED_DIFF, input1, input2, output, act_info); } +CLElementwisePower::CLElementwisePower(CLElementwisePower &&) = default; +CLElementwisePower &CLElementwisePower::operator=(CLElementwisePower &&) = default; +CLElementwisePower::~CLElementwisePower() = default; -void CLElementwisePower::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info) +void CLElementwisePower::configure(ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output, + const ActivationLayerInfo &act_info) { configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, act_info); } -void CLElementwisePower::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info) +void CLElementwisePower::configure(const CLCompileContext &compile_context, + ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output, + const ActivationLayerInfo &act_info) { - auto k = arm_compute::support::cpp14::make_unique<CLArithmeticOperationKernel>(); - k->configure(compile_context, ArithmeticOperation::POWER, input1, input2, output, act_info); - _kernel = std::move(k); - configure_border_handler(compile_context, _border_handler, _kernel->border_size(), input1, input2, output); + _impl->src_0 = input1; + _impl->src_1 = input2; + _impl->dst = output; + _impl->op = std::make_unique<opencl::ClElementwisePower>(); + _impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), act_info); } -Status CLElementwisePower::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const 
ActivationLayerInfo &act_info) +Status CLElementwisePower::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const ActivationLayerInfo &act_info) { - return CLArithmeticOperationKernel::validate(ArithmeticOperation::POWER, input1, input2, output, act_info); + return opencl::ClElementwisePower::validate(input1, input2, output, act_info); } +void CLElementwisePower::run() +{ + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC_0, _impl->src_0); + pack.add_tensor(TensorType::ACL_SRC_1, _impl->src_1); + pack.add_tensor(TensorType::ACL_DST, _impl->dst); + + _impl->op->run(pack); +} } // namespace arm_compute diff --git a/src/runtime/CL/functions/CLElementwiseUnaryLayer.cpp b/src/runtime/CL/functions/CLElementwiseUnaryLayer.cpp new file mode 100644 index 0000000000..3043c26feb --- /dev/null +++ b/src/runtime/CL/functions/CLElementwiseUnaryLayer.cpp @@ -0,0 +1,315 @@ +/* + * Copyright (c) 2018-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
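Note on the pattern introduced above: the function-level API of CLArithmeticAddition and its siblings is unchanged; each class now merely owns an opencl operator behind a pimpl and forwards its tensors through an ITensorPack at run() time. A minimal caller-side sketch (assuming an initialised CLScheduler; shapes and data type are illustrative only):

    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLElementwiseOperations.h"

    using namespace arm_compute;

    int main()
    {
        CLScheduler::get().default_init();

        CLTensor a, b, sum;
        const TensorInfo info(TensorShape(16U, 16U), 1, DataType::F32);
        a.allocator()->init(info);
        b.allocator()->init(info);
        sum.allocator()->init(info);

        // configure() records the tensors and builds the opencl::ClAdd operator.
        CLArithmeticAddition add;
        add.configure(&a, &b, &sum, ConvertPolicy::SATURATE);

        a.allocator()->allocate();
        b.allocator()->allocate();
        sum.allocator()->allocate();

        // run() packs ACL_SRC_0/ACL_SRC_1/ACL_DST into an ITensorPack and
        // dispatches the operator; no border handler is involved any more.
        add.run();
        return 0;
    }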
diff --git a/src/runtime/CL/functions/CLElementwiseUnaryLayer.cpp b/src/runtime/CL/functions/CLElementwiseUnaryLayer.cpp
new file mode 100644
index 0000000000..3043c26feb
--- /dev/null
+++ b/src/runtime/CL/functions/CLElementwiseUnaryLayer.cpp
@@ -0,0 +1,315 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLElementwiseUnaryLayer.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+#include "src/core/CL/ICLKernel.h"
+#include "src/gpu/cl/operators/ClElementwiseUnary.h"
+
+namespace arm_compute
+{
+struct CLRsqrtLayer::Impl
+{
+    const ICLTensor                 *src{nullptr};
+    ICLTensor                       *dst{nullptr};
+    std::unique_ptr<opencl::ClRsqrt> op{nullptr};
+};
+
+CLRsqrtLayer::CLRsqrtLayer() : _impl(std::make_unique<Impl>())
+{
+}
+
+CLRsqrtLayer::CLRsqrtLayer(CLRsqrtLayer &&) = default;
+CLRsqrtLayer &CLRsqrtLayer::operator=(CLRsqrtLayer &&) = default;
+CLRsqrtLayer::~CLRsqrtLayer() = default;
+
+void CLRsqrtLayer::configure(const ICLTensor *input, ICLTensor *output)
+{
+    configure(CLKernelLibrary::get().get_compile_context(), input, output);
+}
+
+void CLRsqrtLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output)
+{
+    _impl->src = input;
+    _impl->dst = output;
+    _impl->op  = std::make_unique<opencl::ClRsqrt>();
+    _impl->op->configure(compile_context, input->info(), output->info());
+}
+
+Status CLRsqrtLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+    return opencl::ClRsqrt::validate(input, output);
+}
+
+void CLRsqrtLayer::run()
+{
+    ITensorPack pack;
+    pack.add_tensor(TensorType::ACL_SRC, _impl->src);
+    pack.add_tensor(TensorType::ACL_DST, _impl->dst);
+    _impl->op->run(pack);
+}
+
+struct CLExpLayer::Impl
+{
+    const ICLTensor               *src{nullptr};
+    ICLTensor                     *dst{nullptr};
+    std::unique_ptr<opencl::ClExp> op{nullptr};
+};
+
+CLExpLayer::CLExpLayer() : _impl(std::make_unique<Impl>())
+{
+}
+
+CLExpLayer::CLExpLayer(CLExpLayer &&) = default;
+CLExpLayer &CLExpLayer::operator=(CLExpLayer &&) = default;
+CLExpLayer::~CLExpLayer() = default;
+
+void CLExpLayer::configure(const ICLTensor *input, ICLTensor *output)
+{
+    configure(CLKernelLibrary::get().get_compile_context(), input, output);
+}
+
+void CLExpLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output)
+{
+    _impl->src = input;
+    _impl->dst = output;
+    _impl->op  = std::make_unique<opencl::ClExp>();
+    _impl->op->configure(compile_context, input->info(), output->info());
+}
+
+Status CLExpLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+    return opencl::ClExp::validate(input, output);
+}
+
+void CLExpLayer::run()
+{
+    ITensorPack pack;
+    pack.add_tensor(TensorType::ACL_SRC, _impl->src);
+    pack.add_tensor(TensorType::ACL_DST, _impl->dst);
+    _impl->op->run(pack);
+}
+
+struct CLNegLayer::Impl
+{
+    const ICLTensor               *src{nullptr};
+    ICLTensor                     *dst{nullptr};
+    std::unique_ptr<opencl::ClNeg> op{nullptr};
+};
+
+CLNegLayer::CLNegLayer() : _impl(std::make_unique<Impl>())
+{
+}
+
+CLNegLayer::CLNegLayer(CLNegLayer &&) = default;
+CLNegLayer &CLNegLayer::operator=(CLNegLayer &&) = default;
+CLNegLayer::~CLNegLayer() = default;
+
+void CLNegLayer::configure(const ICLTensor *input, ICLTensor *output)
+{
+    configure(CLKernelLibrary::get().get_compile_context(), input, output);
+}
+
+void CLNegLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output)
+{
+    _impl->src = input;
+    _impl->dst = output;
+    _impl->op  = std::make_unique<opencl::ClNeg>();
+    _impl->op->configure(compile_context, input->info(), output->info());
+}
+Status CLNegLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+    return opencl::ClNeg::validate(input, output);
+}
+
+void CLNegLayer::run()
+{
+    ITensorPack pack;
+    pack.add_tensor(TensorType::ACL_SRC, _impl->src);
+    pack.add_tensor(TensorType::ACL_DST, _impl->dst);
+    _impl->op->run(pack);
+}
+
+struct CLSinLayer::Impl
+{
+    const ICLTensor               *src{nullptr};
+    ICLTensor                     *dst{nullptr};
+    std::unique_ptr<opencl::ClSin> op{nullptr};
+};
+
+CLSinLayer::CLSinLayer() : _impl(std::make_unique<Impl>())
+{
+}
+
+CLSinLayer::CLSinLayer(CLSinLayer &&) = default;
+CLSinLayer &CLSinLayer::operator=(CLSinLayer &&) = default;
+CLSinLayer::~CLSinLayer() = default;
+
+void CLSinLayer::configure(const ICLTensor *input, ICLTensor *output)
+{
+    configure(CLKernelLibrary::get().get_compile_context(), input, output);
+}
+
+void CLSinLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output)
+{
+    _impl->src = input;
+    _impl->dst = output;
+    _impl->op  = std::make_unique<opencl::ClSin>();
+    _impl->op->configure(compile_context, input->info(), output->info());
+}
+Status CLSinLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+    return opencl::ClSin::validate(input, output);
+}
+
+void CLSinLayer::run()
+{
+    ITensorPack pack;
+    pack.add_tensor(TensorType::ACL_SRC, _impl->src);
+    pack.add_tensor(TensorType::ACL_DST, _impl->dst);
+    _impl->op->run(pack);
+}
+
+struct CLAbsLayer::Impl
+{
+    const ICLTensor               *src{nullptr};
+    ICLTensor                     *dst{nullptr};
+    std::unique_ptr<opencl::ClAbs> op{nullptr};
+};
+
+CLAbsLayer::CLAbsLayer() : _impl(std::make_unique<Impl>())
+{
+}
+
+CLAbsLayer::CLAbsLayer(CLAbsLayer &&) = default;
+CLAbsLayer &CLAbsLayer::operator=(CLAbsLayer &&) = default;
+CLAbsLayer::~CLAbsLayer() = default;
+
+void CLAbsLayer::configure(const ICLTensor *input, ICLTensor *output)
+{
+    configure(CLKernelLibrary::get().get_compile_context(), input, output);
+}
+
+void CLAbsLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output)
+{
+    _impl->src = input;
+    _impl->dst = output;
+    _impl->op  = std::make_unique<opencl::ClAbs>();
+    _impl->op->configure(compile_context, input->info(), output->info());
+}
+Status CLAbsLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+    return opencl::ClAbs::validate(input, output);
+}
+
+void CLAbsLayer::run()
+{
+    ITensorPack pack;
+    pack.add_tensor(TensorType::ACL_SRC, _impl->src);
+    pack.add_tensor(TensorType::ACL_DST, _impl->dst);
+    _impl->op->run(pack);
+}
+
+struct CLLogLayer::Impl
+{
+    const ICLTensor               *src{nullptr};
+    ICLTensor                     *dst{nullptr};
+    std::unique_ptr<opencl::ClLog> op{nullptr};
+};
+
+CLLogLayer::CLLogLayer() : _impl(std::make_unique<Impl>())
+{
+}
+
+CLLogLayer::CLLogLayer(CLLogLayer &&) = default;
+CLLogLayer &CLLogLayer::operator=(CLLogLayer &&) = default;
+CLLogLayer::~CLLogLayer() = default;
+
+void CLLogLayer::configure(const ICLTensor *input, ICLTensor *output)
+{
+    configure(CLKernelLibrary::get().get_compile_context(), input, output);
+}
+
+void CLLogLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output)
+{
+    _impl->src = input;
+    _impl->dst = output;
+    _impl->op  = std::make_unique<opencl::ClLog>();
+    _impl->op->configure(compile_context, input->info(), output->info());
+}
+Status CLLogLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+    return opencl::ClLog::validate(input, output);
+}
+
+void CLLogLayer::run()
+{
+    ITensorPack pack;
+    pack.add_tensor(TensorType::ACL_SRC, _impl->src);
+    pack.add_tensor(TensorType::ACL_DST, _impl->dst);
+    _impl->op->run(pack);
+}
+
+struct CLRoundLayer::Impl
+{
+    const ICLTensor                 *src{nullptr};
+    ICLTensor                       *dst{nullptr};
+    std::unique_ptr<opencl::ClRound> op{nullptr};
+};
+
+CLRoundLayer::CLRoundLayer() : _impl(std::make_unique<Impl>())
+{
+}
+
+CLRoundLayer::CLRoundLayer(CLRoundLayer &&) = default;
+CLRoundLayer &CLRoundLayer::operator=(CLRoundLayer &&) = default;
+CLRoundLayer::~CLRoundLayer() = default;
+
+void CLRoundLayer::configure(const ICLTensor *input, ICLTensor *output)
+{
+    configure(CLKernelLibrary::get().get_compile_context(), input, output);
+}
+
+void CLRoundLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output)
+{
+    _impl->src = input;
+    _impl->dst = output;
+    _impl->op  = std::make_unique<opencl::ClRound>();
+    _impl->op->configure(compile_context, input->info(), output->info());
+}
+Status CLRoundLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+    return opencl::ClRound::validate(input, output);
+}
+
+void CLRoundLayer::run()
+{
+    ITensorPack pack;
+    pack.add_tensor(TensorType::ACL_SRC, _impl->src);
+    pack.add_tensor(TensorType::ACL_DST, _impl->dst);
+    _impl->op->run(pack);
+}
+} // namespace arm_compute
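Every unary layer in this new file repeats the same src/dst/op pimpl shape; only the operator type differs (ClRsqrt, ClExp, ClNeg, ClSin, ClAbs, ClLog, ClRound). A hypothetical factoring of the repetition, purely to illustrate the shared structure (this helper is not part of the patch):

    // Hypothetical sketch: what each CL*Layer::Impl and run() boil down to.
    template <typename OpType>
    struct UnaryLayerImpl
    {
        const ICLTensor        *src{nullptr};
        ICLTensor              *dst{nullptr};
        std::unique_ptr<OpType> op{nullptr};

        void run()
        {
            // Identical two-tensor pack in all seven layers above.
            ITensorPack pack;
            pack.add_tensor(TensorType::ACL_SRC, src);
            pack.add_tensor(TensorType::ACL_DST, dst);
            op->run(pack);
        }
    };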
diff --git a/src/runtime/CL/functions/CLEqualizeHistogram.cpp b/src/runtime/CL/functions/CLEqualizeHistogram.cpp
deleted file mode 100644
index e1bd7e6f2a..0000000000
--- a/src/runtime/CL/functions/CLEqualizeHistogram.cpp
+++ /dev/null
@@ -1,115 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLEqualizeHistogram.h"
-
-#include "arm_compute/core/CL/ICLDistribution1D.h"
-#include "arm_compute/core/CL/ICLLut.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-
-#include <algorithm>
-#include <cmath>
-#include <cstddef>
-#include <numeric>
-
-using namespace arm_compute;
-
-namespace
-{
-void calculate_cum_dist_and_lut(CLDistribution1D &dist, CLDistribution1D &cum_dist, CLLut &lut)
-{
-    dist.map(true);
-    cum_dist.map(true);
-    lut.map(true);
-
-    const uint32_t *dist_ptr     = dist.buffer();
-    uint32_t       *cum_dist_ptr = cum_dist.buffer();
-    uint8_t        *lut_ptr      = lut.buffer();
-
-    ARM_COMPUTE_ERROR_ON(dist_ptr == nullptr);
-    ARM_COMPUTE_ERROR_ON(cum_dist_ptr == nullptr);
-    ARM_COMPUTE_ERROR_ON(lut_ptr == nullptr);
-
-    // Calculate cumulative distribution
-    std::partial_sum(dist_ptr, dist_ptr + 256, cum_dist_ptr);
-
-    // Get the number of pixels that have the lowest value in the input image
-    const uint32_t num_lowest_pixels = *std::find_if(dist_ptr, dist_ptr + 256, [](const uint32_t &v)
-    {
-        return v > 0;
-    });
-    const size_t image_size = cum_dist_ptr[255];
-
-    if(image_size == num_lowest_pixels)
-    {
-        std::iota(lut_ptr, lut_ptr + 256, 0);
-    }
-    else
-    {
-        const float diff = image_size - num_lowest_pixels;
-
-        for(size_t i = 0; i < 256; ++i)
-        {
-            lut_ptr[i] = lround((cum_dist_ptr[i] - num_lowest_pixels) / diff * 255.f);
-        }
-    }
-
-    dist.unmap();
-    cum_dist.unmap();
-    lut.unmap();
-}
-} // namespace
-
-CLEqualizeHistogram::CLEqualizeHistogram()
-    : _histogram_kernel(), _border_histogram_kernel(), _map_histogram_kernel(), _hist(nr_bins, 0, max_range), _cum_dist(nr_bins, 0, max_range), _cd_lut(nr_bins, DataType::U8)
-{
-}
-
-void CLEqualizeHistogram::configure(const ICLImage *input, ICLImage *output)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output);
-}
-
-void CLEqualizeHistogram::configure(const CLCompileContext &compile_context, const ICLImage *input, ICLImage *output)
-{
-    _histogram_kernel.configure(compile_context, input, &_hist);
-    _border_histogram_kernel.configure(compile_context, input, &_hist);
-    _map_histogram_kernel.configure(compile_context, input, &_cd_lut, output);
-}
-
-void CLEqualizeHistogram::run()
-{
-    // Calculate histogram of input.
-    CLScheduler::get().enqueue(_histogram_kernel, false);
-
-    // Calculate remaining pixels when image is not multiple of the elements of histogram kernel
-    CLScheduler::get().enqueue(_border_histogram_kernel, false);
-
-    // Calculate cumulative distribution of histogram and create LUT.
-    calculate_cum_dist_and_lut(_hist, _cum_dist, _cd_lut);
-
-    // Map input to output using created LUT.
-    CLScheduler::get().enqueue(_map_histogram_kernel);
-}
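For reference, the deleted calculate_cum_dist_and_lut() implemented the standard histogram-equalisation mapping: with $\mathrm{cdf}$ the cumulative histogram, $\mathrm{cdf}_{\min}$ the count of the lowest-occurring intensity (the first non-zero bin) and $N$ the number of pixels, it computed $\mathrm{lut}(i) = \mathrm{round}\!\left(255 \cdot \frac{\mathrm{cdf}(i) - \mathrm{cdf}_{\min}}{N - \mathrm{cdf}_{\min}}\right)$, falling back to the identity LUT (std::iota) when every pixel shares a single value and the denominator would be zero.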
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLErode.h"
-
-#include "arm_compute/core/CL/kernels/CLErodeKernel.h"
-#include "arm_compute/core/PixelValue.h"
-#include "support/MemorySupport.h"
-
-#include <utility>
-
-using namespace arm_compute;
-
-void CLErode::configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output, border_mode, constant_border_value);
-}
-
-void CLErode::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value)
-{
-    auto k = arm_compute::support::cpp14::make_unique<CLErodeKernel>();
-    k->configure(compile_context, input, output, border_mode == BorderMode::UNDEFINED);
-    _kernel = std::move(k);
-    _border_handler.configure(compile_context, input, BorderSize(1), border_mode, PixelValue(constant_border_value));
-}
diff --git a/src/runtime/CL/functions/CLFFT1D.cpp b/src/runtime/CL/functions/CLFFT1D.cpp
index c3922f5e66..48e9ae824a 100644
--- a/src/runtime/CL/functions/CLFFT1D.cpp
+++ b/src/runtime/CL/functions/CLFFT1D.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,25 +25,43 @@
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/helpers/fft.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
+#include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLFFTDigitReverseKernel.h"
+#include "src/core/CL/kernels/CLFFTRadixStageKernel.h"
+#include "src/core/CL/kernels/CLFFTScaleKernel.h"
+#include "src/core/utils/helpers/fft.h"
+
 namespace arm_compute
 {
 CLFFT1D::CLFFT1D(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _digit_reverse_kernel(), _fft_kernels(), _scale_kernel(), _digit_reversed_input(), _digit_reverse_indices(), _num_ffts(0), _run_scale(false)
+    : _memory_group(std::move(memory_manager)),
+      _digit_reverse_kernel(std::make_unique<CLFFTDigitReverseKernel>()),
+      _fft_kernels(),
+      _scale_kernel(std::make_unique<CLFFTScaleKernel>()),
+      _digit_reversed_input(),
+      _digit_reverse_indices(),
+      _num_ffts(0),
+      _run_scale(false)
 {
 }
+CLFFT1D::~CLFFT1D() = default;
+
 void CLFFT1D::configure(const ICLTensor *input, ICLTensor *output, const FFT1DInfo &config)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input, output, config);
 }
-void CLFFT1D::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const FFT1DInfo &config)
+void CLFFT1D::configure(const CLCompileContext &compile_context,
+                        const ICLTensor *input,
+                        ICLTensor *output,
+                        const FFT1DInfo &config)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_ERROR_THROW_ON(CLFFT1D::validate(input->info(), output->info(), config));
+    ARM_COMPUTE_LOG_PARAMS(input, output, config);
     // Decompose size to radix factors
     const auto supported_radix = CLFFTRadixStageKernel::supported_radix();
@@ -62,13 +80,14 @@ void CLFFT1D::configure(const CLCompileContext &compile_context, const ICLTensor
     TensorInfo digit_reverse_indices_info(TensorShape(input->info()->tensor_shape()[config.axis]), 1, DataType::U32);
     _digit_reverse_indices.allocator()->init(digit_reverse_indices_info);
     _memory_group.manage(&_digit_reversed_input);
-    _digit_reverse_kernel.configure(compile_context, input, &_digit_reversed_input, &_digit_reverse_indices, digit_reverse_config);
+    _digit_reverse_kernel->configure(compile_context, input, &_digit_reversed_input, &_digit_reverse_indices,
+                                     digit_reverse_config);
     // Create and configure FFT kernels
     unsigned int Nx = 1;
     _num_ffts       = decomposed_vector.size();
-    _fft_kernels.resize(_num_ffts);
-    for(unsigned int i = 0; i < _num_ffts; ++i)
+    _fft_kernels.reserve(_num_ffts);
+    for (unsigned int i = 0; i < _num_ffts; ++i)
     {
         const unsigned int radix_for_stage = decomposed_vector.at(i);
@@ -77,18 +96,21 @@ void CLFFT1D::configure(const CLCompileContext &compile_context, const ICLTensor
         fft_kernel_info.radix          = radix_for_stage;
         fft_kernel_info.Nx             = Nx;
         fft_kernel_info.is_first_stage = (i == 0);
-        _fft_kernels[i].configure(compile_context, &_digit_reversed_input, ((i == (_num_ffts - 1)) && !is_c2r) ? output : nullptr, fft_kernel_info);
+        _fft_kernels.emplace_back(std::make_unique<CLFFTRadixStageKernel>());
+        _fft_kernels.back()->configure(compile_context, &_digit_reversed_input,
+                                       ((i == (_num_ffts - 1)) && !is_c2r) ? output : nullptr, fft_kernel_info);
         Nx *= radix_for_stage;
     }
     // Configure scale kernel
-    if(_run_scale)
+    if (_run_scale)
     {
         FFTScaleKernelInfo scale_config;
         scale_config.scale     = static_cast<float>(N);
         scale_config.conjugate = config.direction == FFTDirection::Inverse;
-        is_c2r ? _scale_kernel.configure(compile_context, &_digit_reversed_input, output, scale_config) : _scale_kernel.configure(output, nullptr, scale_config);
+        is_c2r ? _scale_kernel->configure(compile_context, &_digit_reversed_input, output, scale_config)
+               : _scale_kernel->configure(output, nullptr, scale_config);
     }
     // Allocate tensors
@@ -105,9 +127,9 @@ void CLFFT1D::configure(const CLCompileContext &compile_context, const ICLTensor
 Status CLFFT1D::validate(const ITensorInfo *input, const ITensorInfo *output, const FFT1DInfo &config)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() != DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON(input->num_channels() != 1 && input->num_channels() != 2);
-    ARM_COMPUTE_RETURN_ERROR_ON(std::set<unsigned int>({ 0, 1 }).count(config.axis) == 0);
+    ARM_COMPUTE_RETURN_ERROR_ON(std::set<unsigned int>({0, 1}).count(config.axis) == 0);
     // Check if FFT is decomposable
     const auto supported_radix = CLFFTRadixStageKernel::supported_radix();
@@ -116,7 +138,7 @@ Status CLFFT1D::validate(const ITensorInfo *input, const ITensorInfo *output, co
     ARM_COMPUTE_RETURN_ERROR_ON(decomposed_vector.empty());
     // Checks performed when output is configured
-    if((output != nullptr) && (output->total_size() != 0))
+    if ((output != nullptr) && (output->total_size() != 0))
     {
         ARM_COMPUTE_RETURN_ERROR_ON(output->num_channels() == 1 && input->num_channels() == 1);
         ARM_COMPUTE_RETURN_ERROR_ON(output->num_channels() != 1 && output->num_channels() != 2);
@@ -132,18 +154,18 @@ void CLFFT1D::run()
     MemoryGroupResourceScope scope_mg(_memory_group);
     // Run digit reverse
-    CLScheduler::get().enqueue(_digit_reverse_kernel, false);
+    CLScheduler::get().enqueue(*_digit_reverse_kernel, false);
     // Run radix kernels
-    for(unsigned int i = 0; i < _num_ffts; ++i)
+    for (unsigned int i = 0; i < _num_ffts; ++i)
     {
-        CLScheduler::get().enqueue(_fft_kernels[i], i == (_num_ffts - 1) && !_run_scale);
+        CLScheduler::get().enqueue(*_fft_kernels[i], i == (_num_ffts - 1) && !_run_scale);
     }
     // Run output scaling
-    if(_run_scale)
+    if (_run_scale)
    {
-        CLScheduler::get().enqueue(_scale_kernel, true);
+        CLScheduler::get().enqueue(*_scale_kernel, true);
     }
 }
 } // namespace arm_compute
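The kernel chain built above follows directly from the radix decomposition: helpers::fft::decompose_stages() factorises the FFT length into the radices the kernels support, one CLFFTRadixStageKernel is configured per factor, and Nx accumulates the product of the radices already processed. A sketch of the idea (decompose_stages lives in an internal header; the factorisation {3, 5, 8} for N = 120 is only an assumed example, the concrete result is an implementation detail):

    // N = 120; supported radices are typically a small set such as {2, 3, 4, 5, 7, 8}.
    const auto radices = CLFFTRadixStageKernel::supported_radix();
    const auto stages  = arm_compute::helpers::fft::decompose_stages(120, radices);
    // e.g. stages == {3, 5, 8}: three radix stages, with Nx = 1, then 3, then 15.

    unsigned int Nx = 1;
    for (unsigned int radix : stages)
    {
        // One CLFFTRadixStageKernel per factor; is_first_stage == (Nx == 1).
        Nx *= radix;
    }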
diff --git a/src/runtime/CL/functions/CLFFT2D.cpp b/src/runtime/CL/functions/CLFFT2D.cpp
index 2482ea901a..3857046719 100644
--- a/src/runtime/CL/functions/CLFFT2D.cpp
+++ b/src/runtime/CL/functions/CLFFT2D.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -27,22 +27,36 @@
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
+#include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLFFTDigitReverseKernel.h"
+#include "src/core/CL/kernels/CLFFTRadixStageKernel.h"
+#include "src/core/CL/kernels/CLFFTScaleKernel.h"
+
 namespace arm_compute
 {
 CLFFT2D::CLFFT2D(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(memory_manager), _first_pass_func(memory_manager), _second_pass_func(memory_manager), _first_pass_tensor()
+    : _memory_group(memory_manager),
+      _first_pass_func(memory_manager),
+      _second_pass_func(memory_manager),
+      _first_pass_tensor()
 {
 }
+CLFFT2D::~CLFFT2D() = default;
+
 void CLFFT2D::configure(const ICLTensor *input, ICLTensor *output, const FFT2DInfo &config)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input, output, config);
 }
-void CLFFT2D::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const FFT2DInfo &config)
+void CLFFT2D::configure(const CLCompileContext &compile_context,
+                        const ICLTensor *input,
+                        ICLTensor *output,
+                        const FFT2DInfo &config)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_ERROR_THROW_ON(CLFFT2D::validate(input->info(), output->info(), config));
+    ARM_COMPUTE_LOG_PARAMS(input, output, config);
     // Setup first pass
     FFT1DInfo first_pass_config;
@@ -62,6 +76,7 @@ void CLFFT2D::configure(const CLCompileContext &compile_context, const ICLTensor
 Status CLFFT2D::validate(const ITensorInfo *input, const ITensorInfo *output, const FFT2DInfo &config)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F16, DataType::F32);
     // Create intermediate tensor info
     TensorInfo first_pass_tensor(input->clone()->set_is_resizable(true).reset_padding().set_num_channels(2));
@@ -79,7 +94,7 @@ Status CLFFT2D::validate(const ITensorInfo *input, const ITensorInfo *output, co
     ARM_COMPUTE_RETURN_ON_ERROR(CLFFT1D::validate(&first_pass_tensor, output, second_pass_config));
     // Checks performed when output is configured
-    if((output != nullptr) && (output->total_size() != 0))
+    if ((output != nullptr) && (output->total_size() != 0))
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
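CLFFT2D stays a composition of two CLFFT1D passes through the 2-channel intermediate tensor: the first pass runs along config.axis and the second pass presumably along the remaining axis, both inheriting config.direction. A sketch of the two FFT1DInfo setups (the second-axis handling is assumed from the structure above, not spelled out in this hunk):

    FFT1DInfo first_pass_config;
    first_pass_config.axis      = config.axis;                 // e.g. 0
    first_pass_config.direction = config.direction;

    FFT1DInfo second_pass_config;
    second_pass_config.axis      = (config.axis == 0) ? 1 : 0; // the other axis
    second_pass_config.direction = config.direction;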
diff --git a/src/runtime/CL/functions/CLFFTConvolutionLayer.cpp b/src/runtime/CL/functions/CLFFTConvolutionLayer.cpp
index ff439cca8d..2a73517549 100644
--- a/src/runtime/CL/functions/CLFFTConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLFFTConvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,12 +25,21 @@
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/helpers/fft.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
 #include "arm_compute/runtime/CPP/CPPScheduler.h"
+#include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLFFTDigitReverseKernel.h"
+#include "src/core/CL/kernels/CLFFTRadixStageKernel.h"
+#include "src/core/CL/kernels/CLFFTScaleKernel.h"
+#include "src/core/CL/kernels/CLFillBorderKernel.h"
+#include "src/core/CL/kernels/CLPadLayerKernel.h"
+#include "src/core/CL/kernels/CLReductionOperationKernel.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/utils/helpers/fft.h"
+
 namespace arm_compute
 {
 namespace
@@ -41,11 +50,11 @@ int pad_decomposable(int N)
     int  pad           = 0;
     bool is_decomposed = false;
-    while(!is_decomposed)
+    while (!is_decomposed)
     {
         const auto decomposed_vector = arm_compute::helpers::fft::decompose_stages(N++, supported_radix);
         is_decomposed                = !decomposed_vector.empty();
-        if(!is_decomposed)
+        if (!is_decomposed)
         {
             ++pad;
         }
@@ -95,15 +104,33 @@ CLFFTConvolutionLayer::CLFFTConvolutionLayer(std::shared_ptr<IMemoryManager> mem
 {
 }
-void CLFFTConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
-                                      const ActivationLayerInfo &act_info)
+void CLFFTConvolutionLayer::configure(ICLTensor *input,
+                                      const ICLTensor *weights,
+                                      const ICLTensor *biases,
+                                      ICLTensor *output,
+                                      const PadStrideInfo &conv_info,
+                                      const ActivationLayerInfo &act_info,
+                                      bool enable_fast_math)
 {
-    configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, act_info);
+    configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, act_info,
+              enable_fast_math);
 }
-void CLFFTConvolutionLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
-                                      const ActivationLayerInfo &act_info)
+void CLFFTConvolutionLayer::configure(const CLCompileContext &compile_context,
+                                      ICLTensor *input,
+                                      const ICLTensor *weights,
+                                      const ICLTensor *biases,
+                                      ICLTensor *output,
+                                      const PadStrideInfo &conv_info,
+                                      const ActivationLayerInfo &act_info,
+                                      bool enable_fast_math)
 {
+    ARM_COMPUTE_UNUSED(enable_fast_math);
+    ARM_COMPUTE_ERROR_THROW_ON(CLFFTConvolutionLayer::validate(input->info(), weights->info(),
+                                                               biases != nullptr ? biases->info() : nullptr,
+                                                               output->info(), conv_info, act_info, enable_fast_math));
+    ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, act_info, enable_fast_math);
+
     _original_weights = weights;
     _original_bias    = biases;
@@ -111,21 +138,24 @@ void CLFFTConvolutionLayer::configure(const CLCompileContext &compile_context, I
     _has_bias = biases != nullptr;
     // Get indices for the width and height
-    const size_t idx_width  = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH);
-    const size_t idx_height = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
+    const size_t idx_width = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH);
+    const size_t idx_height =
+        get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
     // Input shape, kernel size and output tile
-    const Size2D input_dims  = Size2D(input->info()->tensor_shape()[idx_width], input->info()->tensor_shape()[idx_height]);
-    const Size2D kernel_size = Size2D(weights->info()->tensor_shape()[idx_width], weights->info()->tensor_shape()[idx_height]);
-    const Size2D pad_valid   = Size2D(pad_decomposable(input_dims.x() + kernel_size.x() - 1),
-                                      pad_decomposable(input_dims.y() + kernel_size.y() - 1));
+    const Size2D input_dims =
+        Size2D(input->info()->tensor_shape()[idx_width], input->info()->tensor_shape()[idx_height]);
+    const Size2D kernel_size =
+        Size2D(weights->info()->tensor_shape()[idx_width], weights->info()->tensor_shape()[idx_height]);
+    const Size2D pad_valid = Size2D(pad_decomposable(input_dims.x() + kernel_size.x() - 1),
+                                    pad_decomposable(input_dims.y() + kernel_size.y() - 1));
     // Tensors to use
     ICLTensor       *input_to_use   = input;
     const ICLTensor *weights_to_use = weights;
     ICLTensor       *output_to_use  = _has_bias ? &_bias_output : output;
     // Permute bias
-    if(biases != nullptr)
+    if (biases != nullptr)
     {
         _permute_bias_func.configure(compile_context, biases, &_permuted_bias, PermutationVector(1U, 2U, 0U));
         _permuted_bias.info()->set_data_layout(DataLayout::NCHW);
@@ -133,7 +163,7 @@ void CLFFTConvolutionLayer::configure(const CLCompileContext &compile_context, I
     // Permute input if needed
     _needs_permute = input->info()->data_layout() == DataLayout::NHWC;
-    if(_needs_permute)
+    if (_needs_permute)
     {
         _memory_group.manage(&_permuted_input);
         // Configure the function to transform the input tensor from NHWC -> NCHW
@@ -151,21 +181,22 @@ void CLFFTConvolutionLayer::configure(const CLCompileContext &compile_context, I
     // Flip weights
     _flipped_weights.allocator()->init(weights_to_use->info()->clone()->set_is_resizable(true).reset_padding());
     _flip_axis.allocator()->init(TensorInfo(TensorShape(2U), 1, DataType::U32));
-    _flip_weights_func.configure(compile_context, weights_to_use, &_flipped_weights, &_flip_axis);
+    _flip_weights_func.configure(compile_context, weights_to_use, &_flipped_weights, &_flip_axis,
+                                 /* use_inverted_axis */ false);
     // Pad weights
-    const PaddingList padding_w = { { 0, input_dims.x() + pad_valid.x() - 1 }, { 0, input_dims.y() + pad_valid.y() - 1 } };
+    const PaddingList padding_w = {{0, input_dims.x() + pad_valid.x() - 1}, {0, input_dims.y() + pad_valid.y() - 1}};
     _pad_weights_func.configure(compile_context, &_flipped_weights, &_padded_weights, padding_w);
     // Transform weights
-    _transform_weights_func = support::cpp14::make_unique<CLFFT2D>();
+    _transform_weights_func = std::make_unique<CLFFT2D>();
     _transform_weights_func->configure(compile_context, &_padded_weights, &_transformed_weights, FFT2DInfo());
     // Pad input
-    const PaddingList padding_in = { { 0, kernel_size.x() + pad_valid.x() - 1 }, { 0, kernel_size.y() + pad_valid.y() - 1 } };
+    const PaddingList padding_in = {{0, kernel_size.x() + pad_valid.x() - 1}, {0, kernel_size.y() + pad_valid.y() - 1}};
     _memory_group.manage(&_padded_input);
     _pad_input_func.configure(compile_context, input_to_use, &_padded_input, padding_in);
-    if(_needs_permute)
+    if (_needs_permute)
     {
         _permuted_input.allocator()->allocate();
     }
@@ -189,7 +220,8 @@ void CLFFTConvolutionLayer::configure(const CLCompileContext &compile_context, I
     _memory_group.manage(&_itransformed_output);
     FFT2DInfo itranform_info;
     itranform_info.direction = FFTDirection::Inverse;
-    _itransformed_output.allocator()->init(_output_reduced.info()->clone()->set_is_resizable(true).set_num_channels(1).reset_padding());
+    _itransformed_output.allocator()->init(
+        _output_reduced.info()->clone()->set_is_resizable(true).set_num_channels(1).reset_padding());
     _itransform_output_func.configure(compile_context, &_output_reduced, &_itransformed_output, itranform_info);
     _output_reduced.allocator()->allocate();
@@ -201,25 +233,28 @@ void CLFFTConvolutionLayer::configure(const CLCompileContext &compile_context, I
     // Extract correct region
     const int start_left = kernel_size.x() - conv_info.pad_left() - 1;
     const int start_top  = kernel_size.y() - conv_info.pad_top() - 1;
-    const int end_right  = _reshaped_output.info()->tensor_shape().x() - (kernel_size.x() - conv_info.pad_right() - 1) - pad_valid.x();
-    const int end_botton = _reshaped_output.info()->tensor_shape().y() - (kernel_size.y() - conv_info.pad_bottom() - 1) - pad_valid.y();
-    if(_has_bias)
+    const int end_right =
+        _reshaped_output.info()->tensor_shape().x() - (kernel_size.x() - conv_info.pad_right() - 1) - pad_valid.x();
+    const int end_botton =
+        _reshaped_output.info()->tensor_shape().y() - (kernel_size.y() - conv_info.pad_bottom() - 1) - pad_valid.y();
+    if (_has_bias)
     {
         _memory_group.manage(&_bias_output);
     }
-    else if(_needs_permute)
+    else if (_needs_permute)
     {
         output_to_use = &_permuted_output;
         _memory_group.manage(&_permuted_output);
     }
-    _extract_output_func.configure(compile_context, &_reshaped_output, output_to_use, Coordinates(start_left, start_top), Coordinates(end_right, end_botton));
+    _extract_output_func.configure(compile_context, &_reshaped_output, output_to_use,
+                                   Coordinates(start_left, start_top), Coordinates(end_right, end_botton));
     _itransformed_output.allocator()->allocate();
     // Add bias
-    if(biases != nullptr)
+    if (biases != nullptr)
     {
         output_to_use = output;
-        if(_needs_permute)
+        if (_needs_permute)
         {
             output_to_use = &_permuted_output;
             _memory_group.manage(&_permuted_output);
@@ -230,7 +265,7 @@ void CLFFTConvolutionLayer::configure(const CLCompileContext &compile_context, I
     }
     // Permute output
-    if(_needs_permute)
+    if (_needs_permute)
     {
         // Configure the function to transform the convoluted output to ACL's native ordering format NCHW
         _permuted_output.info()->set_data_layout(DataLayout::NCHW);
@@ -242,7 +277,7 @@ void CLFFTConvolutionLayer::configure(const CLCompileContext &compile_context, I
     // Configure Activation Layer
     _is_activationlayer_enabled = act_info.enabled();
-    if(_is_activationlayer_enabled)
+    if (_is_activationlayer_enabled)
     {
         _activation_layer_func.configure(compile_context, output, nullptr, act_info);
     }
@@ -256,10 +291,16 @@ void CLFFTConvolutionLayer::configure(const CLCompileContext &compile_context, I
     _flip_axis.unmap();
 }
-Status CLFFTConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
-                                       const ActivationLayerInfo &act_info)
+Status CLFFTConvolutionLayer::validate(const ITensorInfo *input,
+                                       const ITensorInfo *weights,
+                                       const ITensorInfo *biases,
+                                       const ITensorInfo *output,
+                                       const PadStrideInfo &conv_info,
+                                       const ActivationLayerInfo &act_info,
+                                       bool enable_fast_math)
 {
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON((input->data_type() == DataType::F16) && !enable_fast_math);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
     // Get indices for the width and height
@@ -273,25 +314,27 @@ Status CLFFTConvolutionLayer::validate(const ITensorInfo *input, const ITensorIn
     const auto strides = conv_info.stride();
     ARM_COMPUTE_RETURN_ERROR_ON(strides.first != strides.second && strides.first != 1);
     ARM_COMPUTE_RETURN_ERROR_ON(kernel_size.x() != kernel_size.y());
-    ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_left() != (kernel_size.x() / 2) || conv_info.pad_right() != (kernel_size.x() / 2));
-    ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_top() != (kernel_size.y() / 2) || conv_info.pad_bottom() != (kernel_size.y() / 2));
+    ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_left() != (kernel_size.x() / 2) ||
+                                conv_info.pad_right() != (kernel_size.x() / 2));
+    ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_top() != (kernel_size.y() / 2) ||
+                                conv_info.pad_bottom() != (kernel_size.y() / 2));
     // Validate biases
-    if(biases != nullptr)
+    if (biases != nullptr)
     {
-        const size_t idx_channels = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
-        ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_channels] != biases->tensor_shape().x());
+        ARM_COMPUTE_RETURN_ERROR_ON(weights->tensor_shape()[3] != biases->tensor_shape().x());
     }
     // Checks performed when output is configured
-    if((output != nullptr) && (output->total_size() != 0))
+    if ((output != nullptr) && (output->total_size() != 0))
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-        ARM_COMPUTE_RETURN_ERROR_ON((input->tensor_shape()[idx_height] != output->tensor_shape()[idx_height]) || (input->tensor_shape()[idx_width] != output->tensor_shape()[idx_width]));
+        ARM_COMPUTE_RETURN_ERROR_ON((input->tensor_shape()[idx_height] != output->tensor_shape()[idx_height]) ||
+                                    (input->tensor_shape()[idx_width] != output->tensor_shape()[idx_width]));
         // Validate Activation Layer
-        if(act_info.enabled())
+        if (act_info.enabled())
         {
             ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(output, nullptr, act_info));
         }
@@ -307,7 +350,7 @@ void CLFFTConvolutionLayer::run()
     MemoryGroupResourceScope scope_mg(_memory_group);
     // Transform input
-    if(_needs_permute)
+    if (_needs_permute)
     {
         _permute_input_func.run();
     }
@@ -323,17 +366,17 @@ void CLFFTConvolutionLayer::run()
     _reshaped_output.allocator()->import_memory(_itransformed_output.cl_buffer());
     _extract_output_func.run();
     // Add bias
-    if(_has_bias)
+    if (_has_bias)
     {
         _bias_add_func.run();
     }
-    if(_needs_permute)
+    if (_needs_permute)
     {
         _permute_output_func.run();
     }
     // Run activation layer
-    if(_is_activationlayer_enabled)
+    if (_is_activationlayer_enabled)
     {
         _activation_layer_func.run();
     }
@@ -341,10 +384,10 @@ void CLFFTConvolutionLayer::prepare()
 {
-    if(!_is_prepared)
+    if (!_is_prepared)
     {
         // Permute bias to NCHW
-        if(_original_bias != nullptr)
+        if (_original_bias != nullptr)
         {
             _permuted_bias.allocator()->allocate();
             _permute_bias_func.run();
@@ -353,7 +396,7 @@ void CLFFTConvolutionLayer::prepare()
         const ICLTensor *cur_weights = _original_weights;
         // Permute weights
-        if(_needs_permute)
+        if (_needs_permute)
         {
             ARM_COMPUTE_ERROR_ON(!cur_weights->is_used());
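The new enable_fast_math argument is what unlocks F16 here: validate() accepts F16 input only when fast math is requested, on top of the existing stride, kernel-size and padding restrictions. A caller-side sketch (tensor infos assumed to be already initialised; includes elided):

    const Status status = CLFFTConvolutionLayer::validate(
        input.info(), weights.info(), bias.info(), output.info(),
        conv_info, ActivationLayerInfo(), /* enable_fast_math */ true);
    if (status.error_code() != ErrorCode::OK)
    {
        // F16 without fast math, asymmetric padding, non-square kernels,
        // etc. are all reported through the returned Status.
        std::cerr << status.error_description() << std::endl;
    }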
IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLFastCorners.h"
-
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/CL/kernels/CLFastCornersKernel.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "arm_compute/runtime/ITensorAllocator.h"
-
-#include <algorithm>
-#include <cstring>
-
-using namespace arm_compute;
-
-CLFastCorners::CLFastCorners(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)),
-      _fast_corners_kernel(),
-      _suppr_func(),
-      _copy_array_kernel(),
-      _output(),
-      _suppr(),
-      _win(),
-      _non_max(false),
-      _num_corners(nullptr),
-      _num_buffer(),
-      _corners(nullptr),
-      _constant_border_value(0)
-{
-}
-
-void CLFastCorners::configure(const ICLImage *input, float threshold, bool nonmax_suppression, ICLKeyPointArray *corners,
-                              unsigned int *num_corners, BorderMode border_mode, uint8_t constant_border_value)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, threshold, nonmax_suppression, corners, num_corners, border_mode, constant_border_value);
-}
-
-void CLFastCorners::configure(const CLCompileContext &compile_context, const ICLImage *input, float threshold, bool nonmax_suppression, ICLKeyPointArray *corners,
-                              unsigned int *num_corners, BorderMode border_mode, uint8_t constant_border_value)
-{
-    ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
-    ARM_COMPUTE_ERROR_ON(BorderMode::UNDEFINED != border_mode);
-    ARM_COMPUTE_ERROR_ON(nullptr == corners);
-    ARM_COMPUTE_ERROR_ON(threshold < 1 && threshold > 255);
-
-    TensorInfo tensor_info(input->info()->tensor_shape(), 1, DataType::U8);
-    _output.allocator()->init(tensor_info);
-
-    _non_max               = nonmax_suppression;
-    _num_corners           = num_corners;
-    _corners               = corners;
-    _num_buffer            = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(unsigned int));
-    _constant_border_value = constant_border_value;
-
-    const bool update_number = (nullptr != _num_corners);
-
-    _memory_group.manage(&_output);
-    _fast_corners_kernel.configure(compile_context, input, &_output, threshold, nonmax_suppression, border_mode);
-
-    if(!_non_max)
-    {
-        _copy_array_kernel.configure(compile_context, &_output, update_number, _corners, &_num_buffer);
-    }
-    else
-    {
-        _suppr.allocator()->init(tensor_info);
-        _memory_group.manage(&_suppr);
-
-        _suppr_func.configure(compile_context, &_output, &_suppr, border_mode);
-        _copy_array_kernel.configure(compile_context, &_suppr, update_number, _corners, &_num_buffer);
-
-        _suppr.allocator()->allocate();
-    }
-
-    // Allocate intermediate tensors
-    _output.allocator()->allocate();
-}
-
-void CLFastCorners::run()
-{
-    cl::CommandQueue q = CLScheduler::get().queue();
-
-    MemoryGroupResourceScope scope_mg(_memory_group);
-
-    if(_non_max)
-    {
-        ARM_COMPUTE_ERROR_ON_MSG(_output.cl_buffer().get() == nullptr, "Unconfigured function");
-        const auto out_buffer = static_cast<unsigned char *>(q.enqueueMapBuffer(_output.cl_buffer(), CL_TRUE, CL_MAP_WRITE, 0, _output.info()->total_size()));
-        memset(out_buffer, 0, _output.info()->total_size());
-        q.enqueueUnmapMemObject(_output.cl_buffer(), out_buffer);
-    }
-
-    CLScheduler::get().enqueue(_fast_corners_kernel, false);
-
-    if(_non_max)
-    {
-        _suppr_func.run();
-    }
-
-    CLScheduler::get().enqueue(_copy_array_kernel, false);
-
-    unsigned int get_num_corners = 0;
-    q.enqueueReadBuffer(_num_buffer, CL_TRUE, 0, sizeof(unsigned int), &get_num_corners);
-
-    size_t corner_size = std::min(static_cast<size_t>(get_num_corners), _corners->max_num_values());
-
-    _corners->resize(corner_size);
-
-    if(_num_corners != nullptr)
-    {
-        *_num_corners = get_num_corners;
-    }
-
-    q.flush();
-}
diff --git a/src/runtime/CL/functions/CLFill.cpp b/src/runtime/CL/functions/CLFill.cpp
index 7b96ed1592..9bd96a975e 100644
--- a/src/runtime/CL/functions/CLFill.cpp
+++ b/src/runtime/CL/functions/CLFill.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,22 +23,59 @@
  */
 #include "arm_compute/runtime/CL/functions/CLFill.h"
-#include "arm_compute/core/CL/kernels/CLMemsetKernel.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/core/CL/ICLKernel.h"
+#include "src/gpu/cl/operators/ClFill.h"
 #include <utility>
 namespace arm_compute
 {
-void CLFill::configure(ICLTensor *tensor, PixelValue constant_value)
+struct CLFill::Impl
+{
+    const ICLTensor                *src{nullptr};
+    ICLTensor                      *dst{nullptr};
+    std::unique_ptr<opencl::ClFill> op{nullptr};
+};
+
+CLFill::CLFill() : _impl(std::make_unique<Impl>())
+{
+}
+CLFill::CLFill(CLFill &&) = default;
+CLFill &CLFill::operator=(CLFill &&) = default;
+CLFill::~CLFill() = default;
+
+void CLFill::configure(ICLTensor *tensor, const PixelValue &constant_value, Window *dst_window)
+{
+    configure(CLKernelLibrary::get().get_compile_context(), tensor, constant_value, dst_window);
+}
+
+void CLFill::configure(const CLCompileContext &compile_context,
+                       ICLTensor *tensor,
+                       const PixelValue &constant_value,
+                       Window *dst_window)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(tensor);
+
+    _impl->src = tensor;
+
+    _impl->op = std::make_unique<opencl::ClFill>();
+    _impl->op->configure(compile_context, _impl->src->info(), constant_value, dst_window);
+}
+
+Status CLFill::validate(const ITensorInfo *tensor, const PixelValue &constant_value, Window *dst_window)
 {
-    configure(CLKernelLibrary::get().get_compile_context(), tensor, constant_value);
+    return opencl::ClFill::validate(tensor, constant_value, dst_window);
 }
-void CLFill::configure(const CLCompileContext &compile_context, ICLTensor *tensor, PixelValue constant_value)
+void CLFill::run()
 {
-    auto k = arm_compute::support::cpp14::make_unique<CLMemsetKernel>();
-    k->configure(compile_context, tensor, constant_value);
-    _kernel = std::move(k);
+    ITensorPack pack;
+    pack.add_tensor(TensorType::ACL_SRC, _impl->src);
+    _impl->op->run(pack);
 }
 } // namespace arm_compute
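CLFill's signature change is worth noting: the fill value is now taken by const reference, and an optional Window restricts the fill to a sub-region. Typical use (sketch; setup reduced to the essentials, and assuming the header defaults dst_window to nullptr, which fills the whole tensor):

    CLTensor t;
    t.allocator()->init(TensorInfo(TensorShape(8U, 8U), 1, DataType::F32));
    t.allocator()->allocate();

    CLFill fill;
    fill.configure(&t, PixelValue(0.f)); // no Window: fill every element
    fill.run();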
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/CL/functions/CLFillBorder.h" - -#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h" -#include "support/MemorySupport.h" - -#include <utility> - -using namespace arm_compute; - -void CLFillBorder::configure(ICLTensor *tensor, unsigned int border_width, BorderMode border_mode, const PixelValue &constant_border_value) -{ - configure(CLKernelLibrary::get().get_compile_context(), tensor, border_width, border_mode, constant_border_value); -} - -void CLFillBorder::configure(const CLCompileContext &compile_context, ICLTensor *tensor, unsigned int border_width, BorderMode border_mode, const PixelValue &constant_border_value) -{ - auto k = arm_compute::support::cpp14::make_unique<CLFillBorderKernel>(); - k->configure(compile_context, tensor, BorderSize(border_width), border_mode, constant_border_value); - _kernel = std::move(k); -} diff --git a/src/runtime/CL/functions/CLFlattenLayer.cpp b/src/runtime/CL/functions/CLFlattenLayer.cpp index 9a247ccfcb..ba1b5372d3 100644 --- a/src/runtime/CL/functions/CLFlattenLayer.cpp +++ b/src/runtime/CL/functions/CLFlattenLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 ARM Limited. + * Copyright (c) 2017-2021 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -23,11 +23,31 @@ */ #include "arm_compute/runtime/CL/functions/CLFlattenLayer.h" -#include "arm_compute/core/CL/kernels/CLFlattenLayerKernel.h" -#include "arm_compute/runtime/CL/CLScheduler.h" -#include "support/MemorySupport.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" -using namespace arm_compute; +#include "src/core/CL/ICLKernel.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/gpu/cl/operators/ClFlatten.h" + +namespace arm_compute +{ +struct CLFlattenLayer::Impl +{ + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<opencl::ClFlatten> op{nullptr}; +}; + +CLFlattenLayer::CLFlattenLayer() : _impl(std::make_unique<Impl>()) +{ +} +CLFlattenLayer::CLFlattenLayer(CLFlattenLayer &&) = default; +CLFlattenLayer &CLFlattenLayer::operator=(CLFlattenLayer &&) = default; +CLFlattenLayer::~CLFlattenLayer() = default; void CLFlattenLayer::configure(const ICLTensor *input, ICLTensor *output) { @@ -36,13 +56,33 @@ void CLFlattenLayer::configure(const ICLTensor *input, ICLTensor *output) void CLFlattenLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output) { - auto k = arm_compute::support::cpp14::make_unique<CLFlattenLayerKernel>(); - k->configure(compile_context, input, output); - _kernel = std::move(k); - CLScheduler::get().tune_kernel_static(*_kernel); + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + _impl->src = input; + _impl->dst = output; + auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape( + misc::shape_calculator::compute_flatten_shape(input->info()))); + + _impl->op = std::make_unique<opencl::ClFlatten>(); + _impl->op->configure(compile_context, _impl->src->info(), _impl->dst->info()); } Status CLFlattenLayer::validate(const ITensorInfo *input, const ITensorInfo *output) { - return CLFlattenLayerKernel::validate(input, output); + // Checks performed when output is configured + if (output->total_size() != 0) + { + const TensorInfo tensor_info_output = + input->clone()->set_tensor_shape(misc::shape_calculator::compute_flatten_shape(input)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output); + } + return opencl::ClFlatten::validate(input, output); +} + +void CLFlattenLayer::run() +{ + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC, _impl->src); + pack.add_tensor(TensorType::ACL_DST, _impl->dst); + _impl->op->run(pack); } +} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLFloor.cpp b/src/runtime/CL/functions/CLFloor.cpp index 44e1d39dc2..4322219dd9 100644 --- a/src/runtime/CL/functions/CLFloor.cpp +++ b/src/runtime/CL/functions/CLFloor.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 ARM Limited. + * Copyright (c) 2016-2021 Arm Limited. 
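CLFlattenLayer now auto-initializes an empty output from compute_flatten_shape, which collapses width, height, and channels into dimension 0 while carrying any batch dimensions over, and validate() cross-checks a pre-configured output against the same rule. A standalone sketch of that shape rule; flatten_shape is an illustrative helper, not the library function, and assumes at least three input dimensions.

#include <cstddef>
#include <vector>

// Illustrative only: dimension 0 becomes W*H*C, trailing batch dims are kept.
// e.g. {W, H, C, N} -> {W*H*C, N}
std::vector<std::size_t> flatten_shape(const std::vector<std::size_t> &in)
{
    std::vector<std::size_t> out{in.at(0) * in.at(1) * in.at(2)};
    out.insert(out.end(), in.begin() + 3, in.end());
    return out;
}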
* * SPDX-License-Identifier: MIT * @@ -23,11 +23,30 @@ */ #include "arm_compute/runtime/CL/functions/CLFloor.h" -#include "arm_compute/core/CL/kernels/CLFloorKernel.h" -#include "support/MemorySupport.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" + +#include "src/core/CL/ICLKernel.h" +#include "src/gpu/cl/operators/ClFloor.h" namespace arm_compute { +struct CLFloor::Impl +{ + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<opencl::ClFloor> op{nullptr}; +}; + +CLFloor::CLFloor() : _impl(std::make_unique<Impl>()) +{ +} +CLFloor::CLFloor(CLFloor &&) = default; +CLFloor &CLFloor::operator=(CLFloor &&) = default; +CLFloor::~CLFloor() = default; + void CLFloor::configure(const ICLTensor *input, ICLTensor *output) { configure(CLKernelLibrary::get().get_compile_context(), input, output); @@ -35,13 +54,25 @@ void CLFloor::configure(const ICLTensor *input, ICLTensor *output) void CLFloor::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output) { - auto k = arm_compute::support::cpp14::make_unique<CLFloorKernel>(); - k->configure(compile_context, input, output); - _kernel = std::move(k); + ARM_COMPUTE_ERROR_ON_NULLPTR(input); + + _impl->src = input; + _impl->dst = output; + + _impl->op = std::make_unique<opencl::ClFloor>(); + _impl->op->configure(compile_context, _impl->src->info(), _impl->dst->info()); } Status CLFloor::validate(const ITensorInfo *input, const ITensorInfo *output) { - return CLFloorKernel::validate(input, output); + return opencl::ClFloor::validate(input, output); +} + +void CLFloor::run() +{ + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC, _impl->src); + pack.add_tensor(TensorType::ACL_DST, _impl->dst); + _impl->op->run(pack); } } // namespace arm_compute diff --git a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp index ecbac6f703..b30f9e701f 100644 --- a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp +++ b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 ARM Limited. + * Copyright (c) 2017-2021, 2023 Arm Limited. 
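CLFloor gets the same treatment as CLFill and CLFlattenLayer above: the public class keeps only a pointer to a private Impl, so the operator headers stay out of the installed includes, and because Impl is an incomplete type in the header, the destructor and move operations must be defaulted here in the .cpp where Impl is complete. A condensed sketch of the recurring skeleton; SomeFunction and ClOp are placeholders for whichever runtime function and opencl operator are being paired.

// Sketch of the shared pattern; ClOp stands in for the wrapped operator.
struct SomeFunction::Impl
{
    const ICLTensor      *src{nullptr};
    ICLTensor            *dst{nullptr};
    std::unique_ptr<ClOp> op{nullptr};
};

SomeFunction::SomeFunction() : _impl(std::make_unique<Impl>())
{
}
SomeFunction::~SomeFunction() = default; // must live where Impl is complete

void SomeFunction::run()
{
    // Operators hold no tensor state; tensors are handed over per call.
    ITensorPack pack;
    pack.add_tensor(TensorType::ACL_SRC, _impl->src);
    pack.add_tensor(TensorType::ACL_DST, _impl->dst);
    _impl->op->run(pack);
}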
* * SPDX-License-Identifier: MIT * @@ -23,496 +23,137 @@ */ #include "arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h" -#include "arm_compute/core/Size2D.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/Cast.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "support/MemorySupport.h" -#include <algorithm> +#include "src/core/helpers/MemoryHelpers.h" +#include "src/gpu/cl/operators/ClFullyConnected.h" namespace arm_compute { -using namespace arm_compute::misc::shape_calculator; -using namespace arm_compute::utils::cast; +using namespace arm_compute::experimental; -namespace +struct CLFullyConnectedLayer::Impl { -Status construct_gemmlowp_output_stage(const ITensorInfo &input, const ITensorInfo &weights, const ITensorInfo &output, - GEMMLowpOutputStageInfo &gemmlowp_output_stage, ActivationLayerInfo activation_info) -{ - gemmlowp_output_stage.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT; - gemmlowp_output_stage.gemmlowp_offset = 0; - gemmlowp_output_stage.gemmlowp_multiplier = 0; - gemmlowp_output_stage.gemmlowp_shift = 0; - - const auto data_type = input.data_type(); - - // Configure output stage for quantized case - if(is_data_type_quantized_asymmetric(data_type)) - { - const QuantizationInfo oq_info = output.quantization_info(); - const UniformQuantizationInfo iq_unif = input.quantization_info().uniform(); - const UniformQuantizationInfo wq_unif = weights.quantization_info().uniform(); - const UniformQuantizationInfo oq_unif = oq_info.uniform(); - - const auto output_quant_info = (output.total_size() == 0) ? iq_unif : oq_unif; - - const float multiplier = (iq_unif.scale * wq_unif.scale) / output_quant_info.scale; - int output_multiplier = 0; - int output_shift = 0; - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift)); - - PixelValue type_min{}; - PixelValue type_max{}; - std::tie(type_min, type_max) = get_min_max(data_type); - - if(activation_info.enabled()) - { - switch(activation_info.activation()) - { - case ActivationLayerInfo::ActivationFunction::RELU: - type_min = PixelValue(oq_unif.offset); - break; - case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU: - type_min = PixelValue(oq_unif.offset); - type_max = PixelValue(activation_info.a(), data_type, oq_info); - break; - case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU: - type_min = PixelValue(activation_info.b(), data_type, oq_info); - type_max = PixelValue(activation_info.a(), data_type, oq_info); - break; - default: - ARM_COMPUTE_ERROR("Activation function not supported."); - break; - } - } - - // Set the GEMMLowp output stage info - gemmlowp_output_stage.gemmlowp_offset = output_quant_info.offset; - gemmlowp_output_stage.gemmlowp_multiplier = output_multiplier; - gemmlowp_output_stage.gemmlowp_shift = output_shift; - gemmlowp_output_stage.gemmlowp_multipliers.push_back(output_multiplier); - gemmlowp_output_stage.gemmlowp_shifts.push_back(output_shift); - type_min.get(gemmlowp_output_stage.gemmlowp_min_bound); - type_max.get(gemmlowp_output_stage.gemmlowp_max_bound); - } - - return Status{}; -} - -Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const ITensorInfo *bias, const ITensorInfo &output, const FullyConnectedLayerInfo &fc_info) -{ - GEMMLowpOutputStageInfo gemmlowp_output_stage; - 
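The removed construct_gemmlowp_output_stage (this logic now lives inside opencl::ClFullyConnected) derives the requantization parameters from the real multiplier (input_scale * weights_scale) / output_scale, and calculate_quantized_multiplier splits that real value into an integer multiplier plus a right shift. A self-contained sketch of the standard decomposition for multipliers below 1; ACL's helper also accepts values of 1 or more by allowing a negative shift, and error handling is omitted here.

#include <cmath>
#include <cstdint>

// Decompose m in (0, 1) as m ~= (m0 / 2^31) * 2^(-shift),
// with m0 a Q0.31 fixed-point value in [2^30, 2^31).
void quantize_multiplier(double m, std::int32_t *m0, int *shift)
{
    int exponent = 0;
    const double q = std::frexp(m, &exponent); // m = q * 2^exponent, q in [0.5, 1)
    *shift = -exponent;

    auto q_fixed = static_cast<std::int64_t>(std::llround(q * (1ll << 31)));
    if (q_fixed == (1ll << 31)) // rounding pushed q up to 1.0: renormalize
    {
        q_fixed /= 2;
        --*shift;
    }
    *m0 = static_cast<std::int32_t>(q_fixed);
}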
ARM_COMPUTE_RETURN_ON_ERROR(construct_gemmlowp_output_stage(input, weights, output, gemmlowp_output_stage, fc_info.activation_info)); - - const GEMMInfo &gemm_info = GEMMInfo(false, // is_a_reshaped - false, // is_b_reshaped - true, // reshape_b_only_on_first_run - 0, // depth_output_gemm3d - false, // reinterpret_input_as_3d - fc_info.retain_internal_weights, // retain_internal_weights - gemmlowp_output_stage, // gemmlowp_output_stage - fc_info.fp_mixed_precision, // fp_mixed_precision - true, // broadcast_bias - ActivationLayerInfo()); // activation_info - - if(is_data_type_quantized_asymmetric(input.data_type())) - { - const UniformQuantizationInfo iq_info = input.quantization_info().uniform(); - const UniformQuantizationInfo wq_info = weights.quantization_info().uniform(); - - // Since we need negative offsets for computing convolution, we need to change QuantizationInfo() - // Extract and negate input and weights offset - const QuantizationInfo input_quantization_info(iq_info.scale, -iq_info.offset); - const QuantizationInfo weights_quantization_info(wq_info.scale, -wq_info.offset); - - // Validate gemmlowp function - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyCore::validate(&input.clone()->set_quantization_info(input_quantization_info), - &weights.clone()->set_quantization_info(weights_quantization_info), - bias, - &output, - gemm_info)); - } - else - { - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(&input, &weights, bias, &output, 1.f, 1.f, gemm_info)); - } - - return Status{}; -} -} // namespace - -void CLFullyConnectedLayerReshapeWeights::configure(const ICLTensor *input, ICLTensor *output) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output); -} - -void CLFullyConnectedLayerReshapeWeights::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output) -{ - auto k = arm_compute::support::cpp14::make_unique<CLTransposeKernel>(); - k->configure(compile_context, input, output); - _kernel = std::move(k); -} + MemoryGroup memory_group{}; + IWeightsManager *weights_manager{nullptr}; -Status CLFullyConnectedLayerReshapeWeights::validate(const ITensorInfo *input, const ITensorInfo *output) -{ - return CLTransposeKernel::validate(input, output); -} + std::unique_ptr<opencl::ClFullyConnected> op{nullptr}; -CLFullyConnectedLayer::CLFullyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager *weights_manager) - : _memory_group(memory_manager), _weights_manager(weights_manager), _convert_weights(), _convert_weights_managed(), _reshape_weights_managed_function(), _flatten_layer(), _reshape_weights_function(), - _mm_gemm(memory_manager, weights_manager), _mm_gemmlowp(memory_manager), _flatten_output(), _converted_weights_output(), _reshape_weights_output(), _are_weights_converted(true), - _are_weights_reshaped(true), _is_fc_after_conv(true), _is_quantized(false), _is_prepared(false), _original_weights(nullptr) -{ -} -void CLFullyConnectedLayer::configure_mm(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, - const FullyConnectedLayerInfo &fc_info) -{ - GEMMLowpOutputStageInfo gemmlowp_output_stage; - construct_gemmlowp_output_stage(*input->info(), *weights->info(), *output->info(), gemmlowp_output_stage, fc_info.activation_info); - - const GEMMInfo &gemm_info = GEMMInfo(false, // is_a_reshaped - false, // is_b_reshaped - true, // reshape_b_only_on_first_run - 0, // depth_output_gemm3d - false, // reinterpret_input_as_3d - 
fc_info.retain_internal_weights, // retain_internal_weights - gemmlowp_output_stage, // gemmlowp_output_stage - fc_info.fp_mixed_precision, // fp_mixed_precision - true, // broadcast_bias - fc_info.activation_info); // activation_info - - if(_is_quantized) - { - // Since we need negative offsets for computing convolution, we need to change QuantizationInfo() - // Extract and negate input and weights offset - const QuantizationInfo input_quantization_info = input->info()->quantization_info(); - const QuantizationInfo weights_quantization_info = weights->info()->quantization_info(); + const ITensor *original_weights{nullptr}; - input->info()->set_quantization_info(QuantizationInfo(input_quantization_info.uniform().scale, -input_quantization_info.uniform().offset)); - weights->info()->set_quantization_info(QuantizationInfo(weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset)); + ITensorPack run_pack{}; + WorkspaceData<CLTensor> workspace{}; + experimental::MemoryRequirements aux_mem_req{}; - // Configure gemmlowp function - _mm_gemmlowp.configure(compile_context, input, weights, bias, output, gemm_info); + bool is_prepared{false}; + bool dynamic_weights{false}; +}; - // Revert back QuantizatioInfo as input and weights could be used in other fully connected layers - input->info()->set_quantization_info(input_quantization_info); - weights->info()->set_quantization_info(weights_quantization_info); - } - else - { - // Configure matrix multiply kernel - _mm_gemm.configure(compile_context, input, weights, bias, output, 1.f, 1.f, gemm_info); - } -} - -void CLFullyConnectedLayer::configure_conv_fc(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, - const FullyConnectedLayerInfo &fc_info) +CLFullyConnectedLayer::CLFullyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager, + IWeightsManager *weights_manager) + : _impl(std::make_unique<Impl>()) { - ARM_COMPUTE_ERROR_ON((weights->info()->dimension(1) != (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2)))); - - // If the fully connected layer is called after a convolution layer, the input tensor must be linearized - - // Initialize output tensor for flatten - TensorShape shape_flatten = compute_flatten_shape(input->info()); - _flatten_output.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_flatten).set_data_layout(DataLayout::NCHW)); - - // Configure flatten kernel - _memory_group.manage(&_flatten_output); - _flatten_layer.configure(compile_context, input, &_flatten_output); - - // Configure matrix multiply kernel - configure_mm(compile_context, &_flatten_output, weights, bias, output, fc_info); - - // Allocate the output tensor for flatten once all the configure methods have been called - _flatten_output.allocator()->allocate(); + _impl->memory_group = MemoryGroup(std::move(memory_manager)); + _impl->weights_manager = weights_manager; } -void CLFullyConnectedLayer::configure_fc_fc(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, - const FullyConnectedLayerInfo &fc_info) -{ - ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != weights->info()->dimension(1)); +CLFullyConnectedLayer::~CLFullyConnectedLayer() = default; - // Configure matrix multiply kernel - configure_mm(compile_context, input, weights, bias, output, fc_info); -} - -void 
CLFullyConnectedLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, +void CLFullyConnectedLayer::configure(const ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *output, FullyConnectedLayerInfo fc_info) { configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, fc_info); } -void CLFullyConnectedLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, +void CLFullyConnectedLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *output, FullyConnectedLayerInfo fc_info) { + // Perform validate step ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); + ARM_COMPUTE_ERROR_THROW_ON(CLFullyConnectedLayer::validate( + input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), fc_info)); - // Perform validate step - ARM_COMPUTE_ERROR_THROW_ON(CLFullyConnectedLayer::validate(input->info(), - weights->info(), - biases != nullptr ? biases->info() : nullptr, - output->info(), - fc_info)); + _impl->op = std::make_unique<opencl::ClFullyConnected>(); + _impl->original_weights = weights; + _impl->is_prepared = fc_info.retain_internal_weights; - _are_weights_converted = true; - _are_weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true; - _is_fc_after_conv = true; - _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type()); - _is_prepared = fc_info.retain_internal_weights; - _original_weights = weights; + _impl->op->configure(compile_context, input->info(), weights->info(), + (biases != nullptr) ? 
biases->info() : nullptr, output->info(), fc_info); - if(_weights_manager) + if (_impl->weights_manager != nullptr) { - _weights_manager->manage(weights); + _impl->weights_manager->manage(_impl->original_weights); } - const ICLTensor *weights_to_use = weights; - - // With the Fully Connected layer we can have 4 different cases: - // 1) Convolution layer -> Fully Connected layer without batches - // 2) Fully Connected layer -> Fully Connected layer without batches - // 3) Convolution layer -> Fully Connected layer with batches - // 4) Fully Connected layer -> Fully Connected layer with batches - - // Check if we have a fully connected layer with batches - const bool is_batched_fc_layer = output->info()->dimension(1) > 1; - if(is_batched_fc_layer) + if (!_impl->is_prepared) { - _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && (std::equal(input->info()->tensor_shape().cbegin() + 3, - input->info()->tensor_shape().cend(), - output->info()->tensor_shape().cbegin() + 1)); + _impl->aux_mem_req = _impl->op->workspace(); + _impl->run_pack = {{ACL_SRC_0, input}, {ACL_SRC_1, weights}, {ACL_SRC_2, biases}, {ACL_DST, output}}; + _impl->workspace = + manage_workspace<CLTensor>(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->run_pack); } else { - _is_fc_after_conv = input->info()->num_dimensions() > 1; - } - - // Reshape weights if needed - if(!_are_weights_reshaped) - { - if(_weights_manager && _weights_manager->are_weights_managed(weights)) - { - _reshape_weights_managed_function.configure(compile_context, weights); - weights_to_use = utils::cast::polymorphic_downcast<ICLTensor *>(_weights_manager->acquire(weights, &_reshape_weights_managed_function)); - } - else - { - // Reshape the weights - _reshape_weights_function.configure(compile_context, weights, &_reshape_weights_output); - weights_to_use = &_reshape_weights_output; - } - } - - // Convert weights if needed - if(_is_fc_after_conv && (input->info()->data_layout() != fc_info.weights_trained_layout)) - { - if(_weights_manager && _weights_manager->are_weights_managed(weights_to_use)) - { - _convert_weights_managed.configure(compile_context, weights_to_use, - input->info()->tensor_shape(), - fc_info.weights_trained_layout); - weights_to_use = utils::cast::polymorphic_downcast<ICLTensor *>(_weights_manager->acquire(weights, &_convert_weights_managed)); - } - else - { - // Convert weights - _convert_weights.configure(compile_context, weights_to_use, - &_converted_weights_output, - input->info()->tensor_shape(), - fc_info.weights_trained_layout); - - weights_to_use = &_converted_weights_output; - } - _are_weights_converted = false; + _impl->run_pack.add_tensor(ACL_SRC_0, input); + _impl->run_pack.add_tensor(ACL_DST, output); } - if(_is_fc_after_conv) - { - // Fully Connected layer after a Convolution Layer without batches - configure_conv_fc(compile_context, input, weights_to_use, biases, output, fc_info); - } - else - { - // Fully Connected layer after a Fully Connected Layer without batches - configure_fc_fc(compile_context, input, weights_to_use, biases, output, fc_info); - } + _impl->dynamic_weights = !weights->info()->are_values_constant() && fc_info.transpose_weights && + !fc_info.are_weights_reshaped && !fc_info.retain_internal_weights; } -Status CLFullyConnectedLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, +Status CLFullyConnectedLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const 
ITensorInfo *output, FullyConnectedLayerInfo fc_info) { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output); - ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 2); - ARM_COMPUTE_RETURN_ERROR_ON(fc_info.activation_info.enabled() && is_data_type_quantized(input->data_type()) && fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::RELU - && fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::BOUNDED_RELU && fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU); - - bool weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true; - bool is_fc_after_conv = true; - - const ITensorInfo &flatten_input = TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_flatten_shape(input)).set_data_layout(DataLayout::NCHW)); - const ITensorInfo &reshaped_weights = TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_transposed_shape(*weights))); - const ITensorInfo &converted_weights = weights_reshaped ? TensorInfo(weights->clone()->set_is_resizable(true).reset_padding()) : TensorInfo(*reshaped_weights.clone()); - - // With the Fully Connected layer we can have 4 different cases: - // 1) Convolution layer -> Fully Connected layer without batches - // 2) Fully Connected layer -> Fully Connected layer without batches - // 3) Convolution layer -> Fully Connected layer with batches - // 4) Fully Connected layer -> Fully Connected layer with batches - - const ITensorInfo *input_to_use = input; - const ITensorInfo *weights_to_use = weights; - - // Check if we have a fully connected layer with batches - const bool is_batched_fc_layer = output->dimension(1) > 1; - if(is_batched_fc_layer) - { - is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && (std::equal(input->tensor_shape().cbegin() + 3, - input->tensor_shape().cend(), - output->tensor_shape().cbegin() + 1)); - } - else - { - is_fc_after_conv = input->num_dimensions() > 1; - } - - if(!weights_reshaped) - { - // Validate reshape weights kernel - ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayerReshapeWeights::validate(weights, &reshaped_weights)); - weights_to_use = &reshaped_weights; - } - - if(is_fc_after_conv && (input->data_layout() != fc_info.weights_trained_layout)) - { - // Validate convert weights kernel - ARM_COMPUTE_RETURN_ON_ERROR(CLConvertFullyConnectedWeights::validate(weights_to_use, - &converted_weights, - input->tensor_shape(), - fc_info.weights_trained_layout)); - weights_to_use = &converted_weights; - } - - if(is_fc_after_conv) - { - // Fully Connected layer after a Convolution Layer without batches - ARM_COMPUTE_RETURN_ERROR_ON((weights_to_use->dimension(1) != (input->dimension(0) * input->dimension(1) * input->dimension(2)))); - - // Validate flatten kernel - ARM_COMPUTE_RETURN_ON_ERROR(CLFlattenLayer::validate(input, &flatten_input)); - input_to_use = &flatten_input; - } - else - { - // Fully Connected layer after a Fully Connected Layer without batches - ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != weights_to_use->dimension(1)); - } - - // Validate matrix multiply kernel - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(*input_to_use, *weights_to_use, biases, *output, fc_info)); - - return Status{}; + 
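All of the hand-rolled checks the old validate() performed (flattened-input shape, weight transpose and layout conversion, the matching CLGEMM or CLGEMMLowpMatrixMultiplyCore validation) are delegated to opencl::ClFullyConnected::validate below. The one heuristic worth restating is how the removed code told a convolution-fed input apart from an FC-fed one; this sketch simply restates that removed logic with illustrative names.

#include <algorithm>

#include "arm_compute/core/ITensorInfo.h"
#include "arm_compute/core/TensorShape.h"

using namespace arm_compute;

// With batches, the FC input comes from a conv layer iff every input
// dimension from index 3 up matches the output dimensions from index 1 up.
bool is_fc_after_conv(const ITensorInfo &in, const ITensorInfo &out)
{
    const bool is_batched = out.dimension(1) > 1;
    if (is_batched)
    {
        return (TensorShape::num_max_dimensions >= 4) &&
               std::equal(in.tensor_shape().cbegin() + 3, in.tensor_shape().cend(),
                          out.tensor_shape().cbegin() + 1);
    }
    return in.num_dimensions() > 1; // without batches: any non-vector input
}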
return opencl::ClFullyConnected::validate(input, weights, biases, output, fc_info); } void CLFullyConnectedLayer::run() { - prepare(); - - MemoryGroupResourceScope scope_mg(_memory_group); - - // Linearize input if it comes from a convolutional layer - if(_is_fc_after_conv) + if (!_impl->dynamic_weights) { - _flatten_layer.run(); + prepare(); } - // Run matrix multiply - if(_is_quantized) - { - _mm_gemmlowp.run(); - } - else - { - _mm_gemm.run(); - } + MemoryGroupResourceScope scope_mg(_impl->memory_group); + _impl->op->run(_impl->run_pack); } void CLFullyConnectedLayer::prepare() { - if(!_is_prepared) + if (!_impl->is_prepared) { - if(!_weights_manager) - { - ARM_COMPUTE_ERROR_ON(!_original_weights->is_used()); - } - - auto release_unused = [](CLTensor * w) - { - if(!w->is_used()) - { - CLScheduler::get().queue().finish(); - w->allocator()->free(); - } - }; + _impl->op->prepare(_impl->run_pack); - // Pointer to current weights - const ICLTensor *cur_weights = _original_weights; + // Release temporary tensors that are only used in prepare stage + release_temporaries<CLTensor>(_impl->aux_mem_req, _impl->workspace); + _impl->is_prepared = true; - // Reshape of the weights if needed (happens only once) - if(!_are_weights_reshaped) + // Handle weights managed infrastructure + if (_impl->weights_manager != nullptr && _impl->weights_manager->are_weights_managed(_impl->original_weights)) { - if(_weights_manager && _weights_manager->are_weights_managed(_original_weights)) + // Ensure that b gets marked as unused (memory released) only after the last function which uses b also finishes its prepare + // This is for cases where multiple functions share the same b (weights) + // Therefore when a function marks original b as unused, we pre-mark it in weights manager, and mark it back to used so that it doesn't get released before its last reference + const ITensor *original_b = _impl->original_weights; + if (!original_b->is_used()) { - cur_weights = utils::cast::polymorphic_downcast<ICLTensor *>(_weights_manager->run(cur_weights, &_reshape_weights_managed_function)); + _impl->weights_manager->pre_mark_as_unused(original_b); } - else - { - // Run reshape weights kernel and mark weights as unused - _reshape_weights_output.allocator()->allocate(); - _reshape_weights_function.run(); - - cur_weights->mark_as_unused(); - cur_weights = &_reshape_weights_output; - } - _are_weights_reshaped = true; + _impl->original_weights->mark_as_used(); + _impl->weights_manager->release(_impl->original_weights); } - - // Convert weights if needed (happens only once) - if(!_are_weights_converted) - { - if(_weights_manager && _weights_manager->are_weights_managed(cur_weights)) - { - _weights_manager->run(cur_weights, &_convert_weights_managed); - } - else - { - _converted_weights_output.allocator()->allocate(); - _convert_weights.run(); - cur_weights->mark_as_unused(); - } - - _are_weights_converted = true; - } - - // Release reshaped weights if unused - release_unused(&_reshape_weights_output); - - // Prepare GEMM prepare and release unused weights - if(!_is_quantized) - { - _mm_gemm.prepare(); - } - - // Release converted weights if unused - release_unused(&_reshape_weights_output); - release_unused(&_converted_weights_output); - - _is_prepared = true; } } } // namespace arm_compute diff --git a/src/runtime/CL/functions/CLFuseBatchNormalization.cpp b/src/runtime/CL/functions/CLFuseBatchNormalization.cpp index 6deecdc089..e4fbf78e13 100644 --- a/src/runtime/CL/functions/CLFuseBatchNormalization.cpp +++ 
b/src/runtime/CL/functions/CLFuseBatchNormalization.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 ARM Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -29,39 +29,68 @@ #include "arm_compute/core/Types.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLFuseBatchNormalizationKernel.h" + namespace arm_compute { CLFuseBatchNormalization::CLFuseBatchNormalization() - : _fuse_bn_kernel() + : _fuse_bn_kernel(std::make_unique<CLFuseBatchNormalizationKernel>()) { } -void CLFuseBatchNormalization::configure(const ICLTensor *input_weights, const ICLTensor *bn_mean, const ICLTensor *bn_var, - ICLTensor *fused_weights, ICLTensor *fused_bias, - const ICLTensor *input_bias, const ICLTensor *bn_beta, const ICLTensor *bn_gamma, - float epsilon, FuseBatchNormalizationType fbn_type) +CLFuseBatchNormalization::~CLFuseBatchNormalization() = default; + +void CLFuseBatchNormalization::configure(const ICLTensor *input_weights, + const ICLTensor *bn_mean, + const ICLTensor *bn_var, + ICLTensor *fused_weights, + ICLTensor *fused_bias, + const ICLTensor *input_bias, + const ICLTensor *bn_beta, + const ICLTensor *bn_gamma, + float epsilon, + FuseBatchNormalizationType fbn_type) { - configure(CLKernelLibrary::get().get_compile_context(), input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, epsilon, fbn_type); + configure(CLKernelLibrary::get().get_compile_context(), input_weights, bn_mean, bn_var, fused_weights, fused_bias, + input_bias, bn_beta, bn_gamma, epsilon, fbn_type); } -void CLFuseBatchNormalization::configure(const CLCompileContext &compile_context, const ICLTensor *input_weights, const ICLTensor *bn_mean, const ICLTensor *bn_var, - ICLTensor *fused_weights, ICLTensor *fused_bias, - const ICLTensor *input_bias, const ICLTensor *bn_beta, const ICLTensor *bn_gamma, - float epsilon, FuseBatchNormalizationType fbn_type) +void CLFuseBatchNormalization::configure(const CLCompileContext &compile_context, + const ICLTensor *input_weights, + const ICLTensor *bn_mean, + const ICLTensor *bn_var, + ICLTensor *fused_weights, + ICLTensor *fused_bias, + const ICLTensor *input_bias, + const ICLTensor *bn_beta, + const ICLTensor *bn_gamma, + float epsilon, + FuseBatchNormalizationType fbn_type) { - _fuse_bn_kernel.configure(compile_context, input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, epsilon, fbn_type); + ARM_COMPUTE_LOG_PARAMS(input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, + epsilon, fbn_type); + _fuse_bn_kernel->configure(compile_context, input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, + bn_beta, bn_gamma, epsilon, fbn_type); } -Status CLFuseBatchNormalization::validate(const ITensorInfo *input_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var, - const ITensorInfo *fused_weights, const ITensorInfo *fused_bias, - const ITensorInfo *input_bias, const ITensorInfo *bn_beta, const ITensorInfo *bn_gamma, - float epsilon, FuseBatchNormalizationType fbn_type) +Status CLFuseBatchNormalization::validate(const ITensorInfo *input_weights, + const ITensorInfo *bn_mean, + const ITensorInfo *bn_var, + const ITensorInfo *fused_weights, + const ITensorInfo *fused_bias, + const ITensorInfo *input_bias, + const ITensorInfo *bn_beta, + const ITensorInfo *bn_gamma, + float epsilon, + FuseBatchNormalizationType fbn_type) { - return 
CLFuseBatchNormalizationKernel::validate(input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, epsilon, fbn_type); + return CLFuseBatchNormalizationKernel::validate(input_weights, bn_mean, bn_var, fused_weights, fused_bias, + input_bias, bn_beta, bn_gamma, epsilon, fbn_type); } void CLFuseBatchNormalization::run() { - CLScheduler::get().enqueue(_fuse_bn_kernel, true); + CLScheduler::get().enqueue(*_fuse_bn_kernel, true); } } // namespace arm_compute diff --git a/src/runtime/CL/functions/CLGEMM.cpp b/src/runtime/CL/functions/CLGEMM.cpp index 8466024c04..871a1d6e27 100644 --- a/src/runtime/CL/functions/CLGEMM.cpp +++ b/src/runtime/CL/functions/CLGEMM.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 ARM Limited. + * Copyright (c) 2017-2022 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,675 +23,130 @@ */ #include "arm_compute/runtime/CL/functions/CLGEMM.h" -#include "arm_compute/core/CL/ICLGEMMKernelConfiguration.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfiguration.h" -#include "arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfiguration.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/GPUTarget.h" +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/KernelDescriptors.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/helpers/float_ops.h" -#include "arm_compute/core/utils/misc/Cast.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "arm_compute/runtime/CL/CLScheduler.h" -#include "arm_compute/runtime/CL/gemm/CLGEMMKernelSelection.h" -#include "arm_compute/runtime/ITensorAllocator.h" + +#include "src/core/helpers/MemoryHelpers.h" +#include "src/gpu/cl/operators/ClGemm.h" namespace arm_compute { -using namespace arm_compute::misc::shape_calculator; -using namespace arm_compute::cl_gemm; -using namespace arm_compute::utils::cast; +using namespace arm_compute::experimental; +using OperatorType = opencl::ClGemm; -CLGEMM::CLGEMM(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager *weights_manager) - : _memory_group(std::move(memory_manager)), - _weights_manager(weights_manager), - _mm_kernel(), - _reshape_lhs_kernel(), - _reshape_rhs_kernel(), - _reshape_rhs_kernel_managed(), - _mm_reshaped_kernel(), - _mm_reshaped_only_rhs_kernel(), - _tmp_a(), - _tmp_b(), - _original_b(nullptr), - _reshape_b_only_on_first_run(false), - _is_prepared(false), - _gemm_kernel_type(CLGEMMKernelType::NATIVE_V1) +struct CLGEMM::Impl { -} + const ICLTensor *b{nullptr}; + std::unique_ptr<OperatorType> op{nullptr}; + MemoryGroup memory_group{}; + IWeightsManager *weights_manager{nullptr}; + ITensorPack run_pack{}; + ITensorPack prep_pack{}; + MemoryRequirements aux_mem_req{}; + WorkspaceData<CLTensor> workspace_tensors{}; + bool is_prepared{false}; +}; -CLGEMMKernelType CLGEMM::select_gemm_kernel(unsigned int m, unsigned int n, unsigned int k, DataType data_type, bool reshape_b_only_on_first_run) +CLGEMM::CLGEMM(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager *weights_manager) + : _impl(std::make_unique<Impl>()) { - std::unique_ptr<ICLGEMMKernelSelection> gemm_kernel = CLGEMMKernelSelectionFactory::create(CLScheduler::get().target()); - 
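For reference, the fuse-batch-norm kernel wrapped above folds a batch normalization (mean, variance, gamma, beta, epsilon) into the preceding layer's weights and bias with the usual per-channel algebra. A scalar sketch under the simplifying assumption of one weight per output channel; real weight tensors carry many elements per channel, all scaled by the same per-channel factor, and the array names here are illustrative.

#include <cmath>
#include <cstddef>

// Per output channel c:
//   w' = w * gamma / sqrt(var + eps)
//   b' = (b - mean) * gamma / sqrt(var + eps) + beta
void fuse_bn(float *w, float *b, const float *mean, const float *var,
             const float *gamma, const float *beta, float eps, std::size_t channels)
{
    for (std::size_t c = 0; c < channels; ++c)
    {
        const float scale = gamma[c] / std::sqrt(var[c] + eps);
        w[c] = w[c] * scale;                      // one weight per channel shown for brevity
        b[c] = (b[c] - mean[c]) * scale + beta[c];
    }
}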
ARM_COMPUTE_ERROR_ON_NULLPTR(gemm_kernel.get()); - - CLGEMMKernelSelectionParams params; - params.m = m; - params.n = n; - params.k = k; - params.is_rhs_constant = reshape_b_only_on_first_run; - params.data_type = data_type; - - return gemm_kernel->select_kernel(params); + _impl->memory_group = MemoryGroup(memory_manager); + _impl->weights_manager = weights_manager; } -void CLGEMM::configure_native_v1(const CLCompileContext &compile_context, const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, - const GEMMInfo &gemm_info) -{ - const unsigned int m = gemm_info.reinterpret_input_as_3d() ? (a->info()->dimension(1) * a->info()->dimension(2)) : a->info()->dimension(1); - const unsigned int n = b->info()->dimension(0); - const unsigned int k = a->info()->dimension(0); - const GPUTarget gpu_target = CLScheduler::get().target(); - - // Set the target for the kernels - _mm_kernel.set_target(gpu_target); - - GEMMReshapeInfo reshape_info(m, n, k, 1, 1, gemm_info.depth_output_gemm3d(), gemm_info.reinterpret_input_as_3d(), gemm_info.broadcast_bias()); - - // Configure and tune matrix multiply kernel - _mm_kernel.configure(compile_context, a, b, c, output, alpha, beta, false, reshape_info, gemm_info.fp_mixed_precision(), gemm_info.activation_info()); - - // Tune kernel statically - CLScheduler::get().tune_kernel_static(_mm_kernel); -} +CLGEMM::~CLGEMM() = default; -void CLGEMM::configure_reshaped_v1(const CLCompileContext &compile_context, const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, - const GEMMInfo &gemm_info) +void CLGEMM::configure(const ICLTensor *a, + const ICLTensor *b, + const ICLTensor *c, + ICLTensor *output, + float alpha, + float beta, + const GEMMInfo &gemm_info) { - bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); - const unsigned int m = reinterpret_input_as_3d ? 
(a->info()->dimension(1) * a->info()->dimension(2)) : a->info()->dimension(1); - const unsigned int n = b->info()->dimension(0); - const unsigned int k = a->info()->dimension(0); - const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); - const GPUTarget gpu_target = CLScheduler::get().target(); - int mult_transpose1xW_width = 1; - int mult_interleave4x4_height = 1; - - // Set the target for the kernels - _reshape_lhs_kernel.set_target(gpu_target); - _mm_kernel.set_target(gpu_target); - - if(get_arch_from_target(gpu_target) == GPUTarget::BIFROST) - { - mult_transpose1xW_width = 4; - mult_interleave4x4_height = 2; - } - - GEMMRHSMatrixInfo rhs_info; - rhs_info.n0 = 16 / b->info()->element_size(); - rhs_info.k0 = 1; - rhs_info.h0 = mult_transpose1xW_width; - rhs_info.interleave = false; - rhs_info.transpose = false; - - GEMMLHSMatrixInfo lhs_info; - lhs_info.m0 = 4; - lhs_info.k0 = 4; - lhs_info.v0 = mult_interleave4x4_height; - lhs_info.interleave = true; - lhs_info.transpose = true; - - GEMMReshapeInfo reshape_info(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, depth_output_gemm3d, false, gemm_info.broadcast_bias()); - - const bool use_mm_b = (!_weights_manager || !_weights_manager->are_weights_managed(b)); - - // Manage intermediate buffers - _memory_group.manage(&_tmp_a); - - if(!_reshape_b_only_on_first_run && use_mm_b) - { - _memory_group.manage(&_tmp_b); - } - - // Configure interleave kernel - _reshape_lhs_kernel.configure(compile_context, a, &_tmp_a, lhs_info, reinterpret_input_as_3d); - - // Configure transpose kernel - ICLTensor *reshaped_rhs = &_tmp_b; - if(_weights_manager && _weights_manager->are_weights_managed(b)) - { - _reshape_rhs_kernel_managed.configure(compile_context, b, rhs_info); - reshaped_rhs = utils::cast::polymorphic_downcast<ICLTensor *>(_weights_manager->acquire(b, &_reshape_rhs_kernel_managed)); - } - else - { - _reshape_rhs_kernel.configure(compile_context, b, &_tmp_b, rhs_info); - } - - // Configure and tune matrix multiply kernel - _mm_kernel.configure(compile_context, &_tmp_a, reshaped_rhs, c, output, alpha, beta, true, reshape_info, gemm_info.fp_mixed_precision(), gemm_info.activation_info()); - - CLScheduler::get().tune_kernel_static(_mm_kernel); - - // Allocate intermediate tensors - _tmp_a.allocator()->allocate(); - - if(!_reshape_b_only_on_first_run && use_mm_b) - { - _tmp_b.allocator()->allocate(); - } -} - -void CLGEMM::configure_reshaped_v2(const CLCompileContext &compile_context, const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, - const GEMMInfo &gemm_info) -{ - DataType data_type = a->info()->data_type(); - bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); - const unsigned int m = reinterpret_input_as_3d ? (a->info()->dimension(1) * a->info()->dimension(2)) : a->info()->dimension(1); - const unsigned int n = b->info()->dimension(0); - const unsigned int k = a->info()->dimension(0); - const unsigned int batch_size = reinterpret_input_as_3d ? 
a->info()->dimension(3) : a->info()->dimension(2); - const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); - const GPUTarget gpu_target = CLScheduler::get().target(); - bool broadcast_bias = gemm_info.broadcast_bias(); - - GEMMKernelInfo kernel_info; - kernel_info.m = m; - kernel_info.n = n; - kernel_info.k = k; - kernel_info.depth_output_gemm3d = depth_output_gemm3d; - kernel_info.reinterpret_input_as_3d = false; - kernel_info.broadcast_bias = broadcast_bias; - kernel_info.activation_info = gemm_info.activation_info(); - - // Set the target for the kernels - _reshape_lhs_kernel.set_target(gpu_target); - _mm_kernel.set_target(gpu_target); - - const bool use_mm_b = (!_weights_manager || !_weights_manager->are_weights_managed(b)); - - // Manage intermediate buffers - _memory_group.manage(&_tmp_a); - - if(!_reshape_b_only_on_first_run && use_mm_b) - { - _memory_group.manage(&_tmp_b); - } - - // _tmp_a and _tmp_b will be auto configured in _interleave_kernel and in _transpose_kernel - - GEMMLHSMatrixInfo lhs_info{}; - GEMMRHSMatrixInfo rhs_info{}; - - // Pick up the GEMM configuration - std::unique_ptr<ICLGEMMKernelConfiguration> gemm_config = CLGEMMReshapedKernelConfigurationFactory::create(gpu_target); - ARM_COMPUTE_ERROR_ON_NULLPTR(gemm_config.get()); - - // Configure lhs_info and rhs_info - std::tie(lhs_info, rhs_info) = gemm_config->configure(m, n, k, batch_size, data_type); - - _reshape_lhs_kernel.configure(compile_context, a, &_tmp_a, lhs_info, gemm_info.reinterpret_input_as_3d()); - - ICLTensor *reshaped_rhs = &_tmp_b; - if(_weights_manager && _weights_manager->are_weights_managed(b)) - { - _reshape_rhs_kernel_managed.configure(compile_context, b, rhs_info); - reshaped_rhs = utils::cast::polymorphic_downcast<ICLTensor *>(_weights_manager->acquire(b, &_reshape_rhs_kernel_managed)); - } - else - { - _reshape_rhs_kernel.configure(compile_context, b, &_tmp_b, rhs_info); - } - - // Configure and tune matrix multiply kernel - _mm_reshaped_kernel.configure(compile_context, &_tmp_a, reshaped_rhs, c, output, alpha, beta, lhs_info, rhs_info, kernel_info); - - // Allocate intermediate tensors - _tmp_a.allocator()->allocate(); - - if(!_reshape_b_only_on_first_run && use_mm_b) - { - _tmp_b.allocator()->allocate(); - } + configure(CLKernelLibrary::get().get_compile_context(), a, b, c, output, alpha, beta, gemm_info); } -void CLGEMM::configure_reshaped_only_rhs(const CLCompileContext &compile_context, const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, - const GEMMInfo &gemm_info) +void CLGEMM::configure(const CLCompileContext &compile_context, + const ICLTensor *a, + const ICLTensor *b, + const ICLTensor *c, + ICLTensor *output, + float alpha, + float beta, + const GEMMInfo &gemm_info) { - DataType data_type = a->info()->data_type(); - bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); - const unsigned int m = reinterpret_input_as_3d ? (a->info()->dimension(1) * a->info()->dimension(2)) : a->info()->dimension(1); - const unsigned int n = b->info()->dimension(0); - const unsigned int k = a->info()->dimension(0); - const unsigned int batch_size = reinterpret_input_as_3d ? 
a->info()->dimension(3) : a->info()->dimension(2); - const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); - const GPUTarget gpu_target = CLScheduler::get().target(); - bool broadcast_bias = gemm_info.broadcast_bias(); - - GEMMKernelInfo kernel_info; - kernel_info.m = m; - kernel_info.n = n; - kernel_info.k = k; - kernel_info.depth_output_gemm3d = depth_output_gemm3d; - kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d; - kernel_info.broadcast_bias = broadcast_bias; - kernel_info.activation_info = gemm_info.activation_info(); - - // Set the target for the kernels - _mm_kernel.set_target(gpu_target); - - const bool use_mm_b = (!_weights_manager || !_weights_manager->are_weights_managed(b)); - - // Manage intermediate buffers - if(!_reshape_b_only_on_first_run && use_mm_b) - { - _memory_group.manage(&_tmp_b); - } - - GEMMLHSMatrixInfo lhs_info{}; - GEMMRHSMatrixInfo rhs_info{}; + ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output); - // Pick up the GEMM configuration - std::unique_ptr<ICLGEMMKernelConfiguration> gemm_config = CLGEMMReshapedOnlyRHSKernelConfigurationFactory::create(gpu_target); - ARM_COMPUTE_ERROR_ON_NULLPTR(gemm_config.get()); + _impl->b = b; + _impl->op = std::make_unique<OperatorType>(); + _impl->is_prepared = gemm_info.retain_internal_weights(); - // Configure lhs_info and rhs_info - std::tie(lhs_info, rhs_info) = gemm_config->configure(m, n, k, batch_size, data_type); + _impl->op->configure(compile_context, a->info(), b->info(), c != nullptr ? c->info() : nullptr, output->info(), + alpha, beta, gemm_info); + _impl->aux_mem_req = _impl->op->workspace(); - ICLTensor *reshaped_rhs = &_tmp_b; - if(_weights_manager && _weights_manager->are_weights_managed(b)) + // Manage/allocate auxilairy tensors + if (_impl->is_prepared) { - _reshape_rhs_kernel_managed.configure(compile_context, b, rhs_info); - reshaped_rhs = utils::cast::polymorphic_downcast<ICLTensor *>(_weights_manager->acquire(b, &_reshape_rhs_kernel_managed)); + _impl->run_pack.add_const_tensor(ACL_SRC_0, a); + _impl->run_pack.add_tensor(ACL_DST, output); } else { - _reshape_rhs_kernel.configure(compile_context, b, &_tmp_b, rhs_info); - } + _impl->run_pack = {{ACL_SRC_0, a}, {ACL_SRC_2, c}, {ACL_DST, output}}; + _impl->prep_pack = {{ACL_SRC_1, _impl->b}}; - // Configure and tune matrix multiply kernel - _mm_reshaped_only_rhs_kernel.configure(compile_context, a, reshaped_rhs, c, output, alpha, beta, lhs_info, rhs_info, kernel_info); - - if(!_reshape_b_only_on_first_run && use_mm_b) - { - _tmp_b.allocator()->allocate(); + _impl->workspace_tensors = + manage_workspace<CLTensor>(_impl->op->workspace(), _impl->memory_group, _impl->run_pack, _impl->prep_pack); } } -Status CLGEMM::validate_native_v1(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info) +Status CLGEMM::validate(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info) { - ARM_COMPUTE_UNUSED(alpha); - ARM_COMPUTE_UNUSED(output); - - // Get the GPU target - const GPUTarget gpu_target = CLScheduler::get().target(); - bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); - const unsigned int m = reinterpret_input_as_3d ? 
(a->dimension(1) * a->dimension(2)) : a->dimension(1); - const unsigned int n = b->dimension(0); - const unsigned int k = a->dimension(0); - const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); - - const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d, gemm_info.broadcast_bias()); - - // Validate matrix multiply - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyKernel::validate(a, b, c, output, alpha, beta, - false, reshape_info, gpu_target, gemm_info.fp_mixed_precision(), gemm_info.activation_info())); - - return Status{}; + return OperatorType::validate(a, b, c, output, alpha, beta, gemm_info); } -Status CLGEMM::validate_reshaped_v1(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info) -{ - ARM_COMPUTE_UNUSED(alpha); - ARM_COMPUTE_UNUSED(output); - - TensorInfo tmp_a_info{}; - TensorInfo tmp_b_info{}; - - // Get the GPU target - const GPUTarget gpu_target = CLScheduler::get().target(); - const unsigned int m = gemm_info.reinterpret_input_as_3d() ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); - const unsigned int n = b->dimension(0); - const unsigned int k = a->dimension(0); - int mult_transpose1xW_width = 1; - int mult_interleave4x4_height = 1; - const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); - - if(get_arch_from_target(gpu_target) == GPUTarget::BIFROST) - { - mult_transpose1xW_width = 4; - mult_interleave4x4_height = 2; - } - - GEMMRHSMatrixInfo rhs_info; - rhs_info.n0 = 16 / b->element_size(); - rhs_info.k0 = 1; - rhs_info.h0 = mult_transpose1xW_width; - rhs_info.interleave = false; - rhs_info.transpose = false; - - GEMMLHSMatrixInfo lhs_info; - lhs_info.m0 = 4; - lhs_info.k0 = 4; - lhs_info.v0 = mult_interleave4x4_height; - lhs_info.interleave = true; - lhs_info.transpose = true; - - const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, depth_output_gemm3d, false, gemm_info.broadcast_bias()); - - // Validate interleave kernel - auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_lhs_reshaped_shape(*a, lhs_info, gemm_info.reinterpret_input_as_3d()))); - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeLHSMatrixKernel::validate(a, &tmp_a_info, lhs_info, gemm_info.reinterpret_input_as_3d())); - - // Validate transpose kernel - auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info))); - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeRHSMatrixKernel::validate(b, &tmp_b_info, rhs_info)); - - // Validate matrix multiply - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyKernel::validate(&tmp_a_info, &tmp_b_info, c, output, alpha, beta, - true, reshape_info, gpu_target, gemm_info.fp_mixed_precision(), gemm_info.activation_info())); - - return Status{}; -} - -Status CLGEMM::validate_reshaped(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info) -{ - ARM_COMPUTE_UNUSED(alpha); - ARM_COMPUTE_UNUSED(output); - - TensorInfo tmp_a_info{}; - TensorInfo tmp_b_info{}; - - // Get the GPU target - const GPUTarget gpu_target = CLScheduler::get().target(); - DataType data_type = a->data_type(); - bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); - const unsigned int m = reinterpret_input_as_3d ? 
(a->dimension(1) * a->dimension(2)) : a->dimension(1); - const unsigned int n = b->dimension(0); - const unsigned int k = a->dimension(0); - const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2); - const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); - const bool broadcast_bias = gemm_info.broadcast_bias(); - - GEMMKernelInfo kernel_info; - kernel_info.m = m; - kernel_info.n = n; - kernel_info.k = k; - kernel_info.depth_output_gemm3d = depth_output_gemm3d; - kernel_info.reinterpret_input_as_3d = false; - kernel_info.broadcast_bias = broadcast_bias; - kernel_info.activation_info = gemm_info.activation_info(); - - GEMMLHSMatrixInfo lhs_info; - GEMMRHSMatrixInfo rhs_info; - - // Pick up the GEMM configuration - std::unique_ptr<ICLGEMMKernelConfiguration> gemm_config = CLGEMMReshapedKernelConfigurationFactory::create(gpu_target); - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(gemm_config.get()); - - // Configure lhs_info and rhs_info - std::tie(lhs_info, rhs_info) = gemm_config->configure(m, n, k, batch_size, data_type); - - auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_lhs_reshaped_shape(*a, lhs_info, gemm_info.reinterpret_input_as_3d()))); - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeLHSMatrixKernel::validate(a, &tmp_a_info, lhs_info, gemm_info.reinterpret_input_as_3d())); - - auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info))); - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeRHSMatrixKernel::validate(b, &tmp_b_info, rhs_info)); - - // Validate matrix multiply - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyReshapedKernel::validate(&tmp_a_info, &tmp_b_info, c, output, alpha, beta, lhs_info, rhs_info, kernel_info)); - - return Status{}; -} - -Status CLGEMM::validate_reshaped_only_rhs(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info) -{ - ARM_COMPUTE_UNUSED(alpha); - ARM_COMPUTE_UNUSED(output); - - TensorInfo tmp_b_info{}; - - // Get the GPU target - const GPUTarget gpu_target = CLScheduler::get().target(); - const DataType data_type = a->data_type(); - bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); - const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); - const unsigned int n = b->dimension(0); - const unsigned int k = a->dimension(0); - const unsigned int batch_size = reinterpret_input_as_3d ? 
a->dimension(3) : a->dimension(2); - const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); - const bool broadcast_bias = gemm_info.broadcast_bias(); - - GEMMKernelInfo kernel_info; - kernel_info.m = m; - kernel_info.n = n; - kernel_info.k = k; - kernel_info.depth_output_gemm3d = depth_output_gemm3d; - kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d; - kernel_info.broadcast_bias = broadcast_bias; - kernel_info.activation_info = gemm_info.activation_info(); - - GEMMLHSMatrixInfo lhs_info; - GEMMRHSMatrixInfo rhs_info; - - // Pick up the GEMM configuration - std::unique_ptr<ICLGEMMKernelConfiguration> gemm_config = CLGEMMReshapedOnlyRHSKernelConfigurationFactory::create(gpu_target); - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(gemm_config.get()); - - // Configure lhs_info and rhs_info - std::tie(lhs_info, rhs_info) = gemm_config->configure(m, n, k, batch_size, data_type); - - auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info))); - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeRHSMatrixKernel::validate(b, &tmp_b_info, rhs_info)); - - // Validate matrix multiply - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyReshapedOnlyRHSKernel::validate(a, &tmp_b_info, c, output, alpha, beta, lhs_info, rhs_info, kernel_info)); - - return Status{}; -} - -void CLGEMM::configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info) -{ - configure(CLKernelLibrary::get().get_compile_context(), a, b, c, output, alpha, beta, gemm_info); -} - -void CLGEMM::configure(const CLCompileContext &compile_context, const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output); - - // Perform validation step - ARM_COMPUTE_ERROR_THROW_ON(validate(a->info(), b->info(), c != nullptr ? c->info() : nullptr, output->info(), alpha, beta, gemm_info)); - - // Check if we need to reshape the matrix B only on the first run - _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run(); - _is_prepared = gemm_info.retain_internal_weights(); - _original_b = b; - - // Get the GPU target - bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); - const unsigned int m = reinterpret_input_as_3d ? (a->info()->dimension(1) * a->info()->dimension(2)) : a->info()->dimension(1); - const unsigned int n = b->info()->dimension(0); - const unsigned int k = a->info()->dimension(0); - - // Select GEMMType - _gemm_kernel_type = select_gemm_kernel(m, n, k, a->info()->data_type(), _reshape_b_only_on_first_run); - - const bool fuse_add_c = (!(helpers::float_ops::is_zero(beta)) && c != nullptr); - - const ICLTensor *c_to_use = fuse_add_c ? 
c : nullptr; - - switch(_gemm_kernel_type) - { - case CLGEMMKernelType::NATIVE_V1: - { - configure_native_v1(compile_context, a, b, c_to_use, output, alpha, beta, gemm_info); - break; - } - case CLGEMMKernelType::RESHAPED_V1: - { - configure_reshaped_v1(compile_context, a, b, c_to_use, output, alpha, beta, gemm_info); - break; - } - case CLGEMMKernelType::RESHAPED: - { - configure_reshaped_v2(compile_context, a, b, c_to_use, output, alpha, beta, gemm_info); - break; - } - case CLGEMMKernelType::RESHAPED_ONLY_RHS: - { - configure_reshaped_only_rhs(compile_context, a, b, c_to_use, output, alpha, beta, gemm_info); - break; - } - default: - { - ARM_COMPUTE_ERROR("GEMMType not supported"); - } - } -} - -Status CLGEMM::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info) +void CLGEMM::run() { - // Get the GPU target - bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); - const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); - const unsigned int n = b->dimension(0); - const unsigned int k = a->dimension(0); - - // Select GEMMType - CLGEMMKernelType gemm_kernel_type = select_gemm_kernel(m, n, k, a->data_type(), gemm_info.reshape_b_only_on_first_run()); - - const bool fuse_add_c = (!(helpers::float_ops::is_zero(beta)) && c != nullptr); - - const ITensorInfo *c_to_use = fuse_add_c ? c : nullptr; + prepare(); - switch(gemm_kernel_type) - { - case CLGEMMKernelType::NATIVE_V1: - { - ARM_COMPUTE_RETURN_ON_ERROR(validate_native_v1(a, b, c_to_use, output, alpha, beta, gemm_info)); - break; - } - case CLGEMMKernelType::RESHAPED_V1: - { - ARM_COMPUTE_RETURN_ON_ERROR(validate_reshaped_v1(a, b, c_to_use, output, alpha, beta, gemm_info)); - break; - } - case CLGEMMKernelType::RESHAPED: - { - ARM_COMPUTE_RETURN_ON_ERROR(validate_reshaped(a, b, c_to_use, output, alpha, beta, gemm_info)); - break; - } - case CLGEMMKernelType::RESHAPED_ONLY_RHS: - { - ARM_COMPUTE_RETURN_ON_ERROR(validate_reshaped_only_rhs(a, b, c_to_use, output, alpha, beta, gemm_info)); - break; - } - default: - { - ARM_COMPUTE_RETURN_ERROR_MSG("GEMMType not supported"); - } - } + MemoryGroupResourceScope scope_mg(_impl->memory_group); - return Status{}; + _impl->op->run(_impl->run_pack); } -void CLGEMM::run() +void CLGEMM::prepare() { - prepare(); - - MemoryGroupResourceScope scope_mg(_memory_group); - - // Run matrix multiply kernel - switch(_gemm_kernel_type) + if (!_impl->is_prepared) { - case CLGEMMKernelType::NATIVE_V1: - { - CLScheduler::get().enqueue(_mm_kernel, true); - break; - } - case CLGEMMKernelType::RESHAPED_V1: - { - // Run interleave kernel - CLScheduler::get().enqueue(_reshape_lhs_kernel, false); + _impl->op->prepare(_impl->prep_pack); - if(!_reshape_b_only_on_first_run) - { - // Run transpose kernel - if(_weights_manager && _weights_manager->are_weights_managed(_original_b)) - { - _weights_manager->run(_original_b, &_reshape_rhs_kernel_managed); - } - else - { - CLScheduler::get().enqueue(_reshape_rhs_kernel, false); - } - } + auto has_reshape = + std::find_if(_impl->aux_mem_req.begin(), _impl->aux_mem_req.end(), + [](const MemoryInfo &m) -> bool { return m.lifetime == MemoryLifetime::Persistent; }); - CLScheduler::get().enqueue(_mm_kernel, true); - break; - } - case CLGEMMKernelType::RESHAPED: + if (has_reshape != std::end(_impl->aux_mem_req)) { - // Run interleave kernel - CLScheduler::get().enqueue(_reshape_lhs_kernel, false); - - 
if(!_reshape_b_only_on_first_run) - { - // Run transpose kernel - if(_weights_manager && _weights_manager->are_weights_managed(_original_b)) - { - _weights_manager->run(_original_b, &_reshape_rhs_kernel_managed); - } - else - { - CLScheduler::get().enqueue(_reshape_rhs_kernel, false); - } - } - - CLScheduler::get().enqueue(_mm_reshaped_kernel, true); - break; + _impl->b->mark_as_unused(); } - case CLGEMMKernelType::RESHAPED_ONLY_RHS: - { - if(!_reshape_b_only_on_first_run) - { - // Run transpose kernel - if(_weights_manager && _weights_manager->are_weights_managed(_original_b)) - { - _weights_manager->run(_original_b, &_reshape_rhs_kernel_managed); - } - else - { - CLScheduler::get().enqueue(_reshape_rhs_kernel, false); - } - } - - CLScheduler::get().enqueue(_mm_reshaped_only_rhs_kernel, true); - break; - } - default: - { - ARM_COMPUTE_ERROR("GEMMType not supported"); - } - } -} - -void CLGEMM::prepare() -{ - if(!_is_prepared) - { - if(_gemm_kernel_type != CLGEMMKernelType::NATIVE_V1 && _reshape_b_only_on_first_run) + else { - if(_weights_manager && _weights_manager->are_weights_managed(_original_b)) - { - _weights_manager->run(_original_b, &_reshape_rhs_kernel_managed); - } - else - { - // Run transpose kernel and mark original weights tensor as unused - _tmp_b.allocator()->allocate(); - CLScheduler::get().enqueue(_reshape_rhs_kernel, false); - _original_b->mark_as_unused(); - } + // Pack the B matrix to be used as the underlying GEMM performs no reshapes + _impl->run_pack.add_const_tensor(ACL_SRC_1, _impl->b); } - CLScheduler::get().queue().finish(); - _is_prepared = true; + _impl->is_prepared = true; } } } // namespace arm_compute diff --git a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp index 1c37993bda..aef7cddd7a 100644 --- a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 ARM Limited. + * Copyright (c) 2017-2021, 2023 Arm Limited. 
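[Editor's note — illustrative sketch, not part of the diff] The rewritten CLGEMM::prepare() above keys its behaviour off the operator's auxiliary memory requirements: if any workspace entry has a Persistent lifetime, the underlying operator keeps its own reshaped copy of B, so the caller-visible tensor can be marked unused; otherwise the original B must stay in the run pack. A minimal self-contained sketch of that decision, with simplified stand-in types (MemoryLifetime, MemoryInfo and operator_owns_reshaped_b below are mock-ups, not the library's definitions):

#include <algorithm>
#include <vector>

enum class MemoryLifetime { Temporary, Prepare, Persistent };
struct MemoryInfo
{
    MemoryLifetime lifetime;
};

// True when prepare() leaves a persistent reshaped copy of B behind, i.e. the
// caller's B tensor is no longer needed at run time and can be marked unused.
bool operator_owns_reshaped_b(const std::vector<MemoryInfo> &aux_mem_req)
{
    const auto it = std::find_if(aux_mem_req.begin(), aux_mem_req.end(),
                                 [](const MemoryInfo &m) { return m.lifetime == MemoryLifetime::Persistent; });
    return it != aux_mem_req.end();
}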
* * SPDX-License-Identifier: MIT * @@ -23,15 +23,19 @@ */ #include "arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/PixelValue.h" #include "arm_compute/core/Size2D.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/Cast.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/core/helpers/MemoryHelpers.h" +#include "src/gpu/cl/operators/ClGemmConv2d.h" +#include "support/Cast.h" + #include <cmath> #include <memory> #include <tuple> @@ -40,635 +44,117 @@ namespace arm_compute { using namespace arm_compute::misc::shape_calculator; using namespace arm_compute::utils::cast; +using namespace arm_compute::experimental; -CLConvolutionLayerReshapeWeights::CLConvolutionLayerReshapeWeights() - : _weights_reshape_kernel() -{ -} - -void CLConvolutionLayerReshapeWeights::configure(const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, unsigned int num_groups) -{ - configure(CLKernelLibrary::get().get_compile_context(), weights, biases, output, num_groups); -} - -void CLConvolutionLayerReshapeWeights::configure(const CLCompileContext &compile_context, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, unsigned int num_groups) -{ - // Perform validation step - ARM_COMPUTE_ERROR_ON_NULLPTR(weights, output); - ARM_COMPUTE_ERROR_THROW_ON(CLConvolutionLayerReshapeWeights::validate(weights->info(), - (biases != nullptr) ? biases->info() : nullptr, - output->info(), - num_groups)); - - const bool append_biases = (biases != nullptr) && !is_data_type_quantized_asymmetric(weights->info()->data_type()); - const ICLTensor *biases_to_use = (append_biases) ? 
biases : nullptr; - - _weights_reshape_kernel.configure(compile_context, weights, biases_to_use, output, num_groups); - - output->info()->set_quantization_info(weights->info()->quantization_info()); -} - -Status CLConvolutionLayerReshapeWeights::validate(const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, unsigned int num_groups) +struct CLGEMMConvolutionLayer::Impl { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(weights); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4); - - if(biases != nullptr) - { - const int idx_kernels = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::BATCHES); - ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized(weights->data_type())); - - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases); - ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(idx_kernels)); - ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1); - } - - if((output != nullptr) && (output->total_size() != 0)) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, output); - CLWeightsReshapeKernel::validate(weights, biases, output, num_groups); - } - - return Status{}; -} - -void CLConvolutionLayerReshapeWeights::run() + const ITensor *weights{nullptr}; + std::unique_ptr<opencl::ClGemmConv2d> op{nullptr}; + ITensorPack run_pack{}; + ITensorPack prep_pack{}; + MemoryGroup memory_group{}; + IWeightsManager *weights_manager{nullptr}; + MemoryRequirements aux_mem_req{}; + WorkspaceData<CLTensor> workspace_tensors{}; + bool is_prepared{false}; +}; + +CLGEMMConvolutionLayer::CLGEMMConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager, + IWeightsManager *weights_manager) + : _impl(std::make_unique<Impl>()) { - CLScheduler::get().enqueue(_weights_reshape_kernel); -} - -CLGEMMConvolutionLayer::CLGEMMConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager *weights_manager) - : _memory_group(memory_manager), _weights_manager(weights_manager), _reshape_weights(), _reshape_weights_managed(), _im2col_kernel(), _mm_gemm(memory_manager, weights_manager), - _mm_gemmlowp(memory_manager), _col2im_kernel(), _activationlayer_function(), _original_weights(nullptr), _im2col_output(), _weights_reshaped(), _gemm_output(), _skip_im2col(false), - _skip_col2im(false), _is_quantized(false), _fuse_activation(true), _is_prepared(false) -{ -} - -void CLGEMMConvolutionLayer::configure_mm(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, - const GEMMLowpOutputStageInfo &gemmlowp_output_stage, - int gemm_3d_depth, const ActivationLayerInfo &act_info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights); - ARM_COMPUTE_ERROR_THROW_ON(validate_mm(input->info(), weights->info(), biases != nullptr ? 
biases->info() : nullptr, output->info(), gemmlowp_output_stage, gemm_3d_depth, _skip_im2col, act_info)); - - const GEMMInfo &gemm_info = GEMMInfo(false, // is_a_reshaped - false, // is_b_reshaped - true, // reshape_b_only_on_first_run - gemm_3d_depth, // depth_output_gemm3d - _skip_im2col, // reinterpret_input_as_3d - false, // retain_internal_weights - gemmlowp_output_stage, // gemmlowp_output_stage - false, // fp_mixed_precision - true, // broadcast_bias - act_info); // activation_info - - if(_is_quantized) - { - // Since we need negative offsets for computing convolution, we need to change QuantizationInfo() - // Extract and negate input and weights offset - const QuantizationInfo input_quantization_info = input->info()->quantization_info(); - const QuantizationInfo weights_quantization_info = weights->info()->quantization_info(); - - input->info()->set_quantization_info(QuantizationInfo(input_quantization_info.uniform().scale, -input_quantization_info.uniform().offset)); - weights->info()->set_quantization_info(QuantizationInfo(weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset)); - - _mm_gemmlowp.configure(compile_context, input, weights, biases, output, gemm_info); - - // Revert back QuantizatioInfo as input and weights could be used in other convolution layers - input->info()->set_quantization_info(input_quantization_info); - weights->info()->set_quantization_info(weights_quantization_info); - } - else - { - // Configure matrix multiply function - _mm_gemm.configure(compile_context, input, weights, biases, output, 1.0f, 1.0f, gemm_info); - } -} - -Status CLGEMMConvolutionLayer::validate_mm(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, - const GEMMLowpOutputStageInfo &gemmlowp_output_stage, int gemm_3d_depth, bool skip_im2col, const ActivationLayerInfo &act_info) -{ - const bool is_quantized = is_data_type_quantized_asymmetric(input->data_type()); - - const GEMMInfo &gemm_info = GEMMInfo(false, // is_a_reshaped - false, // is_b_reshaped - true, // reshape_b_only_on_first_run - gemm_3d_depth, // depth_output_gemm3d - skip_im2col, // reinterpret_input_as_3d - false, // retain_internal_weights - gemmlowp_output_stage, // gemmlowp_output_stage - false, // fp_mixed_precision - true, // broadcast_bias - act_info); // activation_info - - if(is_quantized) - { - // Since we need negative offsets for computing convolution, we need to change QuantizationInfo() - // Extract and negate input and weights offset - const QuantizationInfo input_quantization_info = input->quantization_info(); - const QuantizationInfo weights_quantization_info = weights->quantization_info(); - - std::unique_ptr<ITensorInfo> input_qa = input->clone(); - std::unique_ptr<ITensorInfo> weights_qa = weights->clone(); - input_qa->set_quantization_info(QuantizationInfo(input_quantization_info.uniform().scale, -input_quantization_info.uniform().offset)); - weights_qa->set_quantization_info(QuantizationInfo(weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset)); - - // Perform validation step on GEMMLowp - return CLGEMMLowpMatrixMultiplyCore::validate(input_qa.get(), weights_qa.get(), biases, output, gemm_info); - } - else - { - // Perform validation step on Matrix multiply function - return CLGEMM::validate(input, weights, biases, output, 1.0f, 1.0f, gemm_info); - } + _impl->memory_group = MemoryGroup(memory_manager); + _impl->weights_manager = weights_manager; } -void 
CLGEMMConvolutionLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info, - const Size2D &dilation, const ActivationLayerInfo &act_info, unsigned int num_groups) +CLGEMMConvolutionLayer::~CLGEMMConvolutionLayer() = default; + +void CLGEMMConvolutionLayer::configure(const ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info, + unsigned int num_groups) { - configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, weights_info, dilation, act_info, num_groups); + configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, weights_info, + dilation, act_info, num_groups); } -void CLGEMMConvolutionLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, - const PadStrideInfo &conv_info, - const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, unsigned int num_groups) +void CLGEMMConvolutionLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info, + unsigned int num_groups) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); - - ARM_COMPUTE_ERROR_THROW_ON(CLGEMMConvolutionLayer::validate(input->info(), - weights->info(), - biases != nullptr ? 
biases->info() : nullptr, - output->info(), - conv_info, - weights_info, - dilation, - act_info, - num_groups)); - - const DataType data_type = input->info()->data_type(); - const DataLayout data_layout = input->info()->data_layout(); - const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); - const int idx_kernels = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES); - - const unsigned int kernel_width = weights->info()->dimension(idx_width); - const unsigned int kernel_height = weights->info()->dimension(idx_height); - const unsigned int num_kernels = weights->info()->dimension(idx_kernels); - - const UniformQuantizationInfo iq_info = input->info()->quantization_info().uniform(); - const UniformQuantizationInfo oq_info = output->info()->quantization_info().uniform(); - - _is_prepared = weights_info.retain_internal_weights(); - _original_weights = weights; - _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type()); - _skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv_info.stride().first == 1 && conv_info.stride().second == 1); - _skip_col2im = data_layout == DataLayout::NHWC; - - // Only for quantize there are few cases where we cannot fuse the activation function in GEMM - _fuse_activation = true; - - // Set the GPU target for im2col and col2im - _im2col_kernel.set_target(CLScheduler::get().target()); - _col2im_kernel.set_target(CLScheduler::get().target()); - - const ICLTensor *gemm_input_to_use = input; - ICLTensor *gemm_output_to_use = output; - - // Get parameters from conv_info - unsigned int stride_x = 0; - unsigned int stride_y = 0; - std::tie(stride_x, stride_y) = conv_info.stride(); - - // Get convolved dimensions - unsigned int conv_w = 0; - unsigned int conv_h = 0; - std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(idx_width), - input->info()->dimension(idx_height), - kernel_width, - kernel_height, - conv_info, - dilation); - - unsigned int mat_weights_cols = num_kernels / num_groups; - - const ICLTensor *biases_to_use = biases; - bool append_bias = false; - - ICLTensor *weights_to_use = &_weights_reshaped; - if(num_groups != 1 && biases != nullptr) - { - // num_groups != 1 can only be for NCHW - // Since it is missing an utility function to reshape the biases, we append the biases into the weights tensor - biases_to_use = nullptr; - append_bias = true; - - if(_weights_manager && _weights_manager->are_weights_managed(weights)) - { - _reshape_weights_managed.configure(compile_context, weights, biases, num_groups); - weights_to_use = utils::cast::polymorphic_downcast<ICLTensor *>(_weights_manager->acquire(weights, &_reshape_weights_managed)); - } - else - { - _reshape_weights.configure(compile_context, weights, biases, &_weights_reshaped, num_groups); - } - } - else - { - if(_weights_manager && _weights_manager->are_weights_managed(weights)) - { - _reshape_weights_managed.configure(compile_context, weights, nullptr, num_groups); - weights_to_use = utils::cast::polymorphic_downcast<ICLTensor *>(_weights_manager->acquire(weights, &_reshape_weights_managed)); - } - else - { - _reshape_weights.configure(compile_context, weights, nullptr, &_weights_reshaped, num_groups); - } - } - - // Create tensor to store im2col reshaped inputs - if(!_skip_im2col) - { - _memory_group.manage(&_im2col_output); - - // Configure and tune im2col. 
im2col output shape is auto-initialized - _im2col_kernel.configure(compile_context, input, &_im2col_output, Size2D(kernel_width, kernel_height), conv_info, append_bias, dilation, num_groups); - - // Set quantization info - _im2col_output.info()->set_quantization_info(input->info()->quantization_info()); - CLScheduler::get().tune_kernel_static(_im2col_kernel); - - // Update GEMM input - gemm_input_to_use = &_im2col_output; - } - - // Create GEMM output tensor - if(!_skip_col2im) - { - TensorShape shape_gemm; - - // If we cannot skip col2im it means we run im2col as well - shape_gemm = _im2col_output.info()->tensor_shape(); - shape_gemm.set(0, mat_weights_cols); - shape_gemm.set(1, conv_w * conv_h); - - // TODO(COMPMID-2078): input->clone() doesn't work with subtensors for grouped convolutions. - TensorInfo info_gemm(shape_gemm, 1, data_type); - info_gemm.set_quantization_info(output->info()->quantization_info()).set_data_layout(input->info()->data_layout()); - _gemm_output.allocator()->init(info_gemm); - _memory_group.manage(&_gemm_output); - - // Update GEMM output - gemm_output_to_use = &_gemm_output; - } - - GEMMLowpOutputStageInfo gemmlowp_output_stage; - gemmlowp_output_stage.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT; - gemmlowp_output_stage.gemmlowp_offset = 0; - - // Configure output stage for quantized case - if(_is_quantized) - { - const auto output_quant_info = (output->info()->total_size() == 0) ? iq_info : oq_info; - const bool is_quantized_per_channel = is_data_type_quantized_per_channel(weights->info()->data_type()); - const unsigned int num_filters = (is_quantized_per_channel) ? num_kernels : 1; - - gemmlowp_output_stage.is_quantized_per_channel = is_quantized_per_channel; - - gemmlowp_output_stage.gemmlowp_multipliers.resize(num_filters); - gemmlowp_output_stage.gemmlowp_shifts.resize(num_filters); - quantization::compute_quantized_multipliers_and_shifts(input->info(), - weights->info(), - output->info(), - idx_kernels, - gemmlowp_output_stage.gemmlowp_multipliers.data(), - gemmlowp_output_stage.gemmlowp_shifts.data()); - gemmlowp_output_stage.gemmlowp_multiplier = gemmlowp_output_stage.gemmlowp_multipliers[0]; - gemmlowp_output_stage.gemmlowp_shift = gemmlowp_output_stage.gemmlowp_shifts[0]; - - PixelValue min_val{}; - PixelValue max_val{}; - std::tie(min_val, max_val) = get_min_max(output->info()->data_type()); - - auto min_activation = min_val.get<int32_t>(); - auto max_activation = max_val.get<int32_t>(); - - const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { ActivationLayerInfo::ActivationFunction::RELU, - ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, - ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU - }; - - if(act_info.enabled()) - { - if(supported_acts.count(act_info.activation()) != 0) - { - std::tie(min_activation, max_activation) = get_quantized_activation_min_max(act_info, data_type, output_quant_info); - } - else - { - _fuse_activation = false; - } - } - - // Set the GEMMLowp output stage info - gemmlowp_output_stage.gemmlowp_offset = output_quant_info.offset; - gemmlowp_output_stage.gemmlowp_min_bound = min_activation; - gemmlowp_output_stage.gemmlowp_max_bound = max_activation; - } - - // Configure and tune GEMM - // In case of NHWC, we need to run GEMM3D (gemm_3d_depth != 0) in order to avoid reshaping the output matrix - const unsigned int gemm_3d_depth = (data_layout == DataLayout::NHWC) ? 
conv_h : 0; - - configure_mm(compile_context, gemm_input_to_use, weights_to_use, biases_to_use, gemm_output_to_use, gemmlowp_output_stage, gemm_3d_depth, act_info); - - if(!_skip_im2col) - { - _im2col_output.allocator()->allocate(); - } - - if(!_skip_col2im) - { - // Configure and tune Col2Im - _col2im_kernel.configure(compile_context, gemm_output_to_use, output, Size2D(conv_w, conv_h), num_groups); - CLScheduler::get().tune_kernel_static(_col2im_kernel); - } - - if(!_skip_col2im) - { - _gemm_output.allocator()->allocate(); - } - - ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(idx_width) != conv_w) || (output->info()->dimension(idx_height) != conv_h), - "Output shape does not match the expected one"); - - if(!_fuse_activation) - { - _activationlayer_function.configure(compile_context, output, nullptr, act_info); - } - - ARM_COMPUTE_UNUSED(weights_info); + _impl->weights = weights; + _impl->op = std::make_unique<opencl::ClGemmConv2d>(); + const Conv2dInfo conv2d_info = Conv2dInfo(conv_info, dilation, act_info, false, num_groups); + _impl->op->configure(compile_context, input->info(), weights->info(), + (biases != nullptr ? biases->info() : nullptr), output->info(), conv2d_info, weights_info); + + _impl->run_pack = {{TensorType::ACL_SRC_0, input}, + {TensorType::ACL_SRC_1, weights}, + {TensorType::ACL_SRC_2, biases}, + {TensorType::ACL_DST, output}}; + _impl->prep_pack = { + {TensorType::ACL_SRC_1, weights}, + {TensorType::ACL_SRC_2, biases}, + }; + _impl->aux_mem_req = _impl->op->workspace(); + _impl->workspace_tensors = + manage_workspace<CLTensor>(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack); } -Status CLGEMMConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, unsigned int num_groups) +Status CLGEMMConvolutionLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info, + unsigned int num_groups) { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights_info.are_reshaped(), "Weights already reshaped are not supported!"); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); - const bool is_quantized_per_channel = is_data_type_quantized_per_channel(weights->data_type()); - - if(is_quantized_per_channel) - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() != DataType::QASYMM8, "Input data type not compatible with Weights"); - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights); - } - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((num_groups != 1) && (input->data_layout() != DataLayout::NCHW), "Grouping (num_groups != 1) with NHWC data layout is not supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((num_groups != 1) && (input->data_type() == DataType::QASYMM8), "Grouping (num_groups != 1) is not supported with QASYMM8"); - ARM_COMPUTE_RETURN_ERROR_ON(((input->dimension(2) / weights->dimension(2)) != num_groups) && (input->data_layout() == DataLayout::NCHW)); - - const DataLayout data_layout = input->data_layout(); - const DataType 
data_type = input->data_type(); - const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); - const int idx_channel = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); - const int idx_kernels = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES); - - const unsigned int kernel_width = weights->dimension(idx_width); - const unsigned int kernel_height = weights->dimension(idx_height); - const unsigned int num_kernels = weights->dimension(idx_kernels); - - TensorInfo im2col_reshaped_info{}; - TensorInfo info_gemm{}; - TensorInfo weights_reshaped_info{}; - const ITensorInfo *gemm_input_to_use = input; - const ITensorInfo *gemm_output_to_use = output; - const ITensorInfo *weights_to_use = weights; - const bool is_quantized = is_data_type_quantized_asymmetric(data_type); - const bool skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv_info.stride().first == 1 && conv_info.stride().second == 1); - const bool skip_col2im = data_layout == DataLayout::NHWC; - bool fuse_activation = true; - - ARM_COMPUTE_RETURN_ERROR_ON((weights->dimension(idx_channel) * num_groups) != input->dimension(idx_channel)); - ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4); - - // Validate biases - if(biases != nullptr) - { - if(is_quantized) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32); - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases); - } - ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(idx_kernels)); - ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1); - } - - if(act_info.enabled()) - { - ARM_COMPUTE_ERROR_ON(act_info.b() > act_info.a()); - } - - // Get convolved dimensions - unsigned int conv_w = 0; - unsigned int conv_h = 0; - - std::tie(conv_w, conv_h) = scaled_dimensions(input->dimension(idx_width), - input->dimension(idx_height), - kernel_width, - kernel_height, - conv_info, - dilation); - - unsigned int mat_weights_cols = num_kernels / num_groups; - - const ITensorInfo *biases_to_use = biases; - bool append_bias = false; - - if(num_groups != 1 && biases != nullptr) - { - // num_groups != 1 can only be for NCHW - // Since it is missing an utility function to reshape the biases, we append the biases into the weights tensor - biases_to_use = nullptr; - append_bias = true; - - ARM_COMPUTE_RETURN_ON_ERROR(CLConvolutionLayerReshapeWeights::validate(weights, biases, nullptr, num_groups)); - weights_reshaped_info = TensorInfo(compute_weights_reshaped_shape(*weights, true, num_groups), 1, data_type); - } - else - { - ARM_COMPUTE_RETURN_ON_ERROR(CLConvolutionLayerReshapeWeights::validate(weights, nullptr, nullptr, num_groups)); - weights_reshaped_info = TensorInfo(compute_weights_reshaped_shape(*weights, false, num_groups), 1, data_type); - } - - weights_to_use = &weights_reshaped_info; - - if(!skip_im2col) - { - const Size2D kernel_dims(kernel_width, kernel_height); - - // Output tensor auto initialization if not yet initialized - TensorShape expected_output_shape = compute_im2col_conv_shape(input, kernel_dims, conv_info, append_bias, dilation, num_groups == 1, num_groups); - - auto_init_if_empty(im2col_reshaped_info, input->clone()->set_tensor_shape(expected_output_shape)); - - ARM_COMPUTE_RETURN_ON_ERROR(CLIm2ColKernel::validate(input, &im2col_reshaped_info, kernel_dims, 
conv_info, append_bias, dilation, num_groups)); - gemm_input_to_use = &im2col_reshaped_info; - } - - // Create GEMM output tensor - if(!skip_col2im) - { - TensorShape shape_gemm; - - shape_gemm = gemm_input_to_use->tensor_shape(); - shape_gemm.set(0, mat_weights_cols); - shape_gemm.set(1, conv_w * conv_h); - - info_gemm = TensorInfo(shape_gemm, 1, data_type); - info_gemm.set_quantization_info(output->quantization_info()).set_data_layout(input->data_layout()); - gemm_output_to_use = &info_gemm; - } - - GEMMLowpOutputStageInfo gemmlowp_output_stage; - gemmlowp_output_stage.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT; - gemmlowp_output_stage.gemmlowp_offset = 0; - gemmlowp_output_stage.is_quantized_per_channel = is_quantized_per_channel; - - if(is_quantized) - { - const UniformQuantizationInfo iq_info = input->quantization_info().uniform(); - const UniformQuantizationInfo oq_info = output->quantization_info().uniform(); - const auto output_quant_info = (output->total_size() == 0) ? iq_info : oq_info; - const unsigned int num_filters = (is_quantized_per_channel) ? num_kernels : 1; - - gemmlowp_output_stage.gemmlowp_multipliers.resize(num_filters); - gemmlowp_output_stage.gemmlowp_shifts.resize(num_filters); - quantization::compute_quantized_multipliers_and_shifts(input, - weights, - output, - idx_kernels, - gemmlowp_output_stage.gemmlowp_multipliers.data(), - gemmlowp_output_stage.gemmlowp_shifts.data()); - gemmlowp_output_stage.gemmlowp_multiplier = gemmlowp_output_stage.gemmlowp_multipliers[0]; - gemmlowp_output_stage.gemmlowp_shift = gemmlowp_output_stage.gemmlowp_shifts[0]; - - int min_activation = 0; - int max_activation = 0; - - const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { ActivationLayerInfo::ActivationFunction::RELU, - ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, - ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU - }; - - if(act_info.enabled()) - { - if(supported_acts.count(act_info.activation()) != 0) - { - std::tie(min_activation, max_activation) = get_quantized_activation_min_max(act_info, data_type, output_quant_info); - } - else - { - fuse_activation = false; - } - } - - // Set the GEMMLowp output stage info - gemmlowp_output_stage.gemmlowp_offset = output_quant_info.offset; - gemmlowp_output_stage.gemmlowp_min_bound = min_activation; - gemmlowp_output_stage.gemmlowp_max_bound = max_activation; - } - - // In case of NHWC, we need to run GEMM3D (gemm_3d_depth != 0) in order to avoid reshaping the output matrix - const unsigned int gemm_3d_depth = (data_layout == DataLayout::NHWC) ? 
conv_h : 0; - - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemm_input_to_use, weights_to_use, biases_to_use, gemm_output_to_use, gemmlowp_output_stage, gemm_3d_depth, skip_im2col, act_info)); - - // Validate Col2Im - if(!skip_col2im) - { - ARM_COMPUTE_RETURN_ON_ERROR(CLCol2ImKernel::validate(gemm_output_to_use, output, Size2D(conv_w, conv_h), num_groups)); - } - - //Validate Activation Layer - if(!fuse_activation) - { - ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(output, nullptr, act_info)); - } - - return Status{}; + const Conv2dInfo conv2d_info = Conv2dInfo(conv_info, dilation, act_info, false, num_groups); + return opencl::ClGemmConv2d::validate(input, weights, biases, output, conv2d_info, weights_info); } void CLGEMMConvolutionLayer::run() { prepare(); - - MemoryGroupResourceScope scope_mg(_memory_group); - - // Run im2col - if(!_skip_im2col) - { - CLScheduler::get().enqueue(_im2col_kernel); - } - - // Runs CLGEMM or CLGEMMLowpMatrixMultiplyCore functions - if(_is_quantized) - { - // Run gemmlowp - _mm_gemmlowp.run(); - } - else - { - // Run gemm - _mm_gemm.run(); - } - - // Reshape output matrix - if(!_skip_col2im) - { - CLScheduler::get().enqueue(_col2im_kernel, false); - } - - //Run Activation Layer if we cannot fuse in GEMM - if(!_fuse_activation) - { - _activationlayer_function.run(); - } + MemoryGroupResourceScope scope_mg(_impl->memory_group); + _impl->op->run(_impl->run_pack); } void CLGEMMConvolutionLayer::prepare() { - if(!_is_prepared) + if (!_impl->is_prepared) { - ARM_COMPUTE_ERROR_ON(!_original_weights->is_used()); - if(_weights_manager && _weights_manager->are_weights_managed(_original_weights)) + _impl->op->prepare(_impl->prep_pack); + auto has_reshape = + std::find_if(_impl->aux_mem_req.begin(), _impl->aux_mem_req.end(), + [](const MemoryInfo &m) -> bool { return m.lifetime == MemoryLifetime::Persistent; }); + + if (has_reshape != std::end(_impl->aux_mem_req)) { - _weights_manager->run(_original_weights, &_reshape_weights_managed); + _impl->weights->mark_as_unused(); } else { - // Run weights reshaping and mark original weights tensor as unused - _weights_reshaped.allocator()->allocate(); - _reshape_weights.run(); - _original_weights->mark_as_unused(); - } - - // Prepare GEMM - _is_quantized ? _mm_gemmlowp.prepare() : _mm_gemm.prepare(); - if(!_weights_reshaped.is_used()) - { - _weights_reshaped.allocator()->free(); + // Pack the B matrix to be used as the underlying GEMM performs no reshapes + _impl->run_pack.add_const_tensor(ACL_SRC_1, _impl->weights); } - - CLScheduler::get().queue().finish(); - _is_prepared = true; + release_temporaries(_impl->aux_mem_req, _impl->workspace_tensors); + _impl->is_prepared = true; } } } // namespace arm_compute diff --git a/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp b/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp index 1dcb341fe7..7d40cf1829 100644 --- a/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 ARM Limited. + * Copyright (c) 2019-2021, 2023 Arm Limited. 
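[Editor's note — illustrative sketch, not part of the diff] The new CLGEMMConvolutionLayer::configure() above builds two tensor packs: run_pack carries everything run() touches on every invocation (src, weights, biases, dst), while prep_pack carries only the constant tensors that the one-shot prepare() consumes. A reduced sketch of that split, with a simplified pack type (TensorSlot, Pack and build_packs are stand-ins, not the library's ITensorPack API):

#include <map>

enum TensorSlot { SRC_0, SRC_1, SRC_2, DST };
struct Tensor; // opaque user tensor
using Pack = std::map<TensorSlot, const Tensor *>;

// src/dst change on every invocation, so they live only in the run pack;
// weights and biases are constant, so they are also handed to prepare().
void build_packs(const Tensor *src, const Tensor *weights, const Tensor *biases,
                 const Tensor *dst, Pack &run_pack, Pack &prep_pack)
{
    run_pack  = Pack{{SRC_0, src}, {SRC_1, weights}, {SRC_2, biases}, {DST, dst}};
    prep_pack = Pack{{SRC_1, weights}, {SRC_2, biases}};
}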
* * SPDX-License-Identifier: MIT * @@ -24,24 +24,29 @@ #include "arm_compute/runtime/CL/functions/CLGEMMDeconvolutionLayer.h" #include "arm_compute/core/Helpers.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/function_info/ActivationLayerInfo.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include <memory> +#include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" + #include <tuple> namespace arm_compute { namespace { -std::pair<Coordinates, Coordinates> compute_start_end_slice_coordinates(const ITensorInfo &output_info, const PadStrideInfo &deconv_info, bool is_nchw) +std::pair<Coordinates, Coordinates> +compute_start_end_slice_coordinates(const ITensorInfo &output_info, const PadStrideInfo &deconv_info, bool is_nchw) { Coordinates start; Coordinates end; - if(is_nchw) + if (is_nchw) { start.set(0, deconv_info.pad_left()); start.set(1, deconv_info.pad_top()); @@ -59,13 +64,16 @@ std::pair<Coordinates, Coordinates> compute_start_end_slice_coordinates(const IT end.set(2, output_info.dimension(2) - deconv_info.pad_bottom()); } - return { start, end }; + return {start, end}; } -Status construct_gemmlowp_output_stage(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, GEMMLowpOutputStageInfo &output_stage_info) +Status construct_gemmlowp_output_stage(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *output, + GEMMLowpOutputStageInfo &output_stage_info) { const auto data_type = input->data_type(); - if(is_data_type_quantized_asymmetric(data_type)) + if (is_data_type_quantized_asymmetric(data_type)) { const UniformQuantizationInfo iq_info = input->quantization_info().uniform(); const UniformQuantizationInfo wq_info = weights->quantization_info().uniform(); @@ -74,7 +82,8 @@ Status construct_gemmlowp_output_stage(const ITensorInfo *input, const ITensorIn float multiplier = iq_info.scale * wq_info.scale / oq_info.scale; int output_multiplier(0); int output_shift(0); - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift)); + ARM_COMPUTE_RETURN_ON_ERROR( + quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift)); output_stage_info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT; output_stage_info.gemmlowp_multiplier = output_multiplier; @@ -99,7 +108,7 @@ CLGEMMDeconvolutionLayer::CLGEMMDeconvolutionLayer(std::shared_ptr<IMemoryManage _permute_weights_to_nhwc(), _reshape_weights(), _transpose_weights(), - _deconv_reshape(), + _deconv_reshape(std::make_unique<CLDeconvolutionReshapeOutputKernel>()), _slice_gemm(), _gemmlowp_final(), _reshaped_weights(), @@ -116,15 +125,23 @@ CLGEMMDeconvolutionLayer::CLGEMMDeconvolutionLayer(std::shared_ptr<IMemoryManage { } -Status CLGEMMDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *output, const PadStrideInfo &deconv_info) +CLGEMMDeconvolutionLayer::~CLGEMMDeconvolutionLayer() = default; + +Status CLGEMMDeconvolutionLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *bias, + const ITensorInfo *output, + const PadStrideInfo &deconv_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); - 
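[Editor's note — illustrative sketch, not part of the diff] construct_gemmlowp_output_stage() above folds the three quantization scales into a single requantization factor, multiplier = iq.scale * wq.scale / oq.scale, and asks calculate_quantized_multiplier() to split it into an integer multiplier and a power-of-two shift so the output stage can run without floating point. A self-contained sketch of that decomposition (decompose_multiplier and the frexp-based approach are illustrative, not the library's exact implementation):

#include <cmath>
#include <cstdint>

// Decompose 0 < real_multiplier < 1 as quantized_multiplier * 2^(-31 - right_shift).
void decompose_multiplier(double real_multiplier, std::int32_t *quantized_multiplier, int *right_shift)
{
    int exponent = 0;
    const double significand = std::frexp(real_multiplier, &exponent); // significand in [0.5, 1)
    std::int64_t q = std::llround(significand * (1LL << 31));
    if (q == (1LL << 31)) // significand rounded up to 1.0: renormalise
    {
        q /= 2;
        ++exponent;
    }
    *quantized_multiplier = static_cast<std::int32_t>(q);
    *right_shift = -exponent; // >= 0 for multipliers below 1
}

Fixed-point output stages of this kind then apply the integer multiplier to each accumulator with a saturating rounding doubling high-multiply, followed by a rounding right shift by right_shift.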
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16, DataType::QASYMM8, DataType::QASYMM8_SIGNED); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16, DataType::QASYMM8, + DataType::QASYMM8_SIGNED); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights); DataLayout data_layout = input->data_layout(); - const bool padded_input = deconv_info.pad_bottom() > 0 || deconv_info.pad_left() > 0 || deconv_info.pad_right() > 0 || deconv_info.pad_top() > 0; + const bool padded_input = deconv_info.pad_bottom() > 0 || deconv_info.pad_left() > 0 || + deconv_info.pad_right() > 0 || deconv_info.pad_top() > 0; const bool is_nchw = input->data_layout() == DataLayout::NCHW; const bool is_quantized = is_data_type_quantized_asymmetric(input->data_type()); @@ -138,21 +155,31 @@ Status CLGEMMDeconvolutionLayer::validate(const ITensorInfo *input, const ITenso TensorShape nhwc_weights_shape = weights->tensor_shape(); TensorShape nhwc_input_shape = input->tensor_shape(); - if(is_nchw) + if (is_nchw) { permute(nhwc_weights_shape, PermutationVector(2, 0, 1)); permute(nhwc_input_shape, PermutationVector(2, 0, 1)); - TensorInfo nhwc_input_info = input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(nhwc_input_shape).set_data_layout(DataLayout::NCHW); + TensorInfo nhwc_input_info = input->clone() + ->set_is_resizable(true) + .reset_padding() + .set_tensor_shape(nhwc_input_shape) + .set_data_layout(DataLayout::NCHW); - TensorInfo nhwc_weights_info = weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(nhwc_weights_shape).set_data_layout(DataLayout::NCHW); + TensorInfo nhwc_weights_info = weights->clone() + ->set_is_resizable(true) + .reset_padding() + .set_tensor_shape(nhwc_weights_shape) + .set_data_layout(DataLayout::NCHW); CLPermute::validate(weights, &nhwc_weights_info, PermutationVector(2, 0, 1)); CLPermute::validate(input, &nhwc_input_info, PermutationVector(2, 0, 1)); } - const TensorShape reshaped_shape = TensorShape(nhwc_weights_shape[0], nhwc_weights_shape[1] * nhwc_weights_shape[2] * nhwc_weights_shape[3]); - const TensorInfo reshaped_info = weights->clone()->set_tensor_shape(reshaped_shape).set_data_layout(DataLayout::NCHW).set_is_resizable(true); + const TensorShape reshaped_shape = + TensorShape(nhwc_weights_shape[0], nhwc_weights_shape[1] * nhwc_weights_shape[2] * nhwc_weights_shape[3]); + const TensorInfo reshaped_info = + weights->clone()->set_tensor_shape(reshaped_shape).set_data_layout(DataLayout::NCHW).set_is_resizable(true); ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayer::validate(weights, &reshaped_info)); TensorShape transposed_shape(reshaped_shape[1], reshaped_shape[0]); @@ -160,76 +187,95 @@ Status CLGEMMDeconvolutionLayer::validate(const ITensorInfo *input, const ITenso ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(&reshaped_info, &reshaped_t_info)); TensorShape gemm_output_shape(weights->dimension(idx_w) * weights->dimension(idx_h) * weights->dimension(idx_b), - input->dimension(idx_w), - input->dimension(idx_h), - input->dimension(idx_b)); + input->dimension(idx_w), input->dimension(idx_h), input->dimension(idx_b)); TensorInfo gemm_output_info = reshaped_t_info.clone()->set_tensor_shape(gemm_output_shape).set_is_resizable(true); GEMMInfo gemm_info(false, false, true, input->dimension(idx_h), true); GEMMLowpOutputStageInfo output_stage_info; - if(is_quantized) + if (is_quantized) { - 
ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyCore::validate(&input->clone()->set_tensor_shape(nhwc_input_shape), &reshaped_t_info, nullptr, &gemm_output_info.set_data_type(DataType::S32), - gemm_info)); + ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyCore::validate( + &input->clone()->set_tensor_shape(nhwc_input_shape), &reshaped_t_info, nullptr, + &gemm_output_info.set_data_type(DataType::S32), gemm_info)); ARM_COMPUTE_RETURN_ON_ERROR(construct_gemmlowp_output_stage(input, weights, output, output_stage_info)); } else { - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(&input->clone()->set_tensor_shape(nhwc_input_shape).set_is_resizable(true), &reshaped_t_info, nullptr, &gemm_output_info, 1.0f, 0.0f, gemm_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLGEMM::validate(&input->clone()->set_tensor_shape(nhwc_input_shape).set_is_resizable(true), + &reshaped_t_info, nullptr, &gemm_output_info, 1.0f, 0.0f, gemm_info)); } const PadStrideInfo stride_info(deconv_info.stride().first, deconv_info.stride().second); - auto out_dims = deconvolution_output_dimensions(input->dimension(idx_w), input->dimension(idx_h), weights->dimension(idx_w), weights->dimension(idx_h), stride_info); - const TensorShape deconv_shape = misc::shape_calculator::compute_deconvolution_output_shape(out_dims, *input, *weights); - TensorInfo col2im_output_info = gemm_output_info.clone()->set_tensor_shape(deconv_shape).set_is_resizable(true); + auto out_dims = deconvolution_output_dimensions(input->dimension(idx_w), input->dimension(idx_h), + weights->dimension(idx_w), weights->dimension(idx_h), stride_info); + const TensorShape deconv_shape = + misc::shape_calculator::compute_deconvolution_output_shape(out_dims, *input, *weights); + TensorInfo col2im_output_info = gemm_output_info.clone()->set_tensor_shape(deconv_shape).set_is_resizable(true); - if(padded_input && is_quantized) + if (padded_input && is_quantized) { const auto start_end = compute_start_end_slice_coordinates(col2im_output_info, deconv_info, is_nchw); - ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionReshapeOutputKernel::validate(&gemm_output_info, bias, &col2im_output_info, input, weights, deconv_info)); - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOutputStage::validate(&col2im_output_info, nullptr, &col2im_output_info.clone()->set_is_resizable(true).set_data_type(input->data_type()), output_stage_info)); - ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&col2im_output_info.clone()->set_is_resizable(true).set_data_type(input->data_type()), output, start_end.first, start_end.second)); + ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionReshapeOutputKernel::validate( + &gemm_output_info, bias, &col2im_output_info, input, weights, deconv_info)); + ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOutputStage::validate( + &col2im_output_info, nullptr, + &col2im_output_info.clone()->set_is_resizable(true).set_data_type(input->data_type()), output_stage_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLSlice::validate(&col2im_output_info.clone()->set_is_resizable(true).set_data_type(input->data_type()), + output, start_end.first, start_end.second)); } - else if(padded_input) + else if (padded_input) { const auto start_end = compute_start_end_slice_coordinates(col2im_output_info, deconv_info, is_nchw); - ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionReshapeOutputKernel::validate(&gemm_output_info, bias, &col2im_output_info, input, weights, deconv_info)); + ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionReshapeOutputKernel::validate( + &gemm_output_info, bias, &col2im_output_info, input, weights, deconv_info)); 
ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&col2im_output_info, output, start_end.first, start_end.second)); } - else if(is_quantized) + else if (is_quantized) { - ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionReshapeOutputKernel::validate(&gemm_output_info, bias, &col2im_output_info, input, weights, deconv_info)); - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOutputStage::validate(&col2im_output_info, nullptr, output, output_stage_info)); + ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionReshapeOutputKernel::validate( + &gemm_output_info, bias, &col2im_output_info, input, weights, deconv_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLGEMMLowpOutputStage::validate(&col2im_output_info, nullptr, output, output_stage_info)); } else { - ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionReshapeOutputKernel::validate(&gemm_output_info, bias, output, input, weights, deconv_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLDeconvolutionReshapeOutputKernel::validate(&gemm_output_info, bias, output, input, weights, deconv_info)); } return Status{}; } -void CLGEMMDeconvolutionLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &deconv_info) +void CLGEMMDeconvolutionLayer::configure(const ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *bias, + ICLTensor *output, + const PadStrideInfo &deconv_info) { configure(CLKernelLibrary::get().get_compile_context(), input, weights, bias, output, deconv_info); } -void CLGEMMDeconvolutionLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, - const PadStrideInfo &deconv_info) +void CLGEMMDeconvolutionLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *bias, + ICLTensor *output, + const PadStrideInfo &deconv_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); - ARM_COMPUTE_ERROR_THROW_ON(CLGEMMDeconvolutionLayer::validate(input->info(), - weights->info(), - bias != nullptr ? bias->info() : nullptr, - output->info(), - deconv_info)); + ARM_COMPUTE_ERROR_THROW_ON(CLGEMMDeconvolutionLayer::validate( + input->info(), weights->info(), bias != nullptr ? bias->info() : nullptr, output->info(), deconv_info)); + ARM_COMPUTE_LOG_PARAMS(input, weights, bias, output, deconv_info); _original_weights = weights; - _padded_input = deconv_info.pad_bottom() > 0 || deconv_info.pad_left() > 0 || deconv_info.pad_right() > 0 || deconv_info.pad_top() > 0; - _is_nchw = input->info()->data_layout() == DataLayout::NCHW; - _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type()); + _padded_input = deconv_info.pad_bottom() > 0 || deconv_info.pad_left() > 0 || deconv_info.pad_right() > 0 || + deconv_info.pad_top() > 0; + _is_nchw = input->info()->data_layout() == DataLayout::NCHW; + _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type()); const ICLTensor *input_to_use = input; const ICLTensor *weights_to_use = weights; @@ -238,7 +284,7 @@ void CLGEMMDeconvolutionLayer::configure(const CLCompileContext &compile_context // do an outer product in NCHW and then an accumulation through a reduction. This would have two // drawbacks: first, the outer product is less efficient than a full GEMM. Second, the reduction // might be slower than GEMM. 
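[Editor's note — illustrative sketch, not part of the diff] The padded branches validated above all end in a CLSlice: the GEMM-based path first materialises the full deconvolution output and then crops away the border implied by the padding in deconv_info, using the coordinates from compute_start_end_slice_coordinates(). A reduced NCHW sketch of that cropping arithmetic (Box and slice_region are illustrative helpers, not library types):

#include <cstddef>

struct Box
{
    std::size_t x0, y0, x1, y1;
};

// NCHW variant of the slice computed above: keep the window that remains after
// dropping the synthetic border a padded deconv_info produces.
Box slice_region(std::size_t out_w, std::size_t out_h,
                 std::size_t pad_l, std::size_t pad_t, std::size_t pad_r, std::size_t pad_b)
{
    return Box{pad_l, pad_t, out_w - pad_r, out_h - pad_b};
}

For example, slice_region(16, 16, 1, 1, 1, 1) yields {1, 1, 15, 15}, i.e. the inner 14x14 window of a 16x16 intermediate.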
- if(_is_nchw) + if (_is_nchw) { _memory_group.manage(&_permuted_input); _permute_input_to_nhwc.configure(compile_context, input, &_permuted_input, PermutationVector(2U, 0U, 1U)); @@ -250,10 +296,11 @@ void CLGEMMDeconvolutionLayer::configure(const CLCompileContext &compile_context } // Reshape the input weights. The weights will be reshaped only once during the call to prepare() - _reshaped_weights.allocator()->init(TensorInfo(TensorShape(weights_to_use->info()->dimension(0), - weights_to_use->info()->dimension(1) * weights_to_use->info()->dimension(2) * weights_to_use->info()->dimension(3)), - 1, - input->info()->data_type(), weights->info()->quantization_info())); + _reshaped_weights.allocator()->init( + TensorInfo(TensorShape(weights_to_use->info()->dimension(0), weights_to_use->info()->dimension(1) * + weights_to_use->info()->dimension(2) * + weights_to_use->info()->dimension(3)), + 1, input->info()->data_type(), weights->info()->quantization_info())); _reshape_weights.configure(compile_context, weights_to_use, &_reshaped_weights); _transpose_weights.configure(compile_context, &_reshaped_weights, &_reshaped_weights_t); @@ -262,15 +309,17 @@ void CLGEMMDeconvolutionLayer::configure(const CLCompileContext &compile_context GEMMInfo gemm_info(false, false, true, input->info()->dimension(idx_h), true); // Configure output stage for asymmetric quantized types - if(_is_quantized) + if (_is_quantized) { // gemmlowp adds the offsets (instead of subtracting them). Thus, we need to negate the original // and restore them back to make it work properly. QuantizationInfo iq_info = input->info()->quantization_info(); QuantizationInfo wq_info = weights->info()->quantization_info(); - input_to_use->info()->set_quantization_info(QuantizationInfo(iq_info.uniform().scale, -iq_info.uniform().offset)); - _reshaped_weights_t.info()->set_quantization_info(QuantizationInfo(wq_info.uniform().scale, -wq_info.uniform().offset)); + input_to_use->info()->set_quantization_info( + QuantizationInfo(iq_info.uniform().scale, -iq_info.uniform().offset)); + _reshaped_weights_t.info()->set_quantization_info( + QuantizationInfo(wq_info.uniform().scale, -wq_info.uniform().offset)); _mm_gemmlowp.configure(compile_context, input_to_use, &_reshaped_weights_t, nullptr, &_gemm_output, gemm_info); @@ -279,10 +328,11 @@ void CLGEMMDeconvolutionLayer::configure(const CLCompileContext &compile_context } else { - _mm_gemm.configure(compile_context, input_to_use, &_reshaped_weights_t, nullptr, &_gemm_output, 1.f, 0.0f, gemm_info); + _mm_gemm.configure(compile_context, input_to_use, &_reshaped_weights_t, nullptr, &_gemm_output, 1.f, 0.0f, + gemm_info); } - if(_is_nchw) + if (_is_nchw) { _permuted_input.allocator()->allocate(); } @@ -291,7 +341,7 @@ void CLGEMMDeconvolutionLayer::configure(const CLCompileContext &compile_context ICLTensor *slice_output = nullptr; ICLTensor *output_stage_output = nullptr; - if(_padded_input && _is_quantized) + if (_padded_input && _is_quantized) { _memory_group.manage(&_slice_gemm_input); _memory_group.manage(&_gemmlowp_final); @@ -299,13 +349,13 @@ void CLGEMMDeconvolutionLayer::configure(const CLCompileContext &compile_context output_stage_output = &_slice_gemm_input; slice_output = output; } - else if(_padded_input) + else if (_padded_input) { _memory_group.manage(&_slice_gemm_input); deconv_reshape_output = &_slice_gemm_input; slice_output = output; } - else if(_is_quantized) + else if (_is_quantized) { _memory_group.manage(&_gemmlowp_final); deconv_reshape_output = &_gemmlowp_final; @@ -317,21 
+367,24 @@ void CLGEMMDeconvolutionLayer::configure(const CLCompileContext &compile_context } // Configure a Col2Im call to reshape the output of GEMM - _deconv_reshape.configure(compile_context, &_gemm_output, bias, deconv_reshape_output, input->info(), weights->info(), deconv_info); + _deconv_reshape->configure(compile_context, &_gemm_output, bias, deconv_reshape_output, input->info(), + weights->info(), deconv_info); _gemm_output.allocator()->allocate(); - if(_is_quantized) + if (_is_quantized) { GEMMLowpOutputStageInfo output_stage_info; construct_gemmlowp_output_stage(input->info(), weights->info(), output->info(), output_stage_info); - _gemmlowp_output_stage.configure(compile_context, &_gemmlowp_final, nullptr, output_stage_output, output_stage_info); + _gemmlowp_output_stage.configure(compile_context, &_gemmlowp_final, nullptr, output_stage_output, + output_stage_info); _gemmlowp_final.allocator()->allocate(); } // If the input was padded, the output needs to be sliced. - if(_padded_input) + if (_padded_input) { - const auto start_end = compute_start_end_slice_coordinates(*deconv_reshape_output->info(), deconv_info, _is_nchw); + const auto start_end = + compute_start_end_slice_coordinates(*deconv_reshape_output->info(), deconv_info, _is_nchw); _slice_gemm.configure(compile_context, &_slice_gemm_input, slice_output, start_end.first, start_end.second); _slice_gemm_input.allocator()->allocate(); } @@ -343,12 +396,12 @@ void CLGEMMDeconvolutionLayer::run() MemoryGroupResourceScope scope_mg(_memory_group); - if(_is_nchw) + if (_is_nchw) { _permute_input_to_nhwc.run(); } - if(_is_quantized) + if (_is_quantized) { _mm_gemmlowp.run(); } @@ -357,14 +410,14 @@ void CLGEMMDeconvolutionLayer::run() _mm_gemm.run(); } - CLScheduler::get().enqueue(_deconv_reshape, false); + CLScheduler::get().enqueue(*_deconv_reshape, false); - if(_is_quantized) + if (_is_quantized) { _gemmlowp_output_stage.run(); } - if(_padded_input) + if (_padded_input) { _slice_gemm.run(); } @@ -372,11 +425,11 @@ void CLGEMMDeconvolutionLayer::run() void CLGEMMDeconvolutionLayer::prepare() { - if(!_is_prepared) + if (!_is_prepared) { ARM_COMPUTE_ERROR_ON(!_original_weights->is_used()); - if(_is_nchw) + if (_is_nchw) { _permuted_weights.allocator()->allocate(); _permute_weights_to_nhwc.run(); @@ -385,7 +438,7 @@ void CLGEMMDeconvolutionLayer::prepare() _reshaped_weights.allocator()->allocate(); _reshape_weights.run(); - if(_is_nchw) + if (_is_nchw) { _permuted_weights.allocator()->free(); } @@ -394,7 +447,7 @@ void CLGEMMDeconvolutionLayer::prepare() _transpose_weights.run(); // Prepare gemm - if(!_is_quantized) + if (!_is_quantized) { _mm_gemm.prepare(); } @@ -404,7 +457,7 @@ void CLGEMMDeconvolutionLayer::prepare() } // Free resources - if(!_reshaped_weights_t.is_used()) + if (!_reshaped_weights_t.is_used()) { _reshaped_weights_t.allocator()->free(); } diff --git a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp index 84da4a7e98..8bad198658 100644 --- a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp +++ b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 ARM Limited. + * Copyright (c) 2017-2021 Arm Limited. 
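[Editor's note — illustrative sketch, not part of the diff] In the quantized deconvolution path configured above, the offsets in the input and reshaped-weights quantization info are negated before the GEMMLowp core is configured because, as the code comments note, gemmlowp adds the offsets instead of subtracting them; the originals are restored afterwards so the tensors can be reused elsewhere. A reduced sketch of the sign flip (QInfo and negated are stand-ins, not the library's QuantizationInfo):

struct QInfo
{
    float scale;
    int   offset;
};

// The gemmlowp kernels add the stored offset during the offset-contribution
// step, while the convolution math needs it subtracted, so the quantization
// info handed to the GEMMLowp core carries a negated offset.
inline QInfo negated(const QInfo &q)
{
    return QInfo{q.scale, -q.offset};
}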
* * SPDX-License-Identifier: MIT * @@ -23,537 +23,111 @@ */ #include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfiguration.h" -#include "arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfiguration.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/KernelDescriptors.h" +#include "arm_compute/core/Log.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "arm_compute/runtime/CL/gemm/CLGEMMKernelSelection.h" +#include "arm_compute/runtime/IMemoryManager.h" + +#include "src/core/helpers/MemoryHelpers.h" +#include "src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.h" namespace arm_compute { -using namespace arm_compute::misc::shape_calculator; -using namespace arm_compute::cl_gemm; +using namespace arm_compute::experimental; +using OperatorType = opencl::ClGemmLowpMatrixMultiplyCore; -namespace -{ -inline bool is_gemm_reshaped(unsigned int m, unsigned int n, unsigned int k, DataType data_type, bool reshape_b_only_on_first_run) +struct CLGEMMLowpMatrixMultiplyCore::Impl { - std::unique_ptr<ICLGEMMKernelSelection> gemm_kernel = CLGEMMKernelSelectionFactory::create(CLScheduler::get().target()); - ARM_COMPUTE_ERROR_ON_NULLPTR(gemm_kernel.get()); - - CLGEMMKernelSelectionParams params; - params.m = m; - params.n = n; - params.k = k; - params.is_rhs_constant = reshape_b_only_on_first_run; - params.data_type = data_type; - - switch(gemm_kernel->select_kernel(params)) - { - case CLGEMMKernelType::NATIVE: - return false; - case CLGEMMKernelType::RESHAPED_ONLY_RHS: - return true; - default: - ARM_COMPUTE_ERROR("Not supported gemmlowp kernel!"); - } -} -} // namespace + const ICLTensor *b{nullptr}; + std::unique_ptr<OperatorType> op{nullptr}; + MemoryGroup memory_group{}; + ITensorPack run_pack{}; + MemoryRequirements aux_mem_req{}; + WorkspaceData<CLTensor> workspace_tensors{}; + bool is_prepared{false}; +}; CLGEMMLowpMatrixMultiplyCore::CLGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), - _weights_to_qasymm8(), - _mm_native_kernel(), - _mm_reshaped_only_rhs_kernel(), - _mtx_b_reshape_kernel(), - _mtx_a_reduction_kernel(), - _mtx_b_reduction_kernel(), - _offset_contribution_kernel(), - _offset_contribution_output_stage_kernel(), - _qasymm8_weights(), - _vector_sum_col(), - _vector_sum_row(), - _tmp_b(), - _mm_result_s32(), - _gemm_output_stage_multipliers(), - _gemm_output_stage_shifts(), - _matrix_a(nullptr), - _original_b(nullptr), - _output(nullptr), - _a_offset(0), - _b_offset(0), - _is_gemm_reshaped(true), - _reshape_b_only_on_first_run(false), - _is_prepared(false), - _run_output_stage(false), - _convert_to_qasymm8(false), - _run_offset_contribution(false) + : _impl(std::make_unique<Impl>()) { + _impl->memory_group = MemoryGroup(memory_manager); } -void CLGEMMLowpMatrixMultiplyCore::configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, const GEMMInfo &gemm_info) +CLGEMMLowpMatrixMultiplyCore::~CLGEMMLowpMatrixMultiplyCore() = default; + +void 
CLGEMMLowpMatrixMultiplyCore::configure( + const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, const GEMMInfo &gemm_info) { configure(CLKernelLibrary::get().get_compile_context(), a, b, c, output, gemm_info); } -void CLGEMMLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_context, const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, const GEMMInfo &gemm_info) +void CLGEMMLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_context, + const ICLTensor *a, + const ICLTensor *b, + const ICLTensor *c, + ICLTensor *output, + const GEMMInfo &gemm_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output); - ARM_COMPUTE_ERROR_THROW_ON(CLGEMMLowpMatrixMultiplyCore::validate(a->info(), b->info(), c != nullptr ? c->info() : nullptr, output->info(), gemm_info)); - - _is_prepared = false; - _original_b = b; - _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run(); - _a_offset = a->info()->quantization_info().uniform().offset; - _matrix_a = a; - _output = output; - - _convert_to_qasymm8 = is_data_type_quantized_per_channel(b->info()->data_type()) && is_data_type_quantized_symmetric(b->info()->data_type()) - && is_data_type_quantized_asymmetric(a->info()->data_type()); - _b_offset = _convert_to_qasymm8 ? -128 : b->info()->quantization_info().uniform().offset; - - // Get the GPU target - const GPUTarget gpu_target = CLScheduler::get().target(); - - // Set the target for the kernels - _mm_native_kernel.set_target(gpu_target); - _mm_reshaped_only_rhs_kernel.set_target(gpu_target); - - GEMMRHSMatrixInfo rhs_info; - GEMMLHSMatrixInfo lhs_info; - // Arguments used by GEMMReshapeInfo - // If we pass the matrix A and matrix B reshaped to CLGEMMMatrixMultiplyKernel, we need to pass m, n, k, mult_transpose1xW_width and mult_interleave4x4_height to CLGEMMReshapeInfo - // in order to know how the matrices have been reshaped - bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); - const unsigned int m = reinterpret_input_as_3d ? (a->info()->dimension(1) * a->info()->dimension(2)) : a->info()->dimension(1); - const unsigned int n = b->info()->dimension(0); - const unsigned int k = a->info()->dimension(0); - const unsigned int batch_size = reinterpret_input_as_3d ? a->info()->dimension(3) : a->info()->dimension(2); - const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); + _impl->b = b; + _impl->op = std::make_unique<OperatorType>(); + _impl->is_prepared = gemm_info.retain_internal_weights(); - // Check if we need to reshape the matrix A and matrix B - _is_gemm_reshaped = is_gemm_reshaped(m, n, k, a->info()->data_type(), _reshape_b_only_on_first_run); + _impl->op->configure(compile_context, a->info(), b->info(), c != nullptr ? c->info() : nullptr, output->info(), + gemm_info); + _impl->aux_mem_req = _impl->op->workspace(); - if(_convert_to_qasymm8) + // Manage/allocate auxiliary tensors + if (_impl->is_prepared) { - // Set data type for converted weights - TensorInfo weights_info(*b->info()); - weights_info.set_data_type(DataType::QASYMM8); - _qasymm8_weights.allocator()->init(weights_info); - _weights_to_qasymm8.configure(compile_context, b, &_qasymm8_weights, ConvertPolicy::WRAP, 0); - } - - const ICLTensor *matrix_b = _convert_to_qasymm8 ? 
&_qasymm8_weights : b; - if(_is_gemm_reshaped) - { - matrix_b = &_tmp_b; - - if(!_reshape_b_only_on_first_run) - { - _memory_group.manage(&_tmp_b); - } - - // Pick up the GEMM configuration - // Datatype is DataType::QASYMM8 or DataType::QASYMM8_SIGNED doesn't matter, since it only affect the shape configuration - std::tie(lhs_info, rhs_info) = CLGEMMReshapedOnlyRHSKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8); - - // Configure reshape RHS kernel - _mtx_b_reshape_kernel.configure(compile_context, _convert_to_qasymm8 ? &_qasymm8_weights : b, &_tmp_b, rhs_info); - } - - // Using default reduction info - const GEMMLowpReductionKernelInfo reduction_info {}; - - // Initialize matrix B reduction kernel only if _a_offset is not equal to 0 - if(_a_offset != 0) - { - TensorInfo info_vector_sum_col(compute_reductionA_shape(*b->info()), 1, DataType::S32); - _vector_sum_col.allocator()->init(info_vector_sum_col); - if(!_reshape_b_only_on_first_run) - { - _memory_group.manage(&_vector_sum_col); - } - - // Configure Matrix B reduction kernel - _mtx_b_reduction_kernel.configure(compile_context, _convert_to_qasymm8 ? &_qasymm8_weights : b, &_vector_sum_col, reduction_info); - } - - // Initialize Matrix A reduction kernel only if _b_offset is not equal to 0 - if(_b_offset != 0) - { - TensorInfo info_vector_sum_row(compute_reductionB_shape(*a->info()), 1, DataType::S32); - _vector_sum_row.allocator()->init(info_vector_sum_row); - _memory_group.manage(&_vector_sum_row); - - // Configure matrix A reduction kernel - _mtx_a_reduction_kernel.configure(compile_context, a, &_vector_sum_row, reduction_info); - } - - GEMMKernelInfo gemm_kernel_info; - gemm_kernel_info.m = m; - gemm_kernel_info.n = n; - gemm_kernel_info.k = k; - gemm_kernel_info.depth_output_gemm3d = depth_output_gemm3d; - gemm_kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d; - gemm_kernel_info.lhs_info = lhs_info; - gemm_kernel_info.rhs_info = rhs_info; - gemm_kernel_info.a_offset = _a_offset; - gemm_kernel_info.b_offset = _b_offset; - // If GEMMLowpOutputStage != NONE, fuse the offset contribution with the output stage - if(gemm_info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE) - { - // Configure offset contribution kernel - const size_t num_filters = (gemm_info.gemmlowp_output_stage().is_quantized_per_channel) ? gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.size() : 1; - - _gemm_output_stage_multipliers.allocator()->init(TensorInfo(TensorShape(num_filters), 1, DataType::S32)); - _gemm_output_stage_shifts.allocator()->init(TensorInfo(TensorShape(num_filters), 1, DataType::S32)); - - GEMMLowpOutputStageInfo gemmlowp_output_stage = gemm_info.gemmlowp_output_stage(); - gemmlowp_output_stage.output_data_type = _matrix_a->info()->data_type(); - - gemm_kernel_info.output_stage = gemmlowp_output_stage; - - if(_is_gemm_reshaped && gemmlowp_output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) - { - // Configure and tune matrix multiply kernel with fused output stage - _mm_reshaped_only_rhs_kernel.configure(compile_context, _matrix_a, matrix_b, output, gemm_kernel_info, _a_offset == 0 ? nullptr : &_vector_sum_col, - _b_offset == 0 ? 
nullptr : &_vector_sum_row, c, &_gemm_output_stage_multipliers, &_gemm_output_stage_shifts); - } - else - { - _run_output_stage = true; - - _memory_group.manage(&_mm_result_s32); - - if(_is_gemm_reshaped) - { - _mm_reshaped_only_rhs_kernel.configure(compile_context, _matrix_a, matrix_b, &_mm_result_s32, gemm_kernel_info); - } - else - { - // Pick up the GEMM configuration - std::tie(lhs_info, rhs_info) = CLGEMMNativeKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8); - - // Configure matrix multiply kernel - _mm_native_kernel.configure(compile_context, _matrix_a, matrix_b, &_mm_result_s32, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d)); - - _offset_contribution_output_stage_kernel.configure(compile_context, &_mm_result_s32, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, c, output, - a->info()->dimension(0), - _a_offset, _b_offset, gemmlowp_output_stage, &_gemm_output_stage_multipliers, &_gemm_output_stage_shifts); - _mm_result_s32.allocator()->allocate(); - } - } - - _gemm_output_stage_multipliers.allocator()->allocate(); - _gemm_output_stage_shifts.allocator()->allocate(); - // Compute GEMM output multipliers and shifts for output stage - _gemm_output_stage_multipliers.map(); - _gemm_output_stage_shifts.map(); - std::memcpy(_gemm_output_stage_multipliers.ptr_to_element(Coordinates(0)), gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.data(), num_filters * sizeof(int32_t)); - std::memcpy(_gemm_output_stage_shifts.ptr_to_element(Coordinates(0)), gemm_info.gemmlowp_output_stage().gemmlowp_shifts.data(), num_filters * sizeof(int32_t)); - _gemm_output_stage_multipliers.unmap(); - _gemm_output_stage_shifts.unmap(); + _impl->run_pack.add_const_tensor(ACL_SRC_0, a); + _impl->run_pack.add_tensor(ACL_DST, output); } else { - _run_offset_contribution = true; - if(_is_gemm_reshaped) - { - // Configure and tune matrix multiply kernel - _mm_reshaped_only_rhs_kernel.configure(compile_context, _matrix_a, matrix_b, output, gemm_kernel_info); - } - else - { - // Pick up the GEMM configuration - std::tie(lhs_info, rhs_info) = CLGEMMNativeKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8); - - // Configure matrix multiply kernel - _mm_native_kernel.configure(compile_context, _matrix_a, matrix_b, output, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d)); - } - - // Configure offset contribution kernel - _offset_contribution_kernel.configure(compile_context, output, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? 
nullptr : &_vector_sum_row, c, a->info()->dimension(0), _a_offset, - _b_offset); - } - - // Allocate tensors - if(_is_gemm_reshaped) - { - if(!_reshape_b_only_on_first_run) - { - _tmp_b.allocator()->allocate(); - } - } - - if(_a_offset != 0 && !_reshape_b_only_on_first_run) - { - _vector_sum_col.allocator()->allocate(); - } - - if(_b_offset != 0) - { - _vector_sum_row.allocator()->allocate(); + _impl->run_pack = {{ACL_SRC_0, a}, {ACL_SRC_1, _impl->b}, {ACL_SRC_2, c}, {ACL_DST, output}}; + _impl->workspace_tensors = + manage_workspace<CLTensor>(_impl->op->workspace(), _impl->memory_group, _impl->run_pack, _impl->run_pack); } } -Status CLGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, const GEMMInfo &gemm_info) +Status CLGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + const GEMMInfo &gemm_info) { - ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL); - ARM_COMPUTE_RETURN_ERROR_ON(a->data_type() == DataType::QASYMM8 && b->data_type() == DataType::QASYMM8_SIGNED); - ARM_COMPUTE_RETURN_ERROR_ON(a->data_type() == DataType::QASYMM8_SIGNED && b->data_type() == DataType::QASYMM8); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported"); - - int32_t a_offset = a->quantization_info().uniform().offset; - int32_t b_offset = b->quantization_info().uniform().offset; - - const ITensorInfo *matrix_a_info = a; - - TensorInfo tmp_b_info{}; - GEMMRHSMatrixInfo rhs_info; - GEMMLHSMatrixInfo lhs_info; - - // Get the GPU target - const GPUTarget gpu_target = CLScheduler::get().target(); - - bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); - const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); - const unsigned int n = b->dimension(0); - const unsigned int k = a->dimension(0); - const unsigned int batch_size = reinterpret_input_as_3d ? 
a->dimension(3) : a->dimension(2); - const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); - - bool reshape_matrix_b = is_gemm_reshaped(m, n, k, a->data_type(), gemm_info.reshape_b_only_on_first_run()); - - const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d); - - bool convert_to_qasymm8 = is_data_type_quantized_per_channel(b->data_type()) && is_data_type_quantized_symmetric(b->data_type()) - && is_data_type_quantized_asymmetric(a->data_type()); - TensorInfo weights_info(*b); - if(convert_to_qasymm8) - { - b_offset = -128; - weights_info.set_data_type(DataType::QASYMM8); - ARM_COMPUTE_RETURN_ON_ERROR(CLDepthConvertLayerKernel::validate(b, &weights_info, ConvertPolicy::WRAP, 0)); - } - const ITensorInfo *matrix_b_info = &weights_info; - if(reshape_matrix_b) - { - matrix_b_info = &tmp_b_info; - - // Pick up the GEMM configuration - std::tie(lhs_info, rhs_info) = CLGEMMReshapedOnlyRHSKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8); - - // Validate reshape RHS kernel - auto_init_if_empty(tmp_b_info, weights_info.clone()->set_tensor_shape(compute_rhs_reshaped_shape(weights_info, rhs_info))); - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeRHSMatrixKernel::validate(&weights_info, &tmp_b_info, rhs_info)); - } - - TensorInfo info_vector_sum_col{}; - TensorInfo info_vector_sum_row{}; - - const GEMMLowpReductionKernelInfo reduction_info; - // Validate matrix B reduction kernel only if _a_offset is not equal to 0 - if(a_offset != 0) - { - info_vector_sum_col = TensorInfo(compute_reductionA_shape(weights_info), 1, DataType::S32); - - // Configure Matrix B reduction kernel - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixBReductionKernel::validate(&weights_info, &info_vector_sum_col, reduction_info)); - } - - // Validate Matrix A reduction kernel only if _b_offset is not equal to 0 - if(b_offset != 0) - { - info_vector_sum_row = TensorInfo(compute_reductionB_shape(*a), 1, DataType::S32); - - // Configure matrix A reduction kernel - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(a, &info_vector_sum_row, reduction_info)); - } - - GEMMKernelInfo gemm_kernel_info; - gemm_kernel_info.m = m; - gemm_kernel_info.n = n; - gemm_kernel_info.k = k; - gemm_kernel_info.depth_output_gemm3d = depth_output_gemm3d; - gemm_kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d; - gemm_kernel_info.lhs_info = lhs_info; - gemm_kernel_info.rhs_info = rhs_info; - gemm_kernel_info.a_offset = a_offset; - gemm_kernel_info.b_offset = b_offset; - if(gemm_info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE) - { - const size_t num_filters = (gemm_info.gemmlowp_output_stage().is_quantized_per_channel) ? gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.size() : 1; - - const TensorInfo gemm_output_stage_multipliers_shifts_info(TensorInfo(TensorShape(num_filters), 1, DataType::S32)); - - GEMMLowpOutputStageInfo gemmlowp_output_stage = gemm_info.gemmlowp_output_stage(); - gemmlowp_output_stage.output_data_type = a->data_type(); - - gemm_kernel_info.output_stage = gemmlowp_output_stage; - if(reshape_matrix_b && gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) - { - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::validate(matrix_a_info, matrix_b_info, output, gemm_kernel_info, - a_offset == 0 ? nullptr : &info_vector_sum_col, - b_offset == 0 ? 
nullptr : &info_vector_sum_row, - c, - &gemm_output_stage_multipliers_shifts_info, - &gemm_output_stage_multipliers_shifts_info)); - } - else - { - TensorInfo mm_result_s32_info{}; - - if(reshape_matrix_b) - { - // Output tensor auto inizialitation if not yet initialized - auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(compute_mm_shape(*matrix_a_info, *matrix_b_info, reshape_info)).set_data_type(DataType::S32)); - - // Validate matrix multiply - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info, gemm_kernel_info)); - } - else - { - // Output tensor auto inizialitation if not yet initialized - auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(compute_mm_shape(*matrix_a_info, *matrix_b_info, false, reshape_info)).set_data_type(DataType::S32)); - - // Pick up the GEMM configuration - std::tie(lhs_info, rhs_info) = CLGEMMNativeKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8); - - // Validate matrix multiply - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyNativeKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info, lhs_info, rhs_info, reshape_info)); - } - - // Validate offset contribution kernel - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOffsetContributionOutputStageKernel::validate(&mm_result_s32_info, - a_offset == 0 ? nullptr : &info_vector_sum_col, - b_offset == 0 ? nullptr : &info_vector_sum_row, - c, - output, - a_offset, b_offset, - gemmlowp_output_stage, - &gemm_output_stage_multipliers_shifts_info, - &gemm_output_stage_multipliers_shifts_info)); - } - } - else - { - if(reshape_matrix_b) - { - // Validate matrix multiply - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::validate(matrix_a_info, matrix_b_info, output, gemm_kernel_info)); - } - else - { - // Pick up the GEMM configuration - std::tie(lhs_info, rhs_info) = CLGEMMNativeKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8); - - // Validate matrix multiply - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyNativeKernel::validate(matrix_a_info, matrix_b_info, output, lhs_info, rhs_info, reshape_info)); - } - - if(output->total_size() != 0) - { - // Validate offset contribution kernel - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOffsetContributionKernel::validate(output, - a_offset == 0 ? nullptr : &info_vector_sum_col, - b_offset == 0 ? 
nullptr : &info_vector_sum_row, - c, - a_offset, b_offset)); - } - } - - return Status{}; + return OperatorType::validate(a, b, c, output, gemm_info); } void CLGEMMLowpMatrixMultiplyCore::run() { prepare(); - MemoryGroupResourceScope scope_mg(_memory_group); - - if(_is_gemm_reshaped) - { - if(!_reshape_b_only_on_first_run) - { - // Run reshape matrix B - CLScheduler::get().enqueue(_mtx_b_reshape_kernel, false); - } - } - - // Run matrix B reduction kernel only if _a_offset is not equal to 0 - if(_a_offset != 0 && !_reshape_b_only_on_first_run) - { - CLScheduler::get().enqueue(_mtx_b_reduction_kernel, false); - } - - // Run matrix A reduction kernel only if _b_offset is not equal to 0 - if(_b_offset != 0) - { - CLScheduler::get().enqueue(_mtx_a_reduction_kernel, false); - } + MemoryGroupResourceScope scope_mg(_impl->memory_group); - // Run matrix multiply - if(_is_gemm_reshaped) - { - CLScheduler::get().enqueue(_mm_reshaped_only_rhs_kernel, false); - } - else - { - CLScheduler::get().enqueue(_mm_native_kernel, false); - } - if(_run_output_stage) - { - // Run offset contribution/output stage kernel - CLScheduler::get().enqueue(_offset_contribution_output_stage_kernel, true); - } - if(_run_offset_contribution) - { - // Run offset contribution kernel - CLScheduler::get().enqueue(_offset_contribution_kernel, true); - } + _impl->op->run(_impl->run_pack); } void CLGEMMLowpMatrixMultiplyCore::prepare() { - if(!_is_prepared) + if (!_impl->is_prepared) { - if(_convert_to_qasymm8) - { - _qasymm8_weights.allocator()->allocate(); - CLScheduler::get().enqueue(_weights_to_qasymm8, false); - } - - if(_is_gemm_reshaped && _reshape_b_only_on_first_run) - { - ARM_COMPUTE_ERROR_ON(!_original_b->is_used()); - - // Run reshape kernel and mark original weights tensor as unused - _tmp_b.allocator()->allocate(); - CLScheduler::get().enqueue(_mtx_b_reshape_kernel, false); - _original_b->mark_as_unused(); - } + _impl->op->prepare(_impl->run_pack); - // Run matrix B reduction kernel only if _a_offset is not equal to 0 - if(_a_offset != 0 && _reshape_b_only_on_first_run) - { - _vector_sum_col.allocator()->allocate(); - CLScheduler::get().enqueue(_mtx_b_reduction_kernel, false); - } + // Release temporary tensors that are only used in prepare stage + release_temporaries(_impl->aux_mem_req, _impl->workspace_tensors); - CLScheduler::get().queue().finish(); - _is_prepared = true; + _impl->is_prepared = true; } } } // namespace arm_compute diff --git a/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp b/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp index 9ae5d5121c..3dd8c5f101 100644 --- a/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp +++ b/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 ARM Limited. + * Copyright (c) 2017-2021 Arm Limited. 
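// The diff that begins here gives CLGEMMLowpOutputStage the same rework as
// CLGEMMLowpMatrixMultiplyCore above: the public function keeps its signature
// but becomes a thin wrapper that owns a pImpl and forwards an ITensorPack to
// a reusable operator. A self-contained analogue of that idiom; every name
// below is illustrative rather than library API, and the real code
// additionally routes auxiliary memory through manage_workspace<CLTensor>()
// and release_temporaries() from src/core/helpers/MemoryHelpers.h:

#include <map>
#include <memory>

struct Tensor; // opaque payload, stands in for ICLTensor
using TensorPack = std::map<int, Tensor *>;

struct Operator
{
    void prepare(TensorPack &) {} // one-shot work, e.g. transforming constant inputs
    void run(TensorPack &) {}     // per-call work on whatever the pack points at
};

class Function
{
public:
    void configure(Tensor *src, Tensor *dst)
    {
        _op   = std::make_unique<Operator>();
        _pack = {{0, src}, {1, dst}}; // mirrors {{ACL_SRC_0, a}, ..., {ACL_DST, output}}
    }
    void run()
    {
        prepare(); // lazy and idempotent, exactly as in the functions above
        _op->run(_pack);
    }
    void prepare()
    {
        if (!_is_prepared)
        {
            _op->prepare(_pack); // afterwards prepare-only buffers can be released
            _is_prepared = true;
        }
    }

private:
    std::unique_ptr<Operator> _op{nullptr};
    TensorPack                _pack{};
    bool                      _is_prepared{false};
};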
* * SPDX-License-Identifier: MIT * @@ -23,249 +23,73 @@ */ #include "arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h" +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.h" -#include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.h" -#include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h" -#include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h" -#include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h" -#include "support/MemorySupport.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/KernelDescriptors.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" -namespace arm_compute -{ -void CLGEMMLowpQuantizeDownInt32ToUint8Scale::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_offset, int result_mult_int, int result_shift, int min, int max) -{ - GEMMLowpOutputStageInfo info = GEMMLowpOutputStageInfo(); - info.gemmlowp_offset = result_offset; - info.gemmlowp_multiplier = result_mult_int; - info.gemmlowp_shift = result_shift; - info.gemmlowp_min_bound = min; - info.gemmlowp_max_bound = max; - - auto k = arm_compute::support::cpp14::make_unique<CLGEMMLowpQuantizeDownInt32ScaleKernel>(); - k->configure(CLKernelLibrary::get().get_compile_context(), input, bias, output, &info); - _kernel = std::move(k); -} - -void CLGEMMLowpQuantizeDownInt32ToUint8Scale::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_offset, - int result_mult_int, - int result_shift, int min, int max) -{ - GEMMLowpOutputStageInfo info = GEMMLowpOutputStageInfo(); - info.gemmlowp_offset = result_offset; - info.gemmlowp_multiplier = result_mult_int; - info.gemmlowp_shift = result_shift; - info.gemmlowp_min_bound = min; - info.gemmlowp_max_bound = max; - - auto k = arm_compute::support::cpp14::make_unique<CLGEMMLowpQuantizeDownInt32ScaleKernel>(); - k->configure(compile_context, input, bias, output, &info); - _kernel = std::move(k); -} - -Status CLGEMMLowpQuantizeDownInt32ToUint8Scale::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min, int max) -{ - GEMMLowpOutputStageInfo info = GEMMLowpOutputStageInfo(); - info.gemmlowp_min_bound = min; - info.gemmlowp_max_bound = max; - - return CLGEMMLowpQuantizeDownInt32ScaleKernel::validate(input, bias, output, &info); -} - -void CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, - int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift, - int min, int max) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, bias, output, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, min, max); -} - -void CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, - int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift, - int min, int max) -{ - auto k = arm_compute::support::cpp14::make_unique<CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel>(); - k->configure(compile_context, input, bias, output, 
result_fixedpoint_multiplier, result_shift, result_offset_after_shift, min, max); - _kernel = std::move(k); -} - -Status CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, - int min, int max) -{ - return CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::validate(input, bias, output, min, max); -} - -void CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, - int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift, - int min, int max) -{ - auto k = arm_compute::support::cpp14::make_unique<CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel>(); - k->configure(CLKernelLibrary::get().get_compile_context(), input, bias, output, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, min, max); - _kernel = std::move(k); -} +#include "src/core/CL/ICLKernel.h" +#include "src/gpu/cl/operators/ClGemmLowpOutputStage.h" -void CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, - int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift, - int min, int max) -{ - auto k = arm_compute::support::cpp14::make_unique<CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel>(); - k->configure(compile_context, input, bias, output, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, min, max); - _kernel = std::move(k); -} +#include <algorithm> -Status CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, - int min, int max) +namespace arm_compute { - return CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::validate(input, bias, output, min, max); -} - -void CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloat::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, - float multiplier, int offset, - int min, int max) +struct CLGEMMLowpOutputStage::Impl { - GEMMLowpOutputStageInfo info = GEMMLowpOutputStageInfo(); - info.gemmlowp_offset = offset; - info.gemmlowp_real_multiplier = multiplier; - info.gemmlowp_min_bound = min; - info.gemmlowp_max_bound = max; - - auto k = arm_compute::support::cpp14::make_unique<CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel>(); - k->configure(CLKernelLibrary::get().get_compile_context(), input, bias, output, &info); - _kernel = std::move(k); -} + const ICLTensor *src{nullptr}; + const ICLTensor *bias{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<opencl::ClGemmLowpOutputStage> op{nullptr}; + ITensorPack run_pack{}; +}; -void CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloat::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, - float multiplier, int offset, - int min, int max) +CLGEMMLowpOutputStage::CLGEMMLowpOutputStage() : _impl(std::make_unique<Impl>()) { - GEMMLowpOutputStageInfo info = GEMMLowpOutputStageInfo(); - info.gemmlowp_offset = offset; - info.gemmlowp_real_multiplier = multiplier; - info.gemmlowp_min_bound = min; - info.gemmlowp_max_bound = max; - - auto k = arm_compute::support::cpp14::make_unique<CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel>(); - k->configure(compile_context, input, bias, output, &info); - _kernel = std::move(k); } +CLGEMMLowpOutputStage::CLGEMMLowpOutputStage(CLGEMMLowpOutputStage &&) = default; 
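// The per-type kernels removed in this file, and the opencl::ClGemmLowpOutputStage
// that replaces them, implement the same QUANTIZE_DOWN_FIXEDPOINT arithmetic:
// add the bias, saturating-rounding-doubling-high-multiply by gemmlowp_multiplier,
// rounding right shift by gemmlowp_shift, add gemmlowp_offset, then clamp to
// [gemmlowp_min_bound, gemmlowp_max_bound]. A scalar sketch of those reference
// semantics (not the OpenCL kernel; the int8/int16 variants differ essentially
// in the final cast, and a non-negative shift is assumed):

#include <algorithm>
#include <cstdint>
#include <limits>

int32_t sat_rounding_doubling_high_mul(int32_t a, int32_t b)
{
    // Rounded (a * b * 2) / 2^31, saturating the single overflowing case
    if (a == std::numeric_limits<int32_t>::min() && b == a)
        return std::numeric_limits<int32_t>::max();
    const int64_t ab    = static_cast<int64_t>(a) * b;
    const int32_t nudge = ab >= 0 ? (1 << 30) : (1 - (1 << 30));
    return static_cast<int32_t>((ab + nudge) / (int64_t(1) << 31));
}

int32_t rounding_shift_right(int32_t x, int exponent)
{
    const int32_t mask      = (int32_t(1) << exponent) - 1;
    const int32_t remainder = x & mask;
    const int32_t threshold = (mask >> 1) + (x < 0 ? 1 : 0);
    return (x >> exponent) + (remainder > threshold ? 1 : 0);
}

uint8_t quantize_down_fixedpoint(int32_t acc, int32_t bias, int32_t multiplier, int shift,
                                 int32_t offset, int32_t min_bound, int32_t max_bound)
{
    acc += bias;                                       // fused bias addition
    acc = rounding_shift_right(sat_rounding_doubling_high_mul(acc, multiplier), shift);
    acc += offset;                                     // requantised zero point
    return static_cast<uint8_t>(std::clamp(acc, min_bound, max_bound));
}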
+CLGEMMLowpOutputStage &CLGEMMLowpOutputStage::operator=(CLGEMMLowpOutputStage &&) = default; +CLGEMMLowpOutputStage::~CLGEMMLowpOutputStage() = default; -Status CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloat::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, - int min, int max) +void CLGEMMLowpOutputStage::configure(const ICLTensor *input, + const ICLTensor *bias, + ICLTensor *output, + const GEMMLowpOutputStageInfo &info) { - GEMMLowpOutputStageInfo info = GEMMLowpOutputStageInfo(); - info.gemmlowp_min_bound = min; - info.gemmlowp_max_bound = max; - return CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel::validate(input, bias, output, &info); -} - -void CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, - int result_fixedpoint_multiplier, int result_shift, - int min, int max) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, bias, output, result_fixedpoint_multiplier, result_shift, min, max); + configure(CLKernelLibrary::get().get_compile_context(), input, bias, output, info); } -void CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, - int result_fixedpoint_multiplier, int result_shift, - int min, int max) +void CLGEMMLowpOutputStage::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *bias, + ICLTensor *output, + const GEMMLowpOutputStageInfo &info) { - auto k = arm_compute::support::cpp14::make_unique<CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel>(); - k->configure(compile_context, input, bias, output, result_fixedpoint_multiplier, result_shift, min, max); - _kernel = std::move(k); -} + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); -Status CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, - int min, int max) -{ - return CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::validate(input, bias, output, min, max); -} + _impl->src = input; + _impl->bias = bias; + _impl->dst = output; -void CLGEMMLowpOutputStage::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const GEMMLowpOutputStageInfo &info) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, bias, output, info); + _impl->op = std::make_unique<opencl::ClGemmLowpOutputStage>(); + _impl->op->configure(compile_context, input->info(), bias != nullptr ? 
bias->info() : nullptr, output->info(), + info); + _impl->run_pack = {{ACL_SRC, _impl->src}, {ACL_BIAS, _impl->bias}, {ACL_DST, _impl->dst}}; } -void CLGEMMLowpOutputStage::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const GEMMLowpOutputStageInfo &info) +Status CLGEMMLowpOutputStage::validate(const ITensorInfo *input, + const ITensorInfo *bias, + const ITensorInfo *output, + const GEMMLowpOutputStageInfo &info) { - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - - switch(info.type) - { - case GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT: - { - switch(info.output_data_type) - { - case DataType::QASYMM8: - { - auto k = arm_compute::support::cpp14::make_unique<CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel>(); - k->configure(compile_context, input, bias, output, info.gemmlowp_multiplier, info.gemmlowp_shift, info.gemmlowp_offset, info.gemmlowp_min_bound, info.gemmlowp_max_bound); - _kernel = std::move(k); - break; - } - case DataType::QASYMM8_SIGNED: - { - auto k = arm_compute::support::cpp14::make_unique<CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel>(); - k->configure(compile_context, input, bias, output, info.gemmlowp_multiplier, info.gemmlowp_shift, info.gemmlowp_offset, info.gemmlowp_min_bound, info.gemmlowp_max_bound); - _kernel = std::move(k); - break; - } - case DataType::QSYMM16: - { - auto k = arm_compute::support::cpp14::make_unique<CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel>(); - k->configure(input, bias, output, info.gemmlowp_multiplier, info.gemmlowp_shift, info.gemmlowp_min_bound, info.gemmlowp_max_bound); - _kernel = std::move(k); - break; - } - default: - ARM_COMPUTE_ERROR("Unsupported output data type."); - } - break; - } - case GEMMLowpOutputStageType::QUANTIZE_DOWN: - { - auto k = arm_compute::support::cpp14::make_unique<CLGEMMLowpQuantizeDownInt32ScaleKernel>(); - k->configure(compile_context, input, bias, output, &info); - _kernel = std::move(k); - break; - } - case GEMMLowpOutputStageType::QUANTIZE_DOWN_FLOAT: - { - auto k = arm_compute::support::cpp14::make_unique<CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel>(); - k->configure(compile_context, input, bias, output, &info); - _kernel = std::move(k); - break; - } - default: - ARM_COMPUTE_ERROR("Unsupported GEMMLowpOutputStage type."); - } + return opencl::ClGemmLowpOutputStage::validate(input, bias, output, info); } -Status CLGEMMLowpOutputStage::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo &info) +void CLGEMMLowpOutputStage::run() { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM16); - - switch(info.type) - { - case GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT: - { - switch(output->data_type()) - { - case DataType::QASYMM8: - return CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::validate(input, bias, output, info.gemmlowp_min_bound, info.gemmlowp_max_bound); - case DataType::QASYMM8_SIGNED: - return CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::validate(input, bias, output, info.gemmlowp_min_bound, info.gemmlowp_max_bound); - case DataType::QSYMM16: - return CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::validate(input, bias, output, info.gemmlowp_min_bound, info.gemmlowp_max_bound); - default: - return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Unsupported output 
data type."); - } - } - case GEMMLowpOutputStageType::QUANTIZE_DOWN: - return CLGEMMLowpQuantizeDownInt32ScaleKernel::validate(input, bias, output, &info); - case GEMMLowpOutputStageType::QUANTIZE_DOWN_FLOAT: - return CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel::validate(input, bias, output, &info); - default: - return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Unsupported GEMMLowpOutputStage type."); - } + _impl->op->run(_impl->run_pack); } -} // namespace arm_compute
\ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLGather.cpp b/src/runtime/CL/functions/CLGather.cpp index e2b18e0f55..2610cb1a3b 100644 --- a/src/runtime/CL/functions/CLGather.cpp +++ b/src/runtime/CL/functions/CLGather.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 ARM Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,8 +24,9 @@ #include "arm_compute/runtime/CL/functions/CLGather.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLGatherKernel.h" -#include "support/MemorySupport.h" + +#include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLGatherKernel.h" namespace arm_compute { @@ -34,9 +35,14 @@ void CLGather::configure(const ICLTensor *input, const ICLTensor *indices, ICLTe configure(CLKernelLibrary::get().get_compile_context(), input, indices, output, axis); } -void CLGather::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, int axis) +void CLGather::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *indices, + ICLTensor *output, + int axis) { - auto k = arm_compute::support::cpp14::make_unique<CLGatherKernel>(); + ARM_COMPUTE_LOG_PARAMS(input, indices, output, axis); + auto k = std::make_unique<CLGatherKernel>(); k->configure(compile_context, input, indices, output, axis); _kernel = std::move(k); } diff --git a/src/runtime/CL/functions/CLGaussian3x3.cpp b/src/runtime/CL/functions/CLGaussian3x3.cpp deleted file mode 100644 index 47367c4b17..0000000000 --- a/src/runtime/CL/functions/CLGaussian3x3.cpp +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Copyright (c) 2016-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
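// CLGather above shows the shape every remaining "simple function" takes after
// this changeset: build the single kernel with std::make_unique, configure it,
// move it into _kernel. A usage sketch, assuming an initialised CL backend;
// the shapes, the U32 index type and the axis are illustrative:

#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLGather.h"

void gather_usage_sketch()
{
    using namespace arm_compute;

    CLScheduler::get().default_init();

    CLTensor src, indices, dst;
    src.allocator()->init(TensorInfo(TensorShape(4U, 3U), 1, DataType::F32)); // 4 elements along axis 0
    indices.allocator()->init(TensorInfo(TensorShape(2U), 1, DataType::U32)); // two positions to pick
    dst.allocator()->init(TensorInfo(TensorShape(2U, 3U), 1, DataType::F32)); // axis-0 extent becomes 2

    CLGather gather;
    gather.configure(&src, &indices, &dst, 0 /* axis */);

    src.allocator()->allocate();
    indices.allocator()->allocate();
    dst.allocator()->allocate();
    // ... fill src and indices through map()/unmap() ...

    gather.run();
}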
- */ -#include "arm_compute/runtime/CL/functions/CLGaussian3x3.h" - -#include "arm_compute/core/CL/kernels/CLGaussian3x3Kernel.h" -#include "arm_compute/core/PixelValue.h" -#include "support/MemorySupport.h" - -#include <utility> - -using namespace arm_compute; - -void CLGaussian3x3::configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output, border_mode, constant_border_value); -} - -void CLGaussian3x3::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value) -{ - auto k = arm_compute::support::cpp14::make_unique<CLGaussian3x3Kernel>(); - k->configure(compile_context, input, output, border_mode == BorderMode::UNDEFINED); - _kernel = std::move(k); - _border_handler.configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); -} diff --git a/src/runtime/CL/functions/CLGaussian5x5.cpp b/src/runtime/CL/functions/CLGaussian5x5.cpp deleted file mode 100644 index 6b82cd0c35..0000000000 --- a/src/runtime/CL/functions/CLGaussian5x5.cpp +++ /dev/null @@ -1,74 +0,0 @@ -/* - * Copyright (c) 2016-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "arm_compute/runtime/CL/functions/CLGaussian5x5.h" - -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLGaussian5x5Kernel.h" -#include "arm_compute/core/PixelValue.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/runtime/CL/CLScheduler.h" -#include "arm_compute/runtime/ITensorAllocator.h" - -#include <utility> - -using namespace arm_compute; - -CLGaussian5x5::CLGaussian5x5(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _kernel_hor(), _kernel_vert(), _border_handler(), _tmp() -{ -} - -void CLGaussian5x5::configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output, border_mode, constant_border_value); -} - -void CLGaussian5x5::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); - - _tmp.allocator()->init(TensorInfo(input->info()->tensor_shape(), 1, DataType::U16)); - - // Manage intermediate buffers - _memory_group.manage(&_tmp); - - // Configure kernels - _kernel_hor.configure(compile_context, input, &_tmp, border_mode == BorderMode::UNDEFINED); - _kernel_vert.configure(compile_context, &_tmp, output, border_mode == BorderMode::UNDEFINED); - _border_handler.configure(compile_context, input, _kernel_hor.border_size(), border_mode, PixelValue(constant_border_value)); - - // Allocate intermediate buffers - _tmp.allocator()->allocate(); -} - -void CLGaussian5x5::run() -{ - CLScheduler::get().enqueue(_border_handler, false); - - MemoryGroupResourceScope scope_mg(_memory_group); - - CLScheduler::get().enqueue(_kernel_hor, false); - CLScheduler::get().enqueue(_kernel_vert); -} diff --git a/src/runtime/CL/functions/CLGaussianPyramid.cpp b/src/runtime/CL/functions/CLGaussianPyramid.cpp deleted file mode 100644 index 1ac98787ac..0000000000 --- a/src/runtime/CL/functions/CLGaussianPyramid.cpp +++ /dev/null @@ -1,204 +0,0 @@ -/* - * Copyright (c) 2017-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "arm_compute/runtime/CL/functions/CLGaussianPyramid.h" - -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLGaussianPyramidKernel.h" -#include "arm_compute/core/CL/kernels/CLScaleKernel.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/PixelValue.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" - -#include "arm_compute/runtime/CL/CLPyramid.h" -#include "arm_compute/runtime/CL/CLScheduler.h" -#include "arm_compute/runtime/CL/CLTensor.h" -#include "arm_compute/runtime/CL/CLTensorAllocator.h" -#include "arm_compute/runtime/CL/functions/CLGaussian5x5.h" - -#include <cstddef> - -using namespace arm_compute; - -CLGaussianPyramid::CLGaussianPyramid() - : _input(nullptr), _pyramid(nullptr), _tmp() -{ -} - -CLGaussianPyramidHalf::CLGaussianPyramidHalf() // NOLINT - : _horizontal_border_handler(), - _vertical_border_handler(), - _horizontal_reduction(), - _vertical_reduction() -{ -} - -void CLGaussianPyramidHalf::configure(ICLTensor *input, CLPyramid *pyramid, BorderMode border_mode, uint8_t constant_border_value) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, pyramid, border_mode, constant_border_value); -} - -void CLGaussianPyramidHalf::configure(const CLCompileContext &compile_context, ICLTensor *input, CLPyramid *pyramid, BorderMode border_mode, uint8_t constant_border_value) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON(pyramid == nullptr); - ARM_COMPUTE_ERROR_ON(input->info()->num_dimensions() != pyramid->get_pyramid_level(0)->info()->num_dimensions()); - ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != pyramid->info()->width()); - ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) != pyramid->info()->height()); - ARM_COMPUTE_ERROR_ON(SCALE_PYRAMID_HALF != pyramid->info()->scale()); - - // Constant value to use for vertical fill border when the border mode is CONSTANT - const uint16_t pixel_value_u16 = static_cast<uint16_t>(constant_border_value) * 2 + static_cast<uint16_t>(constant_border_value) * 8 + static_cast<uint16_t>(constant_border_value) * 6; - - /* Get number of pyramid levels */ - const size_t num_levels = pyramid->info()->num_levels(); - - _input = input; - _pyramid = pyramid; - - if(num_levels > 1) - { - _horizontal_border_handler.resize(num_levels - 1); - _vertical_border_handler.resize(num_levels - 1); - _horizontal_reduction.resize(num_levels - 1); - _vertical_reduction.resize(num_levels - 1); - - // Apply half scale to the X dimension of the tensor shape - TensorShape tensor_shape = pyramid->info()->tensor_shape(); - tensor_shape.set(0, (pyramid->info()->width() + 1) * SCALE_PYRAMID_HALF); - - PyramidInfo pyramid_info(num_levels - 1, SCALE_PYRAMID_HALF, tensor_shape, Format::U16); - _tmp.init(pyramid_info); - - for(size_t i = 0; i < num_levels - 1; ++i) - { - /* Configure horizontal kernel */ - _horizontal_reduction[i].configure(compile_context, _pyramid->get_pyramid_level(i), _tmp.get_pyramid_level(i)); - - /* Configure vertical kernel */ - _vertical_reduction[i].configure(compile_context, _tmp.get_pyramid_level(i), _pyramid->get_pyramid_level(i + 1)); - - /* Configure border */ - _horizontal_border_handler[i].configure(compile_context, _pyramid->get_pyramid_level(i), _horizontal_reduction[i].border_size(), border_mode, PixelValue(constant_border_value)); - - /* Configure border */ - _vertical_border_handler[i].configure(compile_context, _tmp.get_pyramid_level(i), 
_vertical_reduction[i].border_size(), border_mode, PixelValue(pixel_value_u16)); - } - _tmp.allocate(); - } -} - -void CLGaussianPyramidHalf::run() -{ - ARM_COMPUTE_ERROR_ON_MSG(_pyramid == nullptr, "Unconfigured function"); - - /* Get number of pyramid levels */ - const size_t num_levels = _pyramid->info()->num_levels(); - - /* The first level of the pyramid has the input image */ - _pyramid->get_pyramid_level(0)->map(CLScheduler::get().queue(), true /* blocking */); - _input->map(CLScheduler::get().queue(), true /* blocking */); - _pyramid->get_pyramid_level(0)->copy_from(*_input); - - _input->unmap(CLScheduler::get().queue()); - _pyramid->get_pyramid_level(0)->unmap(CLScheduler::get().queue()); - - for(unsigned int i = 0; i < num_levels - 1; ++i) - { - CLScheduler::get().enqueue(_horizontal_border_handler[i], false); - CLScheduler::get().enqueue(_horizontal_reduction[i], false); - CLScheduler::get().enqueue(_vertical_border_handler[i], false); - CLScheduler::get().enqueue(_vertical_reduction[i], false); - } -} - -CLGaussianPyramidOrb::CLGaussianPyramidOrb() // NOLINT - : _gauss5x5(), - _scale_nearest() -{ -} - -void CLGaussianPyramidOrb::configure(ICLTensor *input, CLPyramid *pyramid, BorderMode border_mode, uint8_t constant_border_value) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, pyramid, border_mode, constant_border_value); -} - -void CLGaussianPyramidOrb::configure(const CLCompileContext &compile_context, ICLTensor *input, CLPyramid *pyramid, BorderMode border_mode, uint8_t constant_border_value) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON(nullptr == pyramid); - ARM_COMPUTE_ERROR_ON(input->info()->num_dimensions() != pyramid->get_pyramid_level(0)->info()->num_dimensions()); - ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != pyramid->info()->width()); - ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) != pyramid->info()->height()); - ARM_COMPUTE_ERROR_ON(SCALE_PYRAMID_ORB != pyramid->info()->scale()); - - /* Get number of pyramid levels */ - const size_t num_levels = pyramid->info()->num_levels(); - - _input = input; - _pyramid = pyramid; - - if(num_levels > 1) - { - _gauss5x5.resize(num_levels - 1); - _scale_nearest.resize(num_levels - 1); - - PyramidInfo pyramid_info(num_levels - 1, SCALE_PYRAMID_ORB, pyramid->info()->tensor_shape(), Format::U8); - - _tmp.init(pyramid_info); - - for(size_t i = 0; i < num_levels - 1; ++i) - { - /* Configure gaussian 5x5 */ - _gauss5x5[i].configure(compile_context, _pyramid->get_pyramid_level(i), _tmp.get_pyramid_level(i), border_mode, constant_border_value); - - /* Configure scale image kernel */ - _scale_nearest[i].configure(compile_context, _tmp.get_pyramid_level(i), _pyramid->get_pyramid_level(i + 1), InterpolationPolicy::NEAREST_NEIGHBOR, border_mode, SamplingPolicy::CENTER); - } - - _tmp.allocate(); - } -} - -void CLGaussianPyramidOrb::run() -{ - ARM_COMPUTE_ERROR_ON_MSG(_pyramid == nullptr, "Unconfigured function"); - - /* Get number of pyramid levels */ - const size_t num_levels = _pyramid->info()->num_levels(); - - /* The first level of the pyramid has the input image */ - _pyramid->get_pyramid_level(0)->map(CLScheduler::get().queue(), true /* blocking */); - _input->map(CLScheduler::get().queue(), true /* blocking */); - _pyramid->get_pyramid_level(0)->copy_from(*_input); - _input->unmap(CLScheduler::get().queue()); - _pyramid->get_pyramid_level(0)->unmap(CLScheduler::get().queue()); - - for(unsigned int i = 0; i < num_levels - 1; ++i) - { - 
_gauss5x5[i].run(); - CLScheduler::get().enqueue(_scale_nearest[i]); - } -} diff --git a/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp b/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp index 7f037fc51f..b2c1d2631e 100644 --- a/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp +++ b/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 ARM Limited. + * Copyright (c) 2019-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -25,21 +25,29 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Types.h" +#include "arm_compute/runtime/CL/functions/CLDequantizationLayer.h" +#include "arm_compute/runtime/CL/functions/CLQuantizationLayer.h" + +#include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLBoundingBoxTransformKernel.h" +#include "src/core/CL/kernels/CLGenerateProposalsLayerKernel.h" +#include "src/core/CL/kernels/CLPadLayerKernel.h" +#include "src/core/helpers/AutoConfiguration.h" namespace arm_compute { CLGenerateProposalsLayer::CLGenerateProposalsLayer(std::shared_ptr<IMemoryManager> memory_manager) : _memory_group(memory_manager), - _permute_deltas_kernel(), - _flatten_deltas_kernel(), - _permute_scores_kernel(), - _flatten_scores_kernel(), - _compute_anchors_kernel(), - _bounding_box_kernel(), - _pad_kernel(), - _dequantize_anchors(), - _dequantize_deltas(), - _quantize_all_proposals(), + _permute_deltas(), + _flatten_deltas(), + _permute_scores(), + _flatten_scores(), + _compute_anchors_kernel(std::make_unique<CLComputeAllAnchorsKernel>()), + _bounding_box_kernel(std::make_unique<CLBoundingBoxTransformKernel>()), + _pad_kernel(std::make_unique<CLPadLayerKernel>()), + _dequantize_anchors(std::make_unique<CLDequantizationLayer>()), + _dequantize_deltas(std::make_unique<CLDequantizationLayer>()), + _quantize_all_proposals(std::make_unique<CLQuantizationLayer>()), _cpp_nms(memory_manager), _is_nhwc(false), _is_qasymm8(false), @@ -61,53 +69,75 @@ CLGenerateProposalsLayer::CLGenerateProposalsLayer(std::shared_ptr<IMemoryManage { } -void CLGenerateProposalsLayer::configure(const ICLTensor *scores, const ICLTensor *deltas, const ICLTensor *anchors, ICLTensor *proposals, ICLTensor *scores_out, ICLTensor *num_valid_proposals, +CLGenerateProposalsLayer::~CLGenerateProposalsLayer() = default; + +void CLGenerateProposalsLayer::configure(const ICLTensor *scores, + const ICLTensor *deltas, + const ICLTensor *anchors, + ICLTensor *proposals, + ICLTensor *scores_out, + ICLTensor *num_valid_proposals, const GenerateProposalsInfo &info) { - configure(CLKernelLibrary::get().get_compile_context(), scores, deltas, anchors, proposals, scores_out, num_valid_proposals, info); + configure(CLKernelLibrary::get().get_compile_context(), scores, deltas, anchors, proposals, scores_out, + num_valid_proposals, info); } -void CLGenerateProposalsLayer::configure(const CLCompileContext &compile_context, const ICLTensor *scores, const ICLTensor *deltas, const ICLTensor *anchors, ICLTensor *proposals, - ICLTensor *scores_out, - ICLTensor *num_valid_proposals, const GenerateProposalsInfo &info) +void CLGenerateProposalsLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *scores, + const ICLTensor *deltas, + const ICLTensor *anchors, + ICLTensor *proposals, + ICLTensor *scores_out, + ICLTensor *num_valid_proposals, + const GenerateProposalsInfo &info) { ARM_COMPUTE_ERROR_ON_NULLPTR(scores, deltas, anchors, proposals, scores_out, num_valid_proposals); - 
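// The configure() below chains: compute all anchors, permute/flatten the deltas
// and scores, dequantise to F32 for QASYMM8 inputs, apply the bounding-box
// transform, run NMS on the CPP backend, and finally pad a leading batch-id
// column onto the proposals. The box transform itself is the usual Faster R-CNN
// decoding; a scalar sketch of one common convention (the kernel's exact
// behaviour, including scaling, coordinate correction and clipping to the image,
// is governed by BoundingBoxTransformInfo):

#include <array>
#include <cmath>

// anchor as (x1, y1, x2, y2), delta as (dx, dy, dw, dh)
std::array<float, 4> decode_box(const std::array<float, 4> &anchor,
                                const std::array<float, 4> &delta)
{
    const float w     = anchor[2] - anchor[0];
    const float h     = anchor[3] - anchor[1];
    const float ctr_x = anchor[0] + 0.5f * w;
    const float ctr_y = anchor[1] + 0.5f * h;

    const float pred_ctr_x = delta[0] * w + ctr_x;   // shift proportional to anchor size
    const float pred_ctr_y = delta[1] * h + ctr_y;
    const float pred_w     = std::exp(delta[2]) * w; // log-space size deltas
    const float pred_h     = std::exp(delta[3]) * h;

    return {pred_ctr_x - 0.5f * pred_w, pred_ctr_y - 0.5f * pred_h,
            pred_ctr_x + 0.5f * pred_w, pred_ctr_y + 0.5f * pred_h};
}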
ARM_COMPUTE_ERROR_THROW_ON(CLGenerateProposalsLayer::validate(scores->info(), deltas->info(), anchors->info(), proposals->info(), scores_out->info(), num_valid_proposals->info(), info)); + ARM_COMPUTE_ERROR_THROW_ON(CLGenerateProposalsLayer::validate(scores->info(), deltas->info(), anchors->info(), + proposals->info(), scores_out->info(), + num_valid_proposals->info(), info)); + ARM_COMPUTE_LOG_PARAMS(scores, deltas, anchors, proposals, scores_out, num_valid_proposals, info); _is_nhwc = scores->info()->data_layout() == DataLayout::NHWC; const DataType scores_data_type = scores->info()->data_type(); _is_qasymm8 = scores_data_type == DataType::QASYMM8; - const int num_anchors = scores->info()->dimension(get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::CHANNEL)); - const int feat_width = scores->info()->dimension(get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::WIDTH)); - const int feat_height = scores->info()->dimension(get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::HEIGHT)); - const int total_num_anchors = num_anchors * feat_width * feat_height; - const int pre_nms_topN = info.pre_nms_topN(); - const int post_nms_topN = info.post_nms_topN(); - const size_t values_per_roi = info.values_per_roi(); + const int num_anchors = scores->info()->dimension( + get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::CHANNEL)); + const int feat_width = scores->info()->dimension( + get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::WIDTH)); + const int feat_height = scores->info()->dimension( + get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::HEIGHT)); + const int total_num_anchors = num_anchors * feat_width * feat_height; + const int pre_nms_topN = info.pre_nms_topN(); + const int post_nms_topN = info.post_nms_topN(); + const size_t values_per_roi = info.values_per_roi(); const QuantizationInfo scores_qinfo = scores->info()->quantization_info(); const DataType rois_data_type = (_is_qasymm8) ? DataType::QASYMM16 : scores_data_type; - const QuantizationInfo rois_qinfo = (_is_qasymm8) ? QuantizationInfo(0.125f, 0) : scores->info()->quantization_info(); + const QuantizationInfo rois_qinfo = + (_is_qasymm8) ? 
QuantizationInfo(0.125f, 0) : scores->info()->quantization_info(); // Compute all the anchors _memory_group.manage(&_all_anchors); - _compute_anchors_kernel.configure(compile_context, anchors, &_all_anchors, ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale())); + _compute_anchors_kernel->configure(compile_context, anchors, &_all_anchors, + ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale())); const TensorShape flatten_shape_deltas(values_per_roi, total_num_anchors); - _deltas_flattened.allocator()->init(TensorInfo(flatten_shape_deltas, 1, scores_data_type, deltas->info()->quantization_info())); + _deltas_flattened.allocator()->init( + TensorInfo(flatten_shape_deltas, 1, scores_data_type, deltas->info()->quantization_info())); // Permute and reshape deltas _memory_group.manage(&_deltas_flattened); - if(!_is_nhwc) + if (!_is_nhwc) { _memory_group.manage(&_deltas_permuted); - _permute_deltas_kernel.configure(compile_context, deltas, &_deltas_permuted, PermutationVector{ 2, 0, 1 }); - _flatten_deltas_kernel.configure(compile_context, &_deltas_permuted, &_deltas_flattened); + _permute_deltas.configure(compile_context, deltas, &_deltas_permuted, PermutationVector{2, 0, 1}); + _flatten_deltas.configure(compile_context, &_deltas_permuted, &_deltas_flattened); _deltas_permuted.allocator()->allocate(); } else { - _flatten_deltas_kernel.configure(compile_context, deltas, &_deltas_flattened); + _flatten_deltas.configure(compile_context, deltas, &_deltas_flattened); } const TensorShape flatten_shape_scores(1, total_num_anchors); @@ -115,49 +145,50 @@ void CLGenerateProposalsLayer::configure(const CLCompileContext &compile_context // Permute and reshape scores _memory_group.manage(&_scores_flattened); - if(!_is_nhwc) + if (!_is_nhwc) { _memory_group.manage(&_scores_permuted); - _permute_scores_kernel.configure(compile_context, scores, &_scores_permuted, PermutationVector{ 2, 0, 1 }); - _flatten_scores_kernel.configure(compile_context, &_scores_permuted, &_scores_flattened); + _permute_scores.configure(compile_context, scores, &_scores_permuted, PermutationVector{2, 0, 1}); + _flatten_scores.configure(compile_context, &_scores_permuted, &_scores_flattened); _scores_permuted.allocator()->allocate(); } else { - _flatten_scores_kernel.configure(compile_context, scores, &_scores_flattened); + _flatten_scores.configure(compile_context, scores, &_scores_flattened); } CLTensor *anchors_to_use = &_all_anchors; CLTensor *deltas_to_use = &_deltas_flattened; - if(_is_qasymm8) + if (_is_qasymm8) { _all_anchors_f32.allocator()->init(TensorInfo(_all_anchors.info()->tensor_shape(), 1, DataType::F32)); _deltas_flattened_f32.allocator()->init(TensorInfo(_deltas_flattened.info()->tensor_shape(), 1, DataType::F32)); _memory_group.manage(&_all_anchors_f32); _memory_group.manage(&_deltas_flattened_f32); // Dequantize anchors to float - _dequantize_anchors.configure(compile_context, &_all_anchors, &_all_anchors_f32); + _dequantize_anchors->configure(compile_context, &_all_anchors, &_all_anchors_f32); _all_anchors.allocator()->allocate(); anchors_to_use = &_all_anchors_f32; // Dequantize deltas to float - _dequantize_deltas.configure(compile_context, &_deltas_flattened, &_deltas_flattened_f32); + _dequantize_deltas->configure(compile_context, &_deltas_flattened, &_deltas_flattened_f32); _deltas_flattened.allocator()->allocate(); deltas_to_use = &_deltas_flattened_f32; } // Bounding box transform _memory_group.manage(&_all_proposals); BoundingBoxTransformInfo bbox_info(info.im_width(), 
info.im_height(), 1.f); - _bounding_box_kernel.configure(compile_context, anchors_to_use, &_all_proposals, deltas_to_use, bbox_info); + _bounding_box_kernel->configure(compile_context, anchors_to_use, &_all_proposals, deltas_to_use, bbox_info); deltas_to_use->allocator()->allocate(); anchors_to_use->allocator()->allocate(); _all_proposals_to_use = &_all_proposals; - if(_is_qasymm8) + if (_is_qasymm8) { _memory_group.manage(&_all_proposals_quantized); // Requantize all_proposals to QASYMM16 with 0.125 scale and 0 offset - _all_proposals_quantized.allocator()->init(TensorInfo(_all_proposals.info()->tensor_shape(), 1, DataType::QASYMM16, QuantizationInfo(0.125f, 0))); - _quantize_all_proposals.configure(compile_context, &_all_proposals, &_all_proposals_quantized); + _all_proposals_quantized.allocator()->init( + TensorInfo(_all_proposals.info()->tensor_shape(), 1, DataType::QASYMM16, QuantizationInfo(0.125f, 0))); + _quantize_all_proposals->configure(compile_context, &_all_proposals, &_all_proposals_quantized); _all_proposals.allocator()->allocate(); _all_proposals_to_use = &_all_proposals_quantized; } @@ -172,7 +203,8 @@ void CLGenerateProposalsLayer::configure(const CLCompileContext &compile_context // Note that NMS needs outputs preinitialized. auto_init_if_empty(*scores_out->info(), TensorShape(scores_nms_size), 1, scores_data_type, scores_qinfo); - auto_init_if_empty(*_proposals_4_roi_values.info(), TensorShape(values_per_roi, scores_nms_size), 1, rois_data_type, rois_qinfo); + auto_init_if_empty(*_proposals_4_roi_values.info(), TensorShape(values_per_roi, scores_nms_size), 1, rois_data_type, + rois_qinfo); auto_init_if_empty(*num_valid_proposals->info(), TensorShape(1), 1, DataType::U32); // Initialize temporaries (unused) outputs @@ -184,20 +216,27 @@ void CLGenerateProposalsLayer::configure(const CLCompileContext &compile_context _num_valid_proposals = num_valid_proposals; _memory_group.manage(&_proposals_4_roi_values); - _cpp_nms.configure(&_scores_flattened, _all_proposals_to_use, nullptr, scores_out, &_proposals_4_roi_values, &_classes_nms_unused, nullptr, &_keeps_nms_unused, num_valid_proposals, - BoxNMSLimitInfo(0.0f, info.nms_thres(), scores_nms_size, false, NMSType::LINEAR, 0.5f, 0.001f, true, min_size_scaled, info.im_width(), info.im_height())); + _cpp_nms.configure(&_scores_flattened, _all_proposals_to_use, nullptr, scores_out, &_proposals_4_roi_values, + &_classes_nms_unused, nullptr, &_keeps_nms_unused, num_valid_proposals, + BoxNMSLimitInfo(0.0f, info.nms_thres(), scores_nms_size, false, NMSType::LINEAR, 0.5f, 0.001f, + true, min_size_scaled, info.im_width(), info.im_height())); _keeps_nms_unused.allocator()->allocate(); _classes_nms_unused.allocator()->allocate(); _all_proposals_to_use->allocator()->allocate(); _scores_flattened.allocator()->allocate(); // Add the first column that represents the batch id. 
This will be all zeros, as we don't support multiple images - _pad_kernel.configure(compile_context, &_proposals_4_roi_values, proposals, PaddingList{ { 1, 0 } }); + _pad_kernel->configure(compile_context, &_proposals_4_roi_values, proposals, PaddingList{{1, 0}}); _proposals_4_roi_values.allocator()->allocate(); } -Status CLGenerateProposalsLayer::validate(const ITensorInfo *scores, const ITensorInfo *deltas, const ITensorInfo *anchors, const ITensorInfo *proposals, const ITensorInfo *scores_out, - const ITensorInfo *num_valid_proposals, const GenerateProposalsInfo &info) +Status CLGenerateProposalsLayer::validate(const ITensorInfo *scores, + const ITensorInfo *deltas, + const ITensorInfo *anchors, + const ITensorInfo *proposals, + const ITensorInfo *scores_out, + const ITensorInfo *num_valid_proposals, + const GenerateProposalsInfo &info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(scores, deltas, anchors, proposals, scores_out, num_valid_proposals); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(scores, 1, DataType::QASYMM8, DataType::F16, DataType::F32); @@ -205,9 +244,12 @@ Status CLGenerateProposalsLayer::validate(const ITensorInfo *scores, const ITens ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(scores, deltas); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(scores, deltas); - const int num_anchors = scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::CHANNEL)); - const int feat_width = scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::WIDTH)); - const int feat_height = scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::HEIGHT)); + const int num_anchors = + scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::CHANNEL)); + const int feat_width = + scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::WIDTH)); + const int feat_height = + scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::HEIGHT)); const int num_images = scores->dimension(3); const int total_num_anchors = num_anchors * feat_width * feat_height; const int values_per_roi = info.values_per_roi(); @@ -216,76 +258,101 @@ Status CLGenerateProposalsLayer::validate(const ITensorInfo *scores, const ITens ARM_COMPUTE_RETURN_ERROR_ON(num_images > 1); - if(is_qasymm8) + if (is_qasymm8) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(anchors, 1, DataType::QSYMM16); const UniformQuantizationInfo anchors_qinfo = anchors->quantization_info().uniform(); ARM_COMPUTE_RETURN_ERROR_ON(anchors_qinfo.scale != 0.125f); } - TensorInfo all_anchors_info(anchors->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true)); - ARM_COMPUTE_RETURN_ON_ERROR(CLComputeAllAnchorsKernel::validate(anchors, &all_anchors_info, ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale()))); - - TensorInfo deltas_permuted_info = deltas->clone()->set_tensor_shape(TensorShape(values_per_roi * num_anchors, feat_width, feat_height)).set_is_resizable(true); - TensorInfo scores_permuted_info = scores->clone()->set_tensor_shape(TensorShape(num_anchors, feat_width, feat_height)).set_is_resizable(true); - if(scores->data_layout() == DataLayout::NHWC) + TensorInfo all_anchors_info( + anchors->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true)); + ARM_COMPUTE_RETURN_ON_ERROR(CLComputeAllAnchorsKernel::validate( + 
anchors, &all_anchors_info, ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale()))); + + TensorInfo deltas_permuted_info = + deltas->clone() + ->set_tensor_shape(TensorShape(values_per_roi * num_anchors, feat_width, feat_height)) + .set_is_resizable(true); + TensorInfo scores_permuted_info = + scores->clone()->set_tensor_shape(TensorShape(num_anchors, feat_width, feat_height)).set_is_resizable(true); + if (scores->data_layout() == DataLayout::NHWC) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(deltas, &deltas_permuted_info); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(scores, &scores_permuted_info); } else { - ARM_COMPUTE_RETURN_ON_ERROR(CLPermuteKernel::validate(deltas, &deltas_permuted_info, PermutationVector{ 2, 0, 1 })); - ARM_COMPUTE_RETURN_ON_ERROR(CLPermuteKernel::validate(scores, &scores_permuted_info, PermutationVector{ 2, 0, 1 })); + ARM_COMPUTE_RETURN_ON_ERROR(CLPermute::validate(deltas, &deltas_permuted_info, PermutationVector{2, 0, 1})); + ARM_COMPUTE_RETURN_ON_ERROR(CLPermute::validate(scores, &scores_permuted_info, PermutationVector{2, 0, 1})); } - TensorInfo deltas_flattened_info(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true)); - ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayerKernel::validate(&deltas_permuted_info, &deltas_flattened_info)); + TensorInfo deltas_flattened_info( + deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true)); + ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayer::validate(&deltas_permuted_info, &deltas_flattened_info)); - TensorInfo scores_flattened_info(scores->clone()->set_tensor_shape(TensorShape(1, total_num_anchors)).set_is_resizable(true)); - TensorInfo proposals_4_roi_values(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true)); + TensorInfo scores_flattened_info( + scores->clone()->set_tensor_shape(TensorShape(1, total_num_anchors)).set_is_resizable(true)); + TensorInfo proposals_4_roi_values( + deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true)); - ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayerKernel::validate(&scores_permuted_info, &scores_flattened_info)); + ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayer::validate(&scores_permuted_info, &scores_flattened_info)); TensorInfo *proposals_4_roi_values_to_use = &proposals_4_roi_values; - TensorInfo proposals_4_roi_values_quantized(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true)); - proposals_4_roi_values_quantized.set_data_type(DataType::QASYMM16).set_quantization_info(QuantizationInfo(0.125f, 0)); - if(is_qasymm8) + TensorInfo proposals_4_roi_values_quantized( + deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true)); + proposals_4_roi_values_quantized.set_data_type(DataType::QASYMM16) + .set_quantization_info(QuantizationInfo(0.125f, 0)); + if (is_qasymm8) { - TensorInfo all_anchors_f32_info(anchors->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true).set_data_type(DataType::F32)); - ARM_COMPUTE_RETURN_ON_ERROR(CLDequantizationLayerKernel::validate(&all_anchors_info, &all_anchors_f32_info)); - - TensorInfo deltas_flattened_f32_info(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true).set_data_type(DataType::F32)); - 
ARM_COMPUTE_RETURN_ON_ERROR(CLDequantizationLayerKernel::validate(&deltas_flattened_info, &deltas_flattened_f32_info)); - - TensorInfo proposals_4_roi_values_f32(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true).set_data_type(DataType::F32)); - ARM_COMPUTE_RETURN_ON_ERROR(CLBoundingBoxTransformKernel::validate(&all_anchors_f32_info, &proposals_4_roi_values_f32, &deltas_flattened_f32_info, - BoundingBoxTransformInfo(info.im_width(), info.im_height(), 1.f))); - - ARM_COMPUTE_RETURN_ON_ERROR(CLQuantizationLayerKernel::validate(&proposals_4_roi_values_f32, &proposals_4_roi_values_quantized)); + TensorInfo all_anchors_f32_info(anchors->clone() + ->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)) + .set_is_resizable(true) + .set_data_type(DataType::F32)); + ARM_COMPUTE_RETURN_ON_ERROR(CLDequantizationLayer::validate(&all_anchors_info, &all_anchors_f32_info)); + + TensorInfo deltas_flattened_f32_info(deltas->clone() + ->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)) + .set_is_resizable(true) + .set_data_type(DataType::F32)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLDequantizationLayer::validate(&deltas_flattened_info, &deltas_flattened_f32_info)); + + TensorInfo proposals_4_roi_values_f32(deltas->clone() + ->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)) + .set_is_resizable(true) + .set_data_type(DataType::F32)); + ARM_COMPUTE_RETURN_ON_ERROR(CLBoundingBoxTransformKernel::validate( + &all_anchors_f32_info, &proposals_4_roi_values_f32, &deltas_flattened_f32_info, + BoundingBoxTransformInfo(info.im_width(), info.im_height(), 1.f))); + + ARM_COMPUTE_RETURN_ON_ERROR( + CLQuantizationLayer::validate(&proposals_4_roi_values_f32, &proposals_4_roi_values_quantized)); proposals_4_roi_values_to_use = &proposals_4_roi_values_quantized; } else { - ARM_COMPUTE_RETURN_ON_ERROR(CLBoundingBoxTransformKernel::validate(&all_anchors_info, &proposals_4_roi_values, &deltas_flattened_info, - BoundingBoxTransformInfo(info.im_width(), info.im_height(), 1.f))); + ARM_COMPUTE_RETURN_ON_ERROR( + CLBoundingBoxTransformKernel::validate(&all_anchors_info, &proposals_4_roi_values, &deltas_flattened_info, + BoundingBoxTransformInfo(info.im_width(), info.im_height(), 1.f))); } - ARM_COMPUTE_RETURN_ON_ERROR(CLPadLayerKernel::validate(proposals_4_roi_values_to_use, proposals, PaddingList{ { 1, 0 } })); + ARM_COMPUTE_RETURN_ON_ERROR( + CLPadLayerKernel::validate(proposals_4_roi_values_to_use, proposals, PaddingList{{1, 0}})); - if(num_valid_proposals->total_size() > 0) + if (num_valid_proposals->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON(num_valid_proposals->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON(num_valid_proposals->dimension(0) > 1); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(num_valid_proposals, 1, DataType::U32); } - if(proposals->total_size() > 0) + if (proposals->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON(proposals->num_dimensions() > 2); ARM_COMPUTE_RETURN_ERROR_ON(proposals->dimension(0) != size_t(values_per_roi) + 1); ARM_COMPUTE_RETURN_ERROR_ON(proposals->dimension(1) != size_t(total_num_anchors)); - if(is_qasymm8) + if (is_qasymm8) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(proposals, 1, DataType::QASYMM16); const UniformQuantizationInfo proposals_qinfo = proposals->quantization_info().uniform(); @@ -298,7 +365,7 @@ Status CLGenerateProposalsLayer::validate(const ITensorInfo *scores, const ITens } } - if(scores_out->total_size() > 0) + if (scores_out->total_size() > 0) { 
ARM_COMPUTE_RETURN_ERROR_ON(scores_out->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON(scores_out->dimension(0) != size_t(total_num_anchors)); @@ -342,34 +409,34 @@ void CLGenerateProposalsLayer::run() MemoryGroupResourceScope scope_mg(_memory_group); // Compute all the anchors - CLScheduler::get().enqueue(_compute_anchors_kernel, false); + CLScheduler::get().enqueue(*_compute_anchors_kernel, false); // Transpose and reshape the inputs - if(!_is_nhwc) + if (!_is_nhwc) { - CLScheduler::get().enqueue(_permute_deltas_kernel, false); - CLScheduler::get().enqueue(_permute_scores_kernel, false); + _permute_deltas.run(); + _permute_scores.run(); } - CLScheduler::get().enqueue(_flatten_deltas_kernel, false); - CLScheduler::get().enqueue(_flatten_scores_kernel, false); + _flatten_deltas.run(); + _flatten_scores.run(); - if(_is_qasymm8) + if (_is_qasymm8) { - CLScheduler::get().enqueue(_dequantize_anchors, false); - CLScheduler::get().enqueue(_dequantize_deltas, false); + _dequantize_anchors->run(); + _dequantize_deltas->run(); } // Build the boxes - CLScheduler::get().enqueue(_bounding_box_kernel, false); + CLScheduler::get().enqueue(*_bounding_box_kernel, false); - if(_is_qasymm8) + if (_is_qasymm8) { - CLScheduler::get().enqueue(_quantize_all_proposals, false); + _quantize_all_proposals->run(); } // Non maxima suppression run_cpp_nms_kernel(); // Add dummy batch indexes - CLScheduler::get().enqueue(_pad_kernel, true); + CLScheduler::get().enqueue(*_pad_kernel, true); } } // namespace arm_compute diff --git a/src/runtime/CL/functions/CLHOGDescriptor.cpp b/src/runtime/CL/functions/CLHOGDescriptor.cpp deleted file mode 100644 index 0645cfdf22..0000000000 --- a/src/runtime/CL/functions/CLHOGDescriptor.cpp +++ /dev/null @@ -1,113 +0,0 @@ -/* - * Copyright (c) 2017-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "arm_compute/runtime/CL/functions/CLHOGDescriptor.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/HOGInfo.h" -#include "arm_compute/core/Size2D.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/runtime/CL/CLScheduler.h" - -using namespace arm_compute; - -CLHOGDescriptor::CLHOGDescriptor(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _gradient(), _orient_bin(), _block_norm(), _mag(), _phase(), _hog_space() -{ -} - -void CLHOGDescriptor::configure(ICLTensor *input, ICLTensor *output, const IHOG *hog, BorderMode border_mode, uint8_t constant_border_value) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output, hog, border_mode, constant_border_value); -} - -void CLHOGDescriptor::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const IHOG *hog, BorderMode border_mode, uint8_t constant_border_value) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON(nullptr == output); - ARM_COMPUTE_ERROR_ON(nullptr == hog); - - const HOGInfo *hog_info = hog->info(); - const size_t width = input->info()->dimension(Window::DimX); - const size_t height = input->info()->dimension(Window::DimY); - const size_t num_bins = hog_info->num_bins(); - - Size2D cell_size = hog_info->cell_size(); - - // Calculate number of cells along the x and y directions for the hog_space - const size_t num_cells_x = width / cell_size.width; - const size_t num_cells_y = height / cell_size.height; - - // TensorShape of the input image - const TensorShape &shape_img = input->info()->tensor_shape(); - - // TensorShape of the hog space - TensorShape shape_hog_space = input->info()->tensor_shape(); - shape_hog_space.set(Window::DimX, num_cells_x); - shape_hog_space.set(Window::DimY, num_cells_y); - - // Intitialize tensors for magnitude, phase and hog space - TensorInfo info_mag(shape_img, Format::S16); - _mag.allocator()->init(info_mag); - - TensorInfo info_phase(shape_img, Format::U8); - _phase.allocator()->init(info_phase); - - TensorInfo info_space(shape_hog_space, num_bins, DataType::F32); - _hog_space.allocator()->init(info_space); - - // Manage intermediate buffers - _memory_group.manage(&_mag); - _memory_group.manage(&_phase); - - // Initialise gradient kernel - _gradient.configure(compile_context, input, &_mag, &_phase, hog_info->phase_type(), border_mode, constant_border_value); - - // Manage intermediate buffers - _memory_group.manage(&_hog_space); - - // Initialise orientation binning kernel - _orient_bin.configure(compile_context, &_mag, &_phase, &_hog_space, hog->info()); - - // Initialize HOG norm kernel - _block_norm.configure(compile_context, &_hog_space, output, hog->info()); - - // Allocate intermediate tensors - _mag.allocator()->allocate(); - _phase.allocator()->allocate(); - _hog_space.allocator()->allocate(); -} - -void CLHOGDescriptor::run() -{ - MemoryGroupResourceScope scope_mg(_memory_group); - - // Run gradient - _gradient.run(); - - // Run orientation binning - CLScheduler::get().enqueue(_orient_bin, false); - - // Run block normalization - CLScheduler::get().enqueue(_block_norm); -}
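The deleted CLHOGDescriptor::configure() above sizes its HOG space by dividing the image dimensions by the HOG cell size before initializing the magnitude, phase and HOG-space tensors. A minimal standalone sketch of that shape computation, with stand-in types (GridSize and hog_space_grid are illustrative names, not library API):

#include <cstddef>
#include <iostream>

struct GridSize
{
    size_t width;
    size_t height;
};

// Mirrors the removed code: one orientation histogram is produced per cell,
// so the HOG space is the image resolution divided by the cell size
// (integer floor division, as in the deleted configure()).
GridSize hog_space_grid(size_t image_w, size_t image_h, const GridSize &cell)
{
    return GridSize{image_w / cell.width, image_h / cell.height};
}

int main()
{
    const GridSize cells = hog_space_grid(128, 64, GridSize{8, 8});
    std::cout << cells.width << "x" << cells.height << std::endl; // prints 16x8
    return 0;
}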
\ No newline at end of file diff --git a/src/runtime/CL/functions/CLHOGDetector.cpp b/src/runtime/CL/functions/CLHOGDetector.cpp deleted file mode 100644 index bf9bae1e8b..0000000000 --- a/src/runtime/CL/functions/CLHOGDetector.cpp +++ /dev/null @@ -1,75 +0,0 @@ -/* - * Copyright (c) 2017-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/CL/functions/CLHOGDetector.h" - -#include "arm_compute/core/CL/kernels/CLHOGDetectorKernel.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/runtime/CL/CLScheduler.h" - -#include <algorithm> - -using namespace arm_compute; - -CLHOGDetector::CLHOGDetector() - : _hog_detector_kernel(), _detection_windows(nullptr), _num_detection_windows() -{ -} - -void CLHOGDetector::configure(const ICLTensor *input, const ICLHOG *hog, ICLDetectionWindowArray *detection_windows, const Size2D &detection_window_stride, float threshold, size_t idx_class) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, hog, detection_windows, detection_window_stride, threshold, idx_class); -} - -void CLHOGDetector::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLHOG *hog, ICLDetectionWindowArray *detection_windows, const Size2D &detection_window_stride, - float threshold, size_t idx_class) -{ - _detection_windows = detection_windows; - - // Allocate buffer for storing the number of detected objects - _num_detection_windows = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(unsigned int)); - - // Configure HOGDetectorKernel - _hog_detector_kernel.configure(compile_context, input, hog, detection_windows, &_num_detection_windows, detection_window_stride, threshold, idx_class); -} - -void CLHOGDetector::run() -{ - cl::CommandQueue q = CLScheduler::get().queue(); - - // Reset number of detections - const unsigned int init_num_detection_windows = _detection_windows->num_values(); - q.enqueueWriteBuffer(_num_detection_windows, CL_FALSE, 0, sizeof(unsigned int), &init_num_detection_windows); - - // Run CLHOGDetectorKernel - CLScheduler::get().enqueue(_hog_detector_kernel); - - // Read number of detections - unsigned int num_detection_windows = 0; - q.enqueueReadBuffer(_num_detection_windows, CL_TRUE, 0, sizeof(unsigned int), &num_detection_windows); - - // Update the number of values stored in _detection_windows - 
_detection_windows->resize(static_cast<size_t>(num_detection_windows));
-
-    q.flush();
-}
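The run() body above follows a small host/device round trip: reset a one-word counter buffer, let the detector kernel bump it on the device, then read the final value back and resize the detection-window array to match. A distilled sketch of the same pattern with the plain OpenCL C++ wrapper; the header name varies across SDKs, and the actual kernel dispatch is elided:

#include <CL/cl2.hpp> // may be <CL/opencl.hpp> on newer SDKs
#include <iostream>

int main()
{
    cl::Context      context(CL_DEVICE_TYPE_DEFAULT);
    cl::CommandQueue queue(context);

    // One-word, host-visible counter, like _num_detection_windows above.
    cl::Buffer counter(context, CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(unsigned int));

    // Reset the counter before dispatch (non-blocking write).
    unsigned int init = 0;
    queue.enqueueWriteBuffer(counter, CL_FALSE, 0, sizeof(unsigned int), &init);

    // ... a kernel that atomically increments the counter would run here ...

    // Blocking read gives the number of items the kernel produced.
    unsigned int produced = 0;
    queue.enqueueReadBuffer(counter, CL_TRUE, 0, sizeof(unsigned int), &produced);
    std::cout << "produced = " << produced << std::endl;
    return 0;
}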
\ No newline at end of file diff --git a/src/runtime/CL/functions/CLHOGGradient.cpp b/src/runtime/CL/functions/CLHOGGradient.cpp deleted file mode 100644 index acf5f2c568..0000000000 --- a/src/runtime/CL/functions/CLHOGGradient.cpp +++ /dev/null @@ -1,87 +0,0 @@ -/* - * Copyright (c) 2017-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/CL/functions/CLHOGGradient.h" - -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/runtime/CL/CLScheduler.h" - -using namespace arm_compute; - -CLHOGGradient::CLHOGGradient(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _derivative(), _mag_phase(), _gx(), _gy() -{ -} - -void CLHOGGradient::configure(ICLTensor *input, ICLTensor *output_magnitude, ICLTensor *output_phase, PhaseType phase_type, BorderMode border_mode, uint8_t constant_border_value) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output_magnitude, output_phase, phase_type, border_mode, constant_border_value); -} - -void CLHOGGradient::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output_magnitude, ICLTensor *output_phase, PhaseType phase_type, BorderMode border_mode, - uint8_t constant_border_value) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_magnitude, 1, DataType::S16); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_phase, 1, DataType::U8); - - const TensorShape &shape_img = input->info()->tensor_shape(); - - // Allocate image memory - TensorInfo info(shape_img, Format::S16); - _gx.allocator()->init(info); - _gy.allocator()->init(info); - - // Manage intermediate buffers - _memory_group.manage(&_gx); - _memory_group.manage(&_gy); - - // Initialise derivate kernel - _derivative.configure(compile_context, input, &_gx, &_gy, border_mode, constant_border_value); - - // Initialise magnitude/phase kernel - if(PhaseType::UNSIGNED == phase_type) - { - _mag_phase.configure(compile_context, &_gx, &_gy, output_magnitude, output_phase, MagnitudeType::L2NORM, PhaseType::UNSIGNED); - } - else - { - _mag_phase.configure(compile_context, &_gx, &_gy, output_magnitude, output_phase, MagnitudeType::L2NORM, PhaseType::SIGNED); - } - - // Allocate intermediate tensors - _gx.allocator()->allocate(); - _gy.allocator()->allocate(); -} - -void 
CLHOGGradient::run()
-{
-    MemoryGroupResourceScope scope_mg(_memory_group);
-
-    // Run derivative
-    _derivative.run();
-
-    // Run magnitude/phase kernel
-    CLScheduler::get().enqueue(_mag_phase);
-}
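The deleted CLHOGGradient pairs a derivative stage with a magnitude/phase stage (MagnitudeType::L2NORM plus a signed or unsigned phase). A scalar C++ reference for the per-pixel arithmetic; the U8 phase encoding below (quantizing the angle range to 0..255) is an assumption for illustration, not the kernel's exact rounding:

#include <cmath>
#include <cstdint>
#include <iostream>

constexpr float kPi = 3.14159265358979f;

struct MagPhase
{
    int16_t magnitude; // L2 magnitude, matching the removed S16 output
    uint8_t phase;     // quantized angle
};

MagPhase mag_phase(int16_t gx, int16_t gy, bool unsigned_phase)
{
    const float mag   = std::sqrt(float(gx) * float(gx) + float(gy) * float(gy));
    float       angle = std::atan2(float(gy), float(gx)); // in [-pi, pi]

    // UNSIGNED phase folds opposite gradient directions together ([0, pi)),
    // SIGNED keeps the full circle ([0, 2*pi)).
    const float range = unsigned_phase ? kPi : 2.0f * kPi;
    if (angle < 0.0f)
    {
        angle += range;
    }

    return MagPhase{static_cast<int16_t>(mag), static_cast<uint8_t>(angle / range * 255.0f)};
}

int main()
{
    const MagPhase mp = mag_phase(3, 4, true);
    std::cout << mp.magnitude << " " << int(mp.phase) << std::endl; // magnitude 5
    return 0;
}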
\ No newline at end of file diff --git a/src/runtime/CL/functions/CLHOGMultiDetection.cpp b/src/runtime/CL/functions/CLHOGMultiDetection.cpp deleted file mode 100644 index 248f7307e6..0000000000 --- a/src/runtime/CL/functions/CLHOGMultiDetection.cpp +++ /dev/null @@ -1,274 +0,0 @@ -/* - * Copyright (c) 2017-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/CL/functions/CLHOGMultiDetection.h" - -#include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/runtime/CL/CLArray.h" -#include "arm_compute/runtime/CL/CLScheduler.h" -#include "arm_compute/runtime/CL/CLTensor.h" -#include "arm_compute/runtime/Scheduler.h" - -using namespace arm_compute; - -CLHOGMultiDetection::CLHOGMultiDetection(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT - : _memory_group(std::move(memory_manager)), - _gradient_kernel(), - _orient_bin_kernel(), - _block_norm_kernel(), - _hog_detect_kernel(), - _non_maxima_kernel(), - _hog_space(), - _hog_norm_space(), - _detection_windows(), - _mag(), - _phase(), - _non_maxima_suppression(false), - _num_orient_bin_kernel(0), - _num_block_norm_kernel(0), - _num_hog_detect_kernel(0) -{ -} - -void CLHOGMultiDetection::configure(ICLTensor *input, const ICLMultiHOG *multi_hog, ICLDetectionWindowArray *detection_windows, ICLSize2DArray *detection_window_strides, BorderMode border_mode, - uint8_t constant_border_value, float threshold, bool non_maxima_suppression, float min_distance) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, multi_hog, detection_windows, detection_window_strides, border_mode, constant_border_value, threshold, non_maxima_suppression, - min_distance); -} - -void CLHOGMultiDetection::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLMultiHOG *multi_hog, ICLDetectionWindowArray *detection_windows, - ICLSize2DArray *detection_window_strides, BorderMode border_mode, - uint8_t constant_border_value, float threshold, bool non_maxima_suppression, float min_distance) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON_INVALID_MULTI_HOG(multi_hog); - ARM_COMPUTE_ERROR_ON(nullptr == detection_windows); - ARM_COMPUTE_ERROR_ON(detection_window_strides->num_values() != multi_hog->num_models()); - - const size_t width = input->info()->dimension(Window::DimX); - const size_t height = 
input->info()->dimension(Window::DimY); - const TensorShape &shape_img = input->info()->tensor_shape(); - const size_t num_models = multi_hog->num_models(); - PhaseType phase_type = multi_hog->model(0)->info()->phase_type(); - - size_t prev_num_bins = multi_hog->model(0)->info()->num_bins(); - Size2D prev_cell_size = multi_hog->model(0)->info()->cell_size(); - Size2D prev_block_size = multi_hog->model(0)->info()->block_size(); - Size2D prev_block_stride = multi_hog->model(0)->info()->block_stride(); - - /* Check if CLHOGOrientationBinningKernel and CLHOGBlockNormalizationKernel kernels can be skipped for a specific HOG data-object - * - * 1) CLHOGOrientationBinningKernel and CLHOGBlockNormalizationKernel are skipped if the cell size and the number of bins don't change. - * Since "multi_hog" is sorted,it is enough to check the HOG descriptors at level "ith" and level "(i-1)th - * 2) CLHOGBlockNormalizationKernel is skipped if the cell size, the number of bins and block size do not change. - * Since "multi_hog" is sorted,it is enough to check the HOG descriptors at level "ith" and level "(i-1)th - * - * @note Since the orientation binning and block normalization kernels can be skipped, we need to keep track of the input to process for each kernel - * with "input_orient_bin", "input_hog_detect" and "input_block_norm" - */ - std::vector<size_t> input_orient_bin; - std::vector<size_t> input_hog_detect; - std::vector<std::pair<size_t, size_t>> input_block_norm; - - input_orient_bin.push_back(0); - input_hog_detect.push_back(0); - input_block_norm.emplace_back(0, 0); - - for(size_t i = 1; i < num_models; ++i) - { - size_t cur_num_bins = multi_hog->model(i)->info()->num_bins(); - Size2D cur_cell_size = multi_hog->model(i)->info()->cell_size(); - Size2D cur_block_size = multi_hog->model(i)->info()->block_size(); - Size2D cur_block_stride = multi_hog->model(i)->info()->block_stride(); - - if((cur_num_bins != prev_num_bins) || (cur_cell_size.width != prev_cell_size.width) || (cur_cell_size.height != prev_cell_size.height)) - { - prev_num_bins = cur_num_bins; - prev_cell_size = cur_cell_size; - prev_block_size = cur_block_size; - prev_block_stride = cur_block_stride; - - // Compute orientation binning and block normalization kernels. Update input to process - input_orient_bin.push_back(i); - input_block_norm.emplace_back(i, input_orient_bin.size() - 1); - } - else if((cur_block_size.width != prev_block_size.width) || (cur_block_size.height != prev_block_size.height) || (cur_block_stride.width != prev_block_stride.width) - || (cur_block_stride.height != prev_block_stride.height)) - { - prev_block_size = cur_block_size; - prev_block_stride = cur_block_stride; - - // Compute block normalization kernel. 
Update input to process - input_block_norm.emplace_back(i, input_orient_bin.size() - 1); - } - - // Update input to process for hog detector kernel - input_hog_detect.push_back(input_block_norm.size() - 1); - } - - _detection_windows = detection_windows; - _non_maxima_suppression = non_maxima_suppression; - _num_orient_bin_kernel = input_orient_bin.size(); // Number of CLHOGOrientationBinningKernel kernels to compute - _num_block_norm_kernel = input_block_norm.size(); // Number of CLHOGBlockNormalizationKernel kernels to compute - _num_hog_detect_kernel = input_hog_detect.size(); // Number of CLHOGDetector functions to compute - - _orient_bin_kernel.resize(_num_orient_bin_kernel); - _block_norm_kernel.resize(_num_block_norm_kernel); - _hog_detect_kernel.resize(_num_hog_detect_kernel); - _hog_space.resize(_num_orient_bin_kernel); - _hog_norm_space.resize(_num_block_norm_kernel); - - // Allocate tensors for magnitude and phase - TensorInfo info_mag(shape_img, Format::S16); - _mag.allocator()->init(info_mag); - - TensorInfo info_phase(shape_img, Format::U8); - _phase.allocator()->init(info_phase); - - // Manage intermediate buffers - _memory_group.manage(&_mag); - _memory_group.manage(&_phase); - - // Initialise gradient kernel - _gradient_kernel.configure(compile_context, input, &_mag, &_phase, phase_type, border_mode, constant_border_value); - - // Configure NETensor for the HOG space and orientation binning kernel - for(size_t i = 0; i < _num_orient_bin_kernel; ++i) - { - const size_t idx_multi_hog = input_orient_bin[i]; - - // Get the corresponding cell size and number of bins - const Size2D &cell = multi_hog->model(idx_multi_hog)->info()->cell_size(); - const size_t num_bins = multi_hog->model(idx_multi_hog)->info()->num_bins(); - - // Calculate number of cells along the x and y directions for the hog_space - const size_t num_cells_x = width / cell.width; - const size_t num_cells_y = height / cell.height; - - // TensorShape of hog space - TensorShape shape_hog_space = input->info()->tensor_shape(); - shape_hog_space.set(Window::DimX, num_cells_x); - shape_hog_space.set(Window::DimY, num_cells_y); - - // Allocate HOG space - TensorInfo info_space(shape_hog_space, num_bins, DataType::F32); - _hog_space[i].allocator()->init(info_space); - - // Manage intermediate buffers - _memory_group.manage(&_hog_space[i]); - - // Initialise orientation binning kernel - _orient_bin_kernel[i].configure(compile_context, &_mag, &_phase, &_hog_space[i], multi_hog->model(idx_multi_hog)->info()); - } - - // Allocate intermediate tensors - _mag.allocator()->allocate(); - _phase.allocator()->allocate(); - - // Configure CLTensor for the normalized HOG space and block normalization kernel - for(size_t i = 0; i < _num_block_norm_kernel; ++i) - { - const size_t idx_multi_hog = input_block_norm[i].first; - const size_t idx_orient_bin = input_block_norm[i].second; - - // Allocate normalized HOG space - TensorInfo tensor_info(*(multi_hog->model(idx_multi_hog)->info()), width, height); - _hog_norm_space[i].allocator()->init(tensor_info); - - // Manage intermediate buffers - _memory_group.manage(&_hog_norm_space[i]); - - // Initialize block normalization kernel - _block_norm_kernel[i].configure(compile_context, &_hog_space[idx_orient_bin], &_hog_norm_space[i], multi_hog->model(idx_multi_hog)->info()); - } - - // Allocate intermediate tensors - for(size_t i = 0; i < _num_orient_bin_kernel; ++i) - { - _hog_space[i].allocator()->allocate(); - } - - detection_window_strides->map(CLScheduler::get().queue(), true); - - // 
Configure HOG detector kernel - for(size_t i = 0; i < _num_hog_detect_kernel; ++i) - { - const size_t idx_block_norm = input_hog_detect[i]; - - _hog_detect_kernel[i].configure(compile_context, &_hog_norm_space[idx_block_norm], multi_hog->cl_model(i), detection_windows, detection_window_strides->at(i), threshold, i); - } - - detection_window_strides->unmap(CLScheduler::get().queue()); - - // Configure non maxima suppression kernel - _non_maxima_kernel.configure(_detection_windows, min_distance); - - // Allocate intermediate tensors - for(size_t i = 0; i < _num_block_norm_kernel; ++i) - { - _hog_norm_space[i].allocator()->allocate(); - } -} - -void CLHOGMultiDetection::run() -{ - ARM_COMPUTE_ERROR_ON_MSG(_detection_windows == nullptr, "Unconfigured function"); - - MemoryGroupResourceScope scope_mg(_memory_group); - - // Reset detection window - _detection_windows->clear(); - - // Run gradient - _gradient_kernel.run(); - - // Run orientation binning kernel - for(size_t i = 0; i < _num_orient_bin_kernel; ++i) - { - CLScheduler::get().enqueue(_orient_bin_kernel[i], false); - } - - // Run block normalization kernel - for(size_t i = 0; i < _num_block_norm_kernel; ++i) - { - CLScheduler::get().enqueue(_block_norm_kernel[i], false); - } - - // Run HOG detector kernel - for(size_t i = 0; i < _num_hog_detect_kernel; ++i) - { - _hog_detect_kernel[i].run(); - } - - // Run non-maxima suppression kernel if enabled - if(_non_maxima_suppression) - { - // Map detection windows array before computing non maxima suppression - _detection_windows->map(CLScheduler::get().queue(), true); - Scheduler::get().schedule(&_non_maxima_kernel, Window::DimY); - _detection_windows->unmap(CLScheduler::get().queue()); - } -} diff --git a/src/runtime/CL/functions/CLHarrisCorners.cpp b/src/runtime/CL/functions/CLHarrisCorners.cpp deleted file mode 100644 index aecec0d3c5..0000000000 --- a/src/runtime/CL/functions/CLHarrisCorners.cpp +++ /dev/null @@ -1,195 +0,0 @@ -/* - * Copyright (c) 2016-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "arm_compute/runtime/CL/functions/CLHarrisCorners.h" - -#include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h" -#include "arm_compute/core/CL/kernels/CLHarrisCornersKernel.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/runtime/CL/CLScheduler.h" -#include "arm_compute/runtime/CL/functions/CLSobel3x3.h" -#include "arm_compute/runtime/CL/functions/CLSobel5x5.h" -#include "arm_compute/runtime/CL/functions/CLSobel7x7.h" -#include "arm_compute/runtime/ITensorAllocator.h" -#include "arm_compute/runtime/Scheduler.h" -#include "support/MemorySupport.h" - -#include <cmath> -#include <utility> - -using namespace arm_compute; - -CLHarrisCorners::CLHarrisCorners(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT - : _memory_group(std::move(memory_manager)), - _sobel(nullptr), - _harris_score(), - _non_max_suppr(), - _candidates(), - _sort_euclidean(), - _border_gx(), - _border_gy(), - _gx(), - _gy(), - _score(), - _nonmax(), - _corners_list(), - _num_corner_candidates(0), - _corners(nullptr) -{ -} - -void CLHarrisCorners::configure(ICLImage *input, float threshold, float min_dist, - float sensitivity, int32_t gradient_size, int32_t block_size, ICLKeyPointArray *corners, - BorderMode border_mode, uint8_t constant_border_value, bool use_fp16) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, threshold, min_dist, sensitivity, gradient_size, block_size, corners, border_mode, constant_border_value, use_fp16); -} - -void CLHarrisCorners::configure(const CLCompileContext &compile_context, ICLImage *input, float threshold, float min_dist, - float sensitivity, int32_t gradient_size, int32_t block_size, ICLKeyPointArray *corners, - BorderMode border_mode, uint8_t constant_border_value, bool use_fp16) -{ - ARM_COMPUTE_UNUSED(use_fp16); //TODO(COMPMID-772): Add half float support - ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON(!(block_size == 3 || block_size == 5 || block_size == 7)); - ARM_COMPUTE_ERROR_ON(nullptr == corners); - - _corners = corners; - - const TensorShape shape = input->info()->tensor_shape(); - const DataType dt = (gradient_size < 7) ? 
DataType::S16 : DataType::S32; - TensorInfo tensor_info(shape, 1, dt); - - _gx.allocator()->init(tensor_info); - _gy.allocator()->init(tensor_info); - - TensorInfo info_f32(shape, 1, DataType::F32); - _score.allocator()->init(info_f32); - _nonmax.allocator()->init(info_f32); - - _corners_list.resize(shape.x() * shape.y()); - - // Manage intermediate buffers - _memory_group.manage(&_gx); - _memory_group.manage(&_gy); - - /* Set/init Sobel kernel accordingly with gradient_size */ - switch(gradient_size) - { - case 3: - { - auto k = arm_compute::support::cpp14::make_unique<CLSobel3x3>(); - k->configure(compile_context, input, &_gx, &_gy, border_mode, constant_border_value); - _sobel = std::move(k); - break; - } - case 5: - { - auto k = arm_compute::support::cpp14::make_unique<CLSobel5x5>(); - k->configure(compile_context, input, &_gx, &_gy, border_mode, constant_border_value); - _sobel = std::move(k); - break; - } - case 7: - { - auto k = arm_compute::support::cpp14::make_unique<CLSobel7x7>(); - k->configure(compile_context, input, &_gx, &_gy, border_mode, constant_border_value); - _sobel = std::move(k); - break; - } - default: - ARM_COMPUTE_ERROR("Gradient size not implemented"); - } - - // Normalization factor - const float norm_factor = 1.0f / (255.0f * pow(4.0f, gradient_size / 2) * block_size); - const float pow4_normalization_factor = pow(norm_factor, 4); - - // Manage intermediate buffers - _memory_group.manage(&_score); - - // Set/init Harris Score kernel accordingly with block_size - _harris_score.configure(compile_context, &_gx, &_gy, &_score, block_size, pow4_normalization_factor, threshold, sensitivity, border_mode == BorderMode::UNDEFINED); - - // Configure border filling using harris score kernel's block size - _border_gx.configure(compile_context, &_gx, _harris_score.border_size(), border_mode, PixelValue(constant_border_value)); - _border_gy.configure(compile_context, &_gy, _harris_score.border_size(), border_mode, PixelValue(constant_border_value)); - - // Allocate intermediate buffers - _gx.allocator()->allocate(); - _gy.allocator()->allocate(); - - // Manage intermediate buffers - _memory_group.manage(&_nonmax); - - // Init non-maxima suppression function - _non_max_suppr.configure(compile_context, &_score, &_nonmax, border_mode); - - // Allocate intermediate buffers - _score.allocator()->allocate(); - - // Init corner candidates kernel - _candidates.configure(&_nonmax, _corners_list.data(), &_num_corner_candidates); - - // Allocate intermediate buffers - _nonmax.allocator()->allocate(); - - // Init euclidean distance - _sort_euclidean.configure(_corners_list.data(), _corners, &_num_corner_candidates, min_dist); -} - -void CLHarrisCorners::run() -{ - ARM_COMPUTE_ERROR_ON_MSG(_sobel == nullptr, "Unconfigured function"); - - MemoryGroupResourceScope scope_mg(_memory_group); - - // Init to 0 number of corner candidates - _num_corner_candidates = 0; - - // Run Sobel kernel - _sobel->run(); - - // Fill border before harris score kernel - CLScheduler::get().enqueue(_border_gx, false); - CLScheduler::get().enqueue(_border_gy, false); - - // Run harris score kernel - CLScheduler::get().enqueue(_harris_score, false); - - // Run non-maxima suppression - _non_max_suppr.run(); - - // Run corner candidate kernel - _nonmax.map(true); - Scheduler::get().schedule(&_candidates, Window::DimY); - _nonmax.unmap(); - - _corners->map(CLScheduler::get().queue(), true); - Scheduler::get().schedule(&_sort_euclidean, Window::DimY); - _corners->unmap(CLScheduler::get().queue()); -} diff --git 
a/src/runtime/CL/functions/CLHistogram.cpp b/src/runtime/CL/functions/CLHistogram.cpp deleted file mode 100644 index e723024334..0000000000 --- a/src/runtime/CL/functions/CLHistogram.cpp +++ /dev/null @@ -1,50 +0,0 @@ -/* - * Copyright (c) 2016-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/CL/functions/CLHistogram.h" - -#include "arm_compute/runtime/CL/CLScheduler.h" - -using namespace arm_compute; - -CLHistogram::CLHistogram() - : _kernel(), _kernel_border() -{ -} - -void CLHistogram::configure(const ICLImage *input, ICLDistribution1D *output) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output); -} - -void CLHistogram::configure(const CLCompileContext &compile_context, const ICLImage *input, ICLDistribution1D *output) -{ - _kernel.configure(compile_context, input, output); - _kernel_border.configure(compile_context, input, output); -} - -void CLHistogram::run() -{ - CLScheduler::get().enqueue(_kernel, false); - CLScheduler::get().enqueue(_kernel_border); -} diff --git a/src/runtime/CL/functions/CLIndirectConvolutionLayer.cpp b/src/runtime/CL/functions/CLIndirectConvolutionLayer.cpp new file mode 100644 index 0000000000..1a2369c5c2 --- /dev/null +++ b/src/runtime/CL/functions/CLIndirectConvolutionLayer.cpp @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/CL/functions/CLIndirectConvolutionLayer.h" + +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" + +#include "src/common/utils/Log.h" +#include "src/gpu/cl/operators/ClIndirectConv2d.h" + +namespace arm_compute +{ +struct CLIndirectConvolutionLayer::Impl +{ + const ICLTensor *src{nullptr}; + const ICLTensor *weights{nullptr}; + const ICLTensor *biases{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<opencl::ClIndirectConv2d> op{nullptr}; +}; + +CLIndirectConvolutionLayer::CLIndirectConvolutionLayer() : _impl(std::make_unique<Impl>()) +{ +} +CLIndirectConvolutionLayer::CLIndirectConvolutionLayer(CLIndirectConvolutionLayer &&) = default; +CLIndirectConvolutionLayer &CLIndirectConvolutionLayer::operator=(CLIndirectConvolutionLayer &&) = default; +CLIndirectConvolutionLayer::~CLIndirectConvolutionLayer() = default; + +void CLIndirectConvolutionLayer::configure(ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *output, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, act_info); +} + +void CLIndirectConvolutionLayer::configure(const CLCompileContext &compile_context, + ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *output, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); + ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, act_info); + + _impl->src = input; + _impl->weights = weights; + _impl->biases = biases; + _impl->dst = output; + _impl->op = std::make_unique<opencl::ClIndirectConv2d>(); + _impl->op->configure(compile_context, input->info(), weights->info(), + (biases != nullptr) ? biases->info() : nullptr, output->info(), conv_info, act_info); +} + +Status CLIndirectConvolutionLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info) +{ + return opencl::ClIndirectConv2d::validate(input, weights, biases, output, conv_info, act_info); +} + +void CLIndirectConvolutionLayer::run() +{ + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC, _impl->src); + pack.add_tensor(TensorType::ACL_SRC_1, _impl->weights); + pack.add_tensor(TensorType::ACL_SRC_2, _impl->biases); + pack.add_tensor(TensorType::ACL_DST, _impl->dst); + _impl->op->run(pack); +} +} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLInstanceNormalizationLayer.cpp b/src/runtime/CL/functions/CLInstanceNormalizationLayer.cpp index 273a873c81..0e994e1aee 100644 --- a/src/runtime/CL/functions/CLInstanceNormalizationLayer.cpp +++ b/src/runtime/CL/functions/CLInstanceNormalizationLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 ARM Limited. + * Copyright (c) 2019-2021 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -23,29 +23,68 @@ */ #include "arm_compute/runtime/CL/functions/CLInstanceNormalizationLayer.h" -#include "arm_compute/core/CL/kernels/CLInstanceNormalizationLayerKernel.h" +#include "arm_compute/core/Error.h" #include "arm_compute/core/Types.h" +#include "arm_compute/runtime/CL/CLHelpers.h" +#include "arm_compute/runtime/CL/CLScheduler.h" + +#include "src/common/utils/Log.h" +#include "src/core/CL/ICLKernel.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLInstanceNormalizationLayerKernel.h" namespace arm_compute { -CLInstanceNormalizationLayer::CLInstanceNormalizationLayer() +CLInstanceNormalizationLayer::CLInstanceNormalizationLayer(CLRuntimeContext *ctx) // NOLINT + : _inst_norm_kernel(), _mean_var_kernel(), _mean_var_tensor(), _ctx(ctx) +{ +} +CLInstanceNormalizationLayer::~CLInstanceNormalizationLayer() { } -void CLInstanceNormalizationLayer::configure(ICLTensor *input, ICLTensor *output, float gamma, float beta, float epsilon, bool use_mixed_precision) +void CLInstanceNormalizationLayer::configure( + ICLTensor *input, ICLTensor *output, float gamma, float beta, float epsilon, bool use_mixed_precision) { configure(CLKernelLibrary::get().get_compile_context(), input, output, gamma, beta, epsilon, use_mixed_precision); } -void CLInstanceNormalizationLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, float gamma, float beta, float epsilon, bool use_mixed_precision) +void CLInstanceNormalizationLayer::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output, + float gamma, + float beta, + float epsilon, + bool use_mixed_precision) { - auto k = arm_compute::support::cpp14::make_unique<CLInstanceNormalizationLayerKernel>(); - k->configure(compile_context, input, output, InstanceNormalizationLayerKernelInfo(gamma, beta, epsilon, use_mixed_precision)); - _kernel = std::move(k); + ARM_COMPUTE_LOG_PARAMS(input, output, gamma, beta, epsilon, use_mixed_precision); + auto w = std::make_unique<CLComputeMeanVariance>(); + w->configure(compile_context, input, &_mean_var_tensor, use_mixed_precision); + _mean_var_kernel = std::move(w); + auto k = std::make_unique<CLInstanceNormalizationLayerKernel>(); + k->configure(compile_context, input, &_mean_var_tensor, output, + InstanceNormalizationLayerKernelInfo(gamma, beta, epsilon, use_mixed_precision)); + _inst_norm_kernel = std::move(k); + _mean_var_tensor.allocator()->allocate(); } -Status CLInstanceNormalizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, float gamma, float beta, float epsilon, bool use_mixed_precision) +Status CLInstanceNormalizationLayer::validate(const ITensorInfo *input, + const ITensorInfo *output, + float gamma, + float beta, + float epsilon, + bool use_mixed_precision) { - return CLInstanceNormalizationLayerKernel::validate(input, output, InstanceNormalizationLayerKernelInfo(gamma, beta, epsilon, use_mixed_precision)); + return CLInstanceNormalizationLayerKernel::validate( + input, output, InstanceNormalizationLayerKernelInfo(gamma, beta, epsilon, use_mixed_precision)); } -} // namespace arm_compute
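The rework above splits instance normalization into a CLComputeMeanVariance pass that fills _mean_var_tensor and a kernel that applies gamma and beta. A scalar C++ reference of the same two-pass arithmetic over a single spatial plane (a sketch of the math only, not the kernels' actual tiling or mixed-precision paths):

#include <cmath>
#include <iostream>
#include <vector>

// Pass 1 computes the per-instance statistics (the job of CLComputeMeanVariance);
// pass 2 normalizes and applies the affine parameters (the job of
// CLInstanceNormalizationLayerKernel). Epsilon guards the reciprocal square
// root when the plane is constant.
void instance_norm_plane(std::vector<float> &x, float gamma, float beta, float epsilon)
{
    float sum = 0.0f, sum_sq = 0.0f;
    for (float v : x)
    {
        sum += v;
        sum_sq += v * v;
    }
    const float mean = sum / x.size();
    const float var  = sum_sq / x.size() - mean * mean;

    const float inv_stddev = 1.0f / std::sqrt(var + epsilon);
    for (float &v : x)
    {
        v = gamma * (v - mean) * inv_stddev + beta;
    }
}

int main()
{
    std::vector<float> plane{1.0f, 2.0f, 3.0f, 4.0f};
    instance_norm_plane(plane, 1.0f, 0.0f, 1e-12f);
    for (float v : plane)
    {
        std::cout << v << " "; // roughly -1.34 -0.45 0.45 1.34
    }
    std::cout << std::endl;
    return 0;
}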
\ No newline at end of file + +void CLInstanceNormalizationLayer::run() +{ + ARM_COMPUTE_ERROR_ON_MSG(!_inst_norm_kernel, + "The child class didn't set the CL kernel or function isn't configured"); + schedule_kernel_on_ctx(_ctx, _mean_var_kernel.get()); + schedule_kernel_on_ctx(_ctx, _inst_norm_kernel.get()); +} + +} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLIntegralImage.cpp b/src/runtime/CL/functions/CLIntegralImage.cpp deleted file mode 100644 index b3be2f8c2c..0000000000 --- a/src/runtime/CL/functions/CLIntegralImage.cpp +++ /dev/null @@ -1,51 +0,0 @@ -/* - * Copyright (c) 2016-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/CL/functions/CLIntegralImage.h" - -#include "arm_compute/core/CL/kernels/CLIntegralImageKernel.h" -#include "arm_compute/runtime/CL/CLScheduler.h" - -using namespace arm_compute; - -CLIntegralImage::CLIntegralImage() - : _integral_hor(), _integral_vert() -{ -} - -void CLIntegralImage::configure(const ICLTensor *input, ICLTensor *output) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output); -} - -void CLIntegralImage::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output) -{ - _integral_hor.configure(compile_context, input, output); - _integral_vert.configure(compile_context, output); -} - -void CLIntegralImage::run() -{ - CLScheduler::get().enqueue(_integral_hor, false); - CLScheduler::get().enqueue(_integral_vert); -} diff --git a/src/runtime/CL/functions/CLL2NormalizeLayer.cpp b/src/runtime/CL/functions/CLL2NormalizeLayer.cpp index 14c83cd543..4fe1d9b20b 100644 --- a/src/runtime/CL/functions/CLL2NormalizeLayer.cpp +++ b/src/runtime/CL/functions/CLL2NormalizeLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 ARM Limited. + * Copyright (c) 2017-2021 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -24,13 +24,17 @@ #include "arm_compute/runtime/CL/functions/CLL2NormalizeLayer.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLL2NormalizeLayerKernel.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/PixelValue.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLL2NormalizeLayerKernel.h" +#include "src/core/CL/kernels/CLReductionOperationKernel.h" + namespace arm_compute { namespace @@ -39,17 +43,25 @@ constexpr int max_input_tensor_dim = 3; } // namespace CLL2NormalizeLayer::CLL2NormalizeLayer(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _reduce_func(), _normalize_kernel(), _sumsq() + : _memory_group(std::move(memory_manager)), + _reduce_func(), + _normalize_kernel(std::make_unique<CLL2NormalizeLayerKernel>()), + _sumsq() { } +CLL2NormalizeLayer::~CLL2NormalizeLayer() = default; + void CLL2NormalizeLayer::configure(ICLTensor *input, ICLTensor *output, int axis, float epsilon) { configure(CLKernelLibrary::get().get_compile_context(), input, output, axis, epsilon); } -void CLL2NormalizeLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, int axis, float epsilon) +void CLL2NormalizeLayer::configure( + const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, int axis, float epsilon) { + ARM_COMPUTE_LOG_PARAMS(input, output, axis, epsilon); + // Reset auxiliary tensor _sumsq.allocator()->init(TensorInfo()); @@ -59,7 +71,7 @@ void CLL2NormalizeLayer::configure(const CLCompileContext &compile_context, ICLT // Configure kernels const uint32_t actual_axis = wrap_around(axis, max_input_tensor_dim); _reduce_func.configure(compile_context, input, &_sumsq, actual_axis, ReductionOperation::SUM_SQUARE); - _normalize_kernel.configure(compile_context, input, &_sumsq, output, axis, epsilon); + _normalize_kernel->configure(compile_context, input, &_sumsq, output, axis, epsilon); // Allocate intermediate tensor _sumsq.allocator()->allocate(); @@ -75,7 +87,8 @@ Status CLL2NormalizeLayer::validate(const ITensorInfo *input, const ITensorInfo sum_sq.set_tensor_shape(shape); const uint32_t actual_axis = wrap_around(axis, max_input_tensor_dim); - ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperation::validate(input, &sum_sq, actual_axis, ReductionOperation::SUM_SQUARE)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLReductionOperation::validate(input, &sum_sq, actual_axis, ReductionOperation::SUM_SQUARE)); // Reduce shape on axis shape.set(actual_axis, 1); @@ -91,6 +104,6 @@ void CLL2NormalizeLayer::run() MemoryGroupResourceScope scope_mg(_memory_group); _reduce_func.run(); - CLScheduler::get().enqueue(_normalize_kernel, true); + CLScheduler::get().enqueue(*_normalize_kernel, true); } } // namespace arm_compute diff --git a/src/runtime/CL/functions/CLLSTMLayer.cpp b/src/runtime/CL/functions/CLLSTMLayer.cpp index 56f22e2fe0..3b50234c77 100644 --- a/src/runtime/CL/functions/CLLSTMLayer.cpp +++ b/src/runtime/CL/functions/CLLSTMLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 ARM Limited. + * Copyright (c) 2018-2021 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -24,61 +24,172 @@ #include "arm_compute/runtime/CL/functions/CLLSTMLayer.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/InfoHelpers.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/gpu/cl/kernels/ClTransposeKernel.h" + namespace arm_compute { using namespace arm_compute::misc::shape_calculator; using namespace arm_compute::utils::info_helpers; CLLSTMLayer::CLLSTMLayer(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _fully_connected_input_gate(), _accum_input_gate1(), _subtract_input_gate(), _pixelwise_mul_input_gate(), _activation_input_gate(), - _fully_connected_forget_gate(), _accum_forget_gate1(), _pixelwise_mul_forget_gate(), _activation_forget_gate(), _fully_connected_cell_state(), _gemm_cell_state1(), _transpose_cell_state(), - _accum_cell_state1(), _accum_cell_state2(), _pixelwise_mul_cell_state1(), _activation_cell_state(), _cell_clip(), _pixelwise_mul_cell_state2(), _fully_connected_output(), - _pixelwise_mul_output_state1(), _accum_output1(), _activation_output(), _activation_output_state(), _pixelwise_mul_output_state2(), _fully_connected_output_state(), _projection_clip(), - _copy_cell_state(), _copy_output(), _concat_scratch_buffer(), _concat_inputs_forget_gate(), _concat_weights_forget_gate(), _concat_weights_input_gate(), _concat_weights_output(), - _ones_memset_kernel(), _mean_std_norm_input_gate(), _pixelwise_mul_input_gate_coeff(), _accum_input_gate_bias(), _mean_std_norm_forget_gate(), _pixelwise_mul_forget_gate_coeff(), - _accum_forget_gate_bias(), _mean_std_norm_cell_gate(), _pixelwise_mul_cell_gate_coeff(), _accum_cell_gate_bias(), _mean_std_norm_output_gate(), _pixelwise_mul_output_gate_coeff(), - _accum_output_gate_bias(), _input_gate_out1(), _input_gate_out2(), _input_gate_out3(), _input_gate_out4(), _forget_gate_out1(), _forget_gate_out2(), _forget_gate_out3(), _forget_gate_out4(), - _forget_gate_out5(), _forget_gate_out6(), _cell_state_out1(), _cell_state_out2(), _cell_state_out3(), _cell_state_out4(), _cell_state_out5(), _output1(), _output2(), _output3(), _output4(), - _cell_state_activation(), _output_state1(), _ones(), _input_layer_norm_out1(), _input_layer_norm_out2(), _forget_layer_norm_out1(), _forget_layer_norm_out2(), _cell_layer_norm_out1(), - _cell_layer_norm_out2(), _output_layer_norm_out1(), _output_layer_norm_out2(), _run_peephole_opt(false), _run_cifg_opt(false), _perform_cell_clipping(false), _has_projection_weights(false), - _perform_projection_clipping(false), _is_prepared(false), _is_layer_norm_lstm(false) + : _memory_group(std::move(memory_manager)), + _fully_connected_input_gate(), + _accum_input_gate1(), + _subtract_input_gate(), + _pixelwise_mul_input_gate(), + _activation_input_gate(), + _fully_connected_forget_gate(), + _accum_forget_gate1(), + _pixelwise_mul_forget_gate(), + _activation_forget_gate(), + _fully_connected_cell_state(), + _gemm_cell_state1(), + _transpose_cell_state(std::make_unique<opencl::kernels::ClTransposeKernel>()), + _accum_cell_state1(), + _accum_cell_state2(), + _pixelwise_mul_cell_state1(), + _activation_cell_state(), + _cell_clip(), + _pixelwise_mul_cell_state2(), + _fully_connected_output(), + 
_pixelwise_mul_output_state1(), + _accum_output1(), + _activation_output(), + _activation_output_state(), + _pixelwise_mul_output_state2(), + _fully_connected_output_state(), + _projection_clip(), + _copy_cell_state(), + _copy_output(), + _concat_scratch_buffer(), + _concat_inputs_forget_gate(), + _concat_weights_forget_gate(), + _concat_weights_input_gate(), + _concat_weights_output(), + _ones_fill(), + _mean_std_norm_input_gate(), + _pixelwise_mul_input_gate_coeff(), + _accum_input_gate_bias(), + _mean_std_norm_forget_gate(), + _pixelwise_mul_forget_gate_coeff(), + _accum_forget_gate_bias(), + _mean_std_norm_cell_gate(), + _pixelwise_mul_cell_gate_coeff(), + _accum_cell_gate_bias(), + _mean_std_norm_output_gate(), + _pixelwise_mul_output_gate_coeff(), + _accum_output_gate_bias(), + _input_gate_out1(), + _input_gate_out2(), + _input_gate_out3(), + _input_gate_out4(), + _forget_gate_out1(), + _forget_gate_out2(), + _forget_gate_out3(), + _forget_gate_out4(), + _forget_gate_out5(), + _forget_gate_out6(), + _cell_state_out1(), + _cell_state_out2(), + _cell_state_out3(), + _cell_state_out4(), + _cell_state_out5(), + _output1(), + _output2(), + _output3(), + _output4(), + _cell_state_activation(), + _output_state1(), + _ones(), + _input_layer_norm_out1(), + _input_layer_norm_out2(), + _forget_layer_norm_out1(), + _forget_layer_norm_out2(), + _cell_layer_norm_out1(), + _cell_layer_norm_out2(), + _output_layer_norm_out1(), + _output_layer_norm_out2(), + _run_peephole_opt(false), + _run_cifg_opt(false), + _perform_cell_clipping(false), + _has_projection_weights(false), + _perform_projection_clipping(false), + _is_prepared(false), + _is_layer_norm_lstm(false) { } -void CLLSTMLayer::configure(const ICLTensor *input, - const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights, - const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights, - const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias, - const ICLTensor *output_state_in, const ICLTensor *cell_state_in, - ICLTensor *scratch_buffer, ICLTensor *output_state_out, ICLTensor *cell_state_out, ICLTensor *output, - const LSTMParams<ICLTensor> &lstm_params, const ActivationLayerInfo &activation_info, float cell_threshold, float projection_threshold) +CLLSTMLayer::~CLLSTMLayer() = default; + +void CLLSTMLayer::configure(const ICLTensor *input, + const ICLTensor *input_to_forget_weights, + const ICLTensor *input_to_cell_weights, + const ICLTensor *input_to_output_weights, + const ICLTensor *recurrent_to_forget_weights, + const ICLTensor *recurrent_to_cell_weights, + const ICLTensor *recurrent_to_output_weights, + const ICLTensor *forget_gate_bias, + const ICLTensor *cell_bias, + const ICLTensor *output_gate_bias, + const ICLTensor *output_state_in, + ICLTensor *cell_state_in, + ICLTensor *scratch_buffer, + ICLTensor *output_state_out, + ICLTensor *cell_state_out, + ICLTensor *output, + const LSTMParams<ICLTensor> &lstm_params, + const ActivationLayerInfo &activation_info, + float cell_threshold, + float projection_threshold) { - configure(CLKernelLibrary::get().get_compile_context(), input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, - recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, output_state_in, cell_state_in, scratch_buffer, output_state_out, 
cell_state_out, output, lstm_params, activation_info, + configure(CLKernelLibrary::get().get_compile_context(), input, input_to_forget_weights, input_to_cell_weights, + input_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, + recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, output_state_in, + cell_state_in, scratch_buffer, output_state_out, cell_state_out, output, lstm_params, activation_info, cell_threshold, projection_threshold); } -void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, - const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights, - const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights, - const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias, - const ICLTensor *output_state_in, const ICLTensor *cell_state_in, - ICLTensor *scratch_buffer, ICLTensor *output_state_out, ICLTensor *cell_state_out, ICLTensor *output, - const LSTMParams<ICLTensor> &lstm_params, const ActivationLayerInfo &activation_info, float cell_threshold, float projection_threshold) +void CLLSTMLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *input_to_forget_weights, + const ICLTensor *input_to_cell_weights, + const ICLTensor *input_to_output_weights, + const ICLTensor *recurrent_to_forget_weights, + const ICLTensor *recurrent_to_cell_weights, + const ICLTensor *recurrent_to_output_weights, + const ICLTensor *forget_gate_bias, + const ICLTensor *cell_bias, + const ICLTensor *output_gate_bias, + const ICLTensor *output_state_in, + ICLTensor *cell_state_in, + ICLTensor *scratch_buffer, + ICLTensor *output_state_out, + ICLTensor *cell_state_out, + ICLTensor *output, + const LSTMParams<ICLTensor> &lstm_params, + const ActivationLayerInfo &activation_info, + float cell_threshold, + float projection_threshold) { - ARM_COMPUTE_ERROR_ON_NULLPTR(input, - input_to_forget_weights, input_to_cell_weights, input_to_output_weights, + ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, - forget_gate_bias, cell_bias, output_gate_bias, - output_state_in, cell_state_in, + forget_gate_bias, cell_bias, output_gate_bias, output_state_in, cell_state_in, scratch_buffer, output_state_out, cell_state_out, output); + ARM_COMPUTE_LOG_PARAMS(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, + recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, + forget_gate_bias, cell_bias, output_gate_bias, output_state_in, cell_state_in, + scratch_buffer, output_state_out, cell_state_out, output, lstm_params, activation_info, + cell_threshold, projection_threshold); + _is_layer_norm_lstm = lstm_params.use_layer_norm(); // Set lstm parameters @@ -86,13 +197,12 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe build_lstm_params_tensor_info(lstm_params, &lstm_params_info); // Validate - ARM_COMPUTE_ERROR_THROW_ON(CLLSTMLayer::validate(input->info(), input_to_forget_weights->info(), - input_to_cell_weights->info(), input_to_output_weights->info(), - recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(), - forget_gate_bias->info(), 
cell_bias->info(), output_gate_bias->info(), - output_state_in->info(), cell_state_in->info(), - scratch_buffer->info(), output_state_out->info(), cell_state_out->info(), output->info(), - lstm_params_info, activation_info, cell_threshold, projection_threshold)); + ARM_COMPUTE_ERROR_THROW_ON(CLLSTMLayer::validate( + input->info(), input_to_forget_weights->info(), input_to_cell_weights->info(), input_to_output_weights->info(), + recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(), + forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), output_state_in->info(), + cell_state_in->info(), scratch_buffer->info(), output_state_out->info(), cell_state_out->info(), output->info(), + lstm_params_info, activation_info, cell_threshold, projection_threshold)); const TensorShape cell_state_shape = cell_state_in->info()->tensor_shape(); // Configure block that calculates the forget gate @@ -110,32 +220,37 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe _forget_gate_out2.allocator()->init(TensorInfo(concat_shape, 1, input->info()->data_type())); _memory_group.manage(&_forget_gate_out2); - _concat_inputs_forget_gate.configure(compile_context, input, output_state_in, &_forget_gate_out2); + _concat_inputs_forget_gate.configure(compile_context, inputs_vector, &_forget_gate_out2, Window::DimX); std::vector<const ICLTensor *> weights_vector; weights_vector.emplace_back(input_to_forget_weights); weights_vector.emplace_back(recurrent_to_forget_weights); - const TensorShape weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(weights_vector, 0); + const TensorShape weights_concat_shape = + arm_compute::misc::shape_calculator::calculate_concatenate_shape(weights_vector, 0); _forget_gate_out6.allocator()->init(TensorInfo(weights_concat_shape, 1, input->info()->data_type())); - _concat_weights_forget_gate.configure(compile_context, input_to_forget_weights, recurrent_to_forget_weights, &_forget_gate_out6); + _concat_weights_forget_gate.configure(compile_context, weights_vector, &_forget_gate_out6, Window::DimX); _memory_group.manage(&_forget_gate_out5); - _fully_connected_forget_gate.configure(compile_context, &_forget_gate_out2, &_forget_gate_out6, (_is_layer_norm_lstm) ? nullptr : forget_gate_bias, &_forget_gate_out5); + _fully_connected_forget_gate.configure(compile_context, &_forget_gate_out2, &_forget_gate_out6, + (_is_layer_norm_lstm) ? 
nullptr : forget_gate_bias, &_forget_gate_out5); _memory_group.manage(&_forget_gate_out1); _memory_group.manage(&_forget_gate_out3); _forget_gate_out6.allocator()->allocate(); CLTensor *forget_gate_out = &_forget_gate_out5; - if(lstm_params.has_peephole_opt()) + if (lstm_params.has_peephole_opt()) { _forget_gate_out4.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _run_peephole_opt = true; _memory_group.manage(&_forget_gate_out4); - _pixelwise_mul_forget_gate.configure(compile_context, cell_state_in, lstm_params.cell_to_forget_weights(), &_forget_gate_out4, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); - _accum_forget_gate1.configure(compile_context, &_forget_gate_out5, &_forget_gate_out4, &_forget_gate_out3, ConvertPolicy::SATURATE); + _pixelwise_mul_forget_gate.configure(compile_context, cell_state_in, lstm_params.cell_to_forget_weights(), + &_forget_gate_out4, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_NEAREST_EVEN); + _accum_forget_gate1.configure(compile_context, &_forget_gate_out5, &_forget_gate_out4, &_forget_gate_out3, + ConvertPolicy::SATURATE); _forget_gate_out4.allocator()->allocate(); _forget_gate_out5.allocator()->allocate(); forget_gate_out = &_forget_gate_out3; @@ -144,22 +259,25 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe { _forget_gate_out3.allocator()->allocate(); } - if(_is_layer_norm_lstm) + if (_is_layer_norm_lstm) { _forget_layer_norm_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _forget_layer_norm_out2.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _memory_group.manage(&_forget_layer_norm_out1); _memory_group.manage(&_forget_layer_norm_out2); _mean_std_norm_forget_gate.configure(compile_context, forget_gate_out); - _pixelwise_mul_forget_gate_coeff.configure(compile_context, forget_gate_out, lstm_params.forget_layer_norm_weights(), &_forget_layer_norm_out1, 1, ConvertPolicy::SATURATE, - RoundingPolicy::TO_NEAREST_EVEN); + _pixelwise_mul_forget_gate_coeff.configure(compile_context, forget_gate_out, + lstm_params.forget_layer_norm_weights(), &_forget_layer_norm_out1, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); // forget_gate_out is going to be reassigned, so allocate the tensor that it was assigned to before forget_gate_out->allocator()->allocate(); - _accum_forget_gate_bias.configure(compile_context, ArithmeticOperation::ADD, &_forget_layer_norm_out1, forget_gate_bias, &_forget_layer_norm_out2, ConvertPolicy::SATURATE); + _accum_forget_gate_bias.configure(compile_context, &_forget_layer_norm_out1, forget_gate_bias, + &_forget_layer_norm_out2, ConvertPolicy::SATURATE); _forget_layer_norm_out1.allocator()->allocate(); forget_gate_out = &_forget_layer_norm_out2; } - _activation_forget_gate.configure(compile_context, forget_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _activation_forget_gate.configure(compile_context, forget_gate_out, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); // Configure block that calculates the input gate // input_gate = Activation(input * input_to_input_weights + output_state * recurrent_to_input_weights + PixelWiseMul(cell_state, cell_to_input_weights) + input_gate_bias), without CIFG @@ -168,12 +286,13 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe // input_gate = Activation((input,output_state) * 
(input_to_input_weights,recurrent_to_input_weights) + PixelWiseMul(cell_state, cell_to_input_weights) + input_gate_bias), without CIFG _input_gate_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); CLTensor *input_gate_out = &_input_gate_out1; - if(lstm_params.has_cifg_opt()) + if (lstm_params.has_cifg_opt()) { _memory_group.manage(&_input_gate_out1); _ones.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); - _ones_memset_kernel.configure(compile_context, &_ones, PixelValue(1, _ones.info()->data_type())); - _subtract_input_gate.configure(compile_context, ArithmeticOperation::SUB, &_ones, forget_gate_out, &_input_gate_out1, ConvertPolicy::SATURATE); + _ones_fill.configure(compile_context, &_ones, PixelValue(1, _ones.info()->data_type())); + _subtract_input_gate.configure(compile_context, &_ones, forget_gate_out, &_input_gate_out1, + ConvertPolicy::SATURATE); _ones.allocator()->allocate(); _run_cifg_opt = true; } @@ -185,23 +304,29 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe std::vector<const ICLTensor *> lstm_weights; lstm_weights.emplace_back(lstm_params.input_to_input_weights()); lstm_weights.emplace_back(lstm_params.recurrent_to_input_weights()); - TensorShape lstm_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(lstm_weights, 0); + TensorShape lstm_weights_concat_shape = + arm_compute::misc::shape_calculator::calculate_concatenate_shape(lstm_weights, 0); _input_gate_out2.allocator()->init(TensorInfo(lstm_weights_concat_shape, 1, input->info()->data_type())); - _concat_weights_input_gate.configure(compile_context, lstm_params.input_to_input_weights(), lstm_params.recurrent_to_input_weights(), &_input_gate_out2); + _concat_weights_input_gate.configure(compile_context, lstm_weights, &_input_gate_out2, Window::DimX); _memory_group.manage(&_input_gate_out1); _memory_group.manage(&_input_gate_out3); - _fully_connected_input_gate.configure(compile_context, &_forget_gate_out2, &_input_gate_out2, (_is_layer_norm_lstm) ? nullptr : lstm_params.input_gate_bias(), &_input_gate_out3); + _fully_connected_input_gate.configure(compile_context, &_forget_gate_out2, &_input_gate_out2, + (_is_layer_norm_lstm) ? 
nullptr : lstm_params.input_gate_bias(), + &_input_gate_out3); _input_gate_out2.allocator()->allocate(); input_gate_out = &_input_gate_out3; - if(_run_peephole_opt) + if (_run_peephole_opt) { _memory_group.manage(&_input_gate_out4); - _pixelwise_mul_input_gate.configure(compile_context, cell_state_in, lstm_params.cell_to_input_weights(), &_input_gate_out4, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); - _accum_input_gate1.configure(compile_context, &_input_gate_out3, &_input_gate_out4, &_input_gate_out1, ConvertPolicy::SATURATE); + _pixelwise_mul_input_gate.configure(compile_context, cell_state_in, lstm_params.cell_to_input_weights(), + &_input_gate_out4, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_NEAREST_EVEN); + _accum_input_gate1.configure(compile_context, &_input_gate_out3, &_input_gate_out4, &_input_gate_out1, + ConvertPolicy::SATURATE); _input_gate_out3.allocator()->allocate(); _input_gate_out4.allocator()->allocate(); input_gate_out = &_input_gate_out1; @@ -211,22 +336,25 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe _input_gate_out1.allocator()->allocate(); } - if(_is_layer_norm_lstm) + if (_is_layer_norm_lstm) { _input_layer_norm_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _input_layer_norm_out2.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _memory_group.manage(&_input_layer_norm_out1); _memory_group.manage(&_input_layer_norm_out2); _mean_std_norm_input_gate.configure(compile_context, input_gate_out); - _pixelwise_mul_input_gate_coeff.configure(compile_context, input_gate_out, lstm_params.input_layer_norm_weights(), &_input_layer_norm_out1, 1, ConvertPolicy::SATURATE, - RoundingPolicy::TO_NEAREST_EVEN); + _pixelwise_mul_input_gate_coeff.configure(compile_context, input_gate_out, + lstm_params.input_layer_norm_weights(), &_input_layer_norm_out1, + 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); // input_gate_out is going to be reassigned, so allocate the tensor that it was assigned to before input_gate_out->allocator()->allocate(); - _accum_input_gate_bias.configure(compile_context, ArithmeticOperation::ADD, &_input_layer_norm_out1, lstm_params.input_gate_bias(), &_input_layer_norm_out2, ConvertPolicy::SATURATE); + _accum_input_gate_bias.configure(compile_context, &_input_layer_norm_out1, lstm_params.input_gate_bias(), + &_input_layer_norm_out2, ConvertPolicy::SATURATE); _input_layer_norm_out1.allocator()->allocate(); input_gate_out = &_input_layer_norm_out2; } - _activation_input_gate.configure(compile_context, input_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _activation_input_gate.configure(compile_context, input_gate_out, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); } // Configure block that calculates the cell state @@ -239,43 +367,54 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe _cell_state_out5.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _memory_group.manage(&_cell_state_out1); - _fully_connected_cell_state.configure(compile_context, input, input_to_cell_weights, (_is_layer_norm_lstm) ? nullptr : cell_bias, &_cell_state_out1); + _fully_connected_cell_state.configure(compile_context, input, input_to_cell_weights, + (_is_layer_norm_lstm) ? 
nullptr : cell_bias, &_cell_state_out1); _memory_group.manage(&_cell_state_out2); - _transpose_cell_state.configure(compile_context, recurrent_to_cell_weights, &_cell_state_out2); + _transpose_cell_state->configure(compile_context, recurrent_to_cell_weights->info(), _cell_state_out2.info()); + _recurrent_to_cell_weights = recurrent_to_cell_weights; _memory_group.manage(&_cell_state_out3); - _gemm_cell_state1.configure(compile_context, output_state_in, &_cell_state_out2, nullptr, &_cell_state_out3, 1.f, 0.f); + _gemm_cell_state1.configure(compile_context, output_state_in, &_cell_state_out2, nullptr, &_cell_state_out3, 1.f, + 0.f); _cell_state_out2.allocator()->allocate(); _memory_group.manage(&_cell_state_out4); - _accum_cell_state1.configure(compile_context, ArithmeticOperation::ADD, &_cell_state_out1, &_cell_state_out3, &_cell_state_out4, ConvertPolicy::SATURATE); + _accum_cell_state1.configure(compile_context, &_cell_state_out1, &_cell_state_out3, &_cell_state_out4, + ConvertPolicy::SATURATE); CLTensor *cell_state_out_ptr = &_cell_state_out4; - if(_is_layer_norm_lstm) + if (_is_layer_norm_lstm) { _cell_layer_norm_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _cell_layer_norm_out2.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _memory_group.manage(&_cell_layer_norm_out1); _memory_group.manage(&_cell_layer_norm_out2); _mean_std_norm_cell_gate.configure(compile_context, cell_state_out_ptr); - _pixelwise_mul_cell_gate_coeff.configure(compile_context, cell_state_out_ptr, lstm_params.cell_layer_norm_weights(), &_cell_layer_norm_out1, 1, ConvertPolicy::SATURATE, - RoundingPolicy::TO_NEAREST_EVEN); + _pixelwise_mul_cell_gate_coeff.configure(compile_context, cell_state_out_ptr, + lstm_params.cell_layer_norm_weights(), &_cell_layer_norm_out1, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); // cell_state_out_ptr is going to be reassigned, so allocate the tensor that it was assigned to before cell_state_out_ptr->allocator()->allocate(); - _accum_cell_gate_bias.configure(compile_context, ArithmeticOperation::ADD, &_cell_layer_norm_out1, cell_bias, &_cell_layer_norm_out2, ConvertPolicy::SATURATE); + _accum_cell_gate_bias.configure(compile_context, &_cell_layer_norm_out1, cell_bias, &_cell_layer_norm_out2, + ConvertPolicy::SATURATE); _cell_layer_norm_out1.allocator()->allocate(); cell_state_out_ptr = &_cell_layer_norm_out2; } _activation_cell_state.configure(compile_context, cell_state_out_ptr, nullptr, activation_info); _memory_group.manage(&_cell_state_out5); - _pixelwise_mul_cell_state1.configure(compile_context, cell_state_out_ptr, input_gate_out, &_cell_state_out5, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); + _pixelwise_mul_cell_state1.configure(compile_context, cell_state_out_ptr, input_gate_out, &_cell_state_out5, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); cell_state_out_ptr->allocator()->allocate(); - _pixelwise_mul_cell_state2.configure(compile_context, forget_gate_out, cell_state_in, &_cell_state_out3, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); - _accum_cell_state2.configure(compile_context, ArithmeticOperation::ADD, &_cell_state_out5, &_cell_state_out3, &_cell_state_out1, ConvertPolicy::SATURATE); + _pixelwise_mul_cell_state2.configure(compile_context, forget_gate_out, cell_state_in, &_cell_state_out3, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); + _accum_cell_state2.configure(compile_context, &_cell_state_out5, 
&_cell_state_out3, &_cell_state_out1, + ConvertPolicy::SATURATE); _cell_state_out3.allocator()->allocate(); _cell_state_out5.allocator()->allocate(); // Perform clipping - if(cell_threshold != 0.f) + if (cell_threshold != 0.f) { _perform_cell_clipping = true; - _cell_clip.configure(compile_context, &_cell_state_out1, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -cell_threshold, cell_threshold)); + _cell_clip.configure(compile_context, &_cell_state_out1, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, + cell_threshold, -cell_threshold)); } // Configure block that calculates the output @@ -287,26 +426,29 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe std::vector<const ICLTensor *> in_out_weights; in_out_weights.emplace_back(input_to_output_weights); in_out_weights.emplace_back(recurrent_to_output_weights); - TensorShape in_out_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(in_out_weights, 0); + TensorShape in_out_weights_concat_shape = + arm_compute::misc::shape_calculator::calculate_concatenate_shape(in_out_weights, 0); _output2.allocator()->init(TensorInfo(in_out_weights_concat_shape, 1, input->info()->data_type())); - _concat_weights_output.configure(compile_context, input_to_output_weights, recurrent_to_output_weights, &_output2); + _concat_weights_output.configure(compile_context, in_out_weights, &_output2, Window::DimX); _memory_group.manage(&_output1); _memory_group.manage(&_output4); - _fully_connected_output.configure(compile_context, &_forget_gate_out2, &_output2, (_is_layer_norm_lstm) ? nullptr : output_gate_bias, &_output4); + _fully_connected_output.configure(compile_context, &_forget_gate_out2, &_output2, + (_is_layer_norm_lstm) ? 
nullptr : output_gate_bias, &_output4); _output2.allocator()->allocate(); _forget_gate_out2.allocator()->allocate(); CLTensor *output_gate_out = &_output4; - if(lstm_params.has_peephole_opt()) + if (lstm_params.has_peephole_opt()) { _output3.allocator()->init(TensorInfo(_cell_state_out1.info()->tensor_shape(), 1, input->info()->data_type())); _memory_group.manage(&_output3); - _pixelwise_mul_output_state1.configure(compile_context, &_cell_state_out1, lstm_params.cell_to_output_weights(), &_output3, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); + _pixelwise_mul_output_state1.configure(compile_context, &_cell_state_out1, lstm_params.cell_to_output_weights(), + &_output3, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); _accum_output1.configure(compile_context, &_output4, &_output3, &_output1, ConvertPolicy::SATURATE); _output4.allocator()->allocate(); output_gate_out = &_output1; @@ -318,22 +460,25 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe { _output1.allocator()->allocate(); } - if(_is_layer_norm_lstm) + if (_is_layer_norm_lstm) { _output_layer_norm_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _output_layer_norm_out2.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _memory_group.manage(&_output_layer_norm_out1); _memory_group.manage(&_output_layer_norm_out2); _mean_std_norm_output_gate.configure(compile_context, output_gate_out); - _pixelwise_mul_output_gate_coeff.configure(compile_context, output_gate_out, lstm_params.output_layer_norm_weights(), &_output_layer_norm_out1, 1, ConvertPolicy::SATURATE, - RoundingPolicy::TO_NEAREST_EVEN); + _pixelwise_mul_output_gate_coeff.configure(compile_context, output_gate_out, + lstm_params.output_layer_norm_weights(), &_output_layer_norm_out1, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); // output_gate_out is going to be reassigned, so allocate the tensor that it was assigned to before output_gate_out->allocator()->allocate(); - _accum_output_gate_bias.configure(compile_context, ArithmeticOperation::ADD, &_output_layer_norm_out1, output_gate_bias, &_output_layer_norm_out2, ConvertPolicy::SATURATE); + _accum_output_gate_bias.configure(compile_context, &_output_layer_norm_out1, output_gate_bias, + &_output_layer_norm_out2, ConvertPolicy::SATURATE); _output_layer_norm_out1.allocator()->allocate(); output_gate_out = &_output_layer_norm_out2; } - _activation_output.configure(compile_context, output_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _activation_output.configure(compile_context, output_gate_out, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); // Configure block that calculates the output state /** lstm_res = PixelwiseMul(output, Activation(cell_state)) @@ -350,19 +495,24 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe _memory_group.manage(&_cell_state_activation); _activation_output_state.configure(compile_context, &_cell_state_out1, &_cell_state_activation, activation_info); - _pixelwise_mul_output_state2.configure(compile_context, &_cell_state_activation, output_gate_out, output_state_out_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); + _pixelwise_mul_output_state2.configure(compile_context, &_cell_state_activation, output_gate_out, + output_state_out_tmp, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_NEAREST_EVEN); 
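
For orientation, the gate blocks wired up in configure() above implement the standard LSTM cell that the formula comments in these hunks describe. A compact summary in conventional notation (sigma is the logistic function, g the user-supplied activation, and the circled dot an element-wise product; the symbols below are generic, not identifiers from this file — the peephole terms p only apply when has_peephole_opt() is set, and layer normalization is applied before each bias addition when use_layer_norm() is set):

    \begin{aligned}
    f_t &= \sigma(W_f\,[x_t, h_{t-1}] + p_f \odot c_{t-1} + b_f) \\
    i_t &= \begin{cases} 1 - f_t & \text{with CIFG} \\ \sigma(W_i\,[x_t, h_{t-1}] + p_i \odot c_{t-1} + b_i) & \text{otherwise} \end{cases} \\
    c_t &= \operatorname{clip}\bigl(f_t \odot c_{t-1} + i_t \odot g(W_c\,[x_t, h_{t-1}] + b_c),\ \pm\,\text{cell\_threshold}\bigr) \\
    o_t &= \sigma(W_o\,[x_t, h_{t-1}] + p_o \odot c_t + b_o) \\
    h_t &= o_t \odot g(c_t)
    \end{aligned}

The projection and projection clip configured just below apply to h_t when has_projection() is set.
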
     _cell_state_activation.allocator()->allocate();
 
-    if(lstm_params.has_projection())
+    if (lstm_params.has_projection())
     {
         _has_projection_weights = true;
-        _fully_connected_output_state.configure(compile_context, output_state_out_tmp, lstm_params.projection_weights(), lstm_params.projection_bias(), output_state_out);
+        _fully_connected_output_state.configure(compile_context, output_state_out_tmp, lstm_params.projection_weights(),
+                                                lstm_params.projection_bias(), output_state_out);
         _output_state1.allocator()->allocate();
         // Perform clipping
-        if(projection_threshold != 0.f)
+        if (projection_threshold != 0.f)
         {
             _perform_projection_clipping = true;
-            _projection_clip.configure(compile_context, output_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -projection_threshold, projection_threshold));
+            _projection_clip.configure(compile_context, output_state_out, nullptr,
+                                       ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
+                                                           -projection_threshold, projection_threshold));
         }
     }
 
@@ -371,8 +521,8 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe
     _copy_output.configure(compile_context, output_state_out, output);
 
     // Vector for holding the tensors to store in scratch buffer
-    std::vector<ICLTensor *> scratch_inputs;
-    if(!lstm_params.has_cifg_opt())
+    std::vector<const ICLTensor *> scratch_inputs;
+    if (!lstm_params.has_cifg_opt())
     {
         scratch_inputs.emplace_back(input_gate_out);
     }
@@ -386,29 +536,38 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe
     output_gate_out->allocator()->allocate();
 }
 
-Status CLLSTMLayer::validate(const ITensorInfo *input,
-                             const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights,
-                             const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights,
-                             const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias,
-                             const ITensorInfo *output_state_in, const ITensorInfo *cell_state_in,
-                             const ITensorInfo *scratch_buffer, const ITensorInfo *output_state_out, const ITensorInfo *cell_state_out, const ITensorInfo *output,
-                             const LSTMParams<ITensorInfo> &lstm_params, const ActivationLayerInfo &activation_info, float cell_threshold, float projection_threshold)
+Status CLLSTMLayer::validate(const ITensorInfo *input,
+                             const ITensorInfo *input_to_forget_weights,
+                             const ITensorInfo *input_to_cell_weights,
+                             const ITensorInfo *input_to_output_weights,
+                             const ITensorInfo *recurrent_to_forget_weights,
+                             const ITensorInfo *recurrent_to_cell_weights,
+                             const ITensorInfo *recurrent_to_output_weights,
+                             const ITensorInfo *forget_gate_bias,
+                             const ITensorInfo *cell_bias,
+                             const ITensorInfo *output_gate_bias,
+                             const ITensorInfo *output_state_in,
+                             const ITensorInfo *cell_state_in,
+                             const ITensorInfo *scratch_buffer,
+                             const ITensorInfo *output_state_out,
+                             const ITensorInfo *cell_state_out,
+                             const ITensorInfo *output,
+                             const LSTMParams<ITensorInfo> &lstm_params,
+                             const ActivationLayerInfo     &activation_info,
+                             float                          cell_threshold,
+                             float                          projection_threshold)
 {
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input,
-                                        input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
-                                        recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights,
-                                        forget_gate_bias, cell_bias, output_gate_bias,
-                                        output_state_in, cell_state_in,
-                                        scratch_buffer, output_state_out, cell_state_out, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(
+        input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights,
+        recurrent_to_cell_weights, recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias,
+        output_state_in, cell_state_in, scratch_buffer, output_state_out, cell_state_out, output);
 
     // Check data types
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input,
-                                                       input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
-                                                       recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights,
-                                                       forget_gate_bias, cell_bias, output_gate_bias,
-                                                       output_state_in, cell_state_in,
-                                                       scratch_buffer, output_state_out, cell_state_out, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(
+        input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights,
+        recurrent_to_cell_weights, recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias,
+        output_state_in, cell_state_in, scratch_buffer, output_state_out, cell_state_out, output);
 
     // Check dimensions
     ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 2);
@@ -427,16 +586,16 @@ Status CLLSTMLayer::validate(const ITensorInfo *input,
     ARM_COMPUTE_RETURN_ERROR_ON(output_state_out->num_dimensions() > 2);
     ARM_COMPUTE_RETURN_ERROR_ON(cell_state_out->num_dimensions() > 2);
     ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 2);
-    ARM_COMPUTE_RETURN_ERROR_ON(cell_bias->dimension(0) * 4 != scratch_buffer->dimension(0)
-                                && cell_bias->dimension(0) * 3 != scratch_buffer->dimension(0));
+    ARM_COMPUTE_RETURN_ERROR_ON(cell_bias->dimension(0) * 4 != scratch_buffer->dimension(0) &&
+                                cell_bias->dimension(0) * 3 != scratch_buffer->dimension(0));
 
     const unsigned int num_batches = input->dimension(1);
     const unsigned int num_cells   = input_to_output_weights->dimension(1);
 
-    if(lstm_params.use_layer_norm())
+    if (lstm_params.use_layer_norm())
     {
         // If CIFG is used, input layer normalization weights tensor is omitted
-        if(lstm_params.has_cifg_opt())
+        if (lstm_params.has_cifg_opt())
         {
             ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_layer_norm_weights() != nullptr);
         }
@@ -448,8 +607,12 @@ Status CLLSTMLayer::validate(const ITensorInfo *input,
             ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, lstm_params.input_layer_norm_weights());
         }
 
-        ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.forget_layer_norm_weights(), lstm_params.cell_layer_norm_weights(), lstm_params.output_layer_norm_weights());
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, lstm_params.forget_layer_norm_weights(), lstm_params.cell_layer_norm_weights(), lstm_params.output_layer_norm_weights());
+        ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.forget_layer_norm_weights(),
+                                            lstm_params.cell_layer_norm_weights(),
+                                            lstm_params.output_layer_norm_weights());
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, lstm_params.forget_layer_norm_weights(),
+                                                           lstm_params.cell_layer_norm_weights(),
+                                                           lstm_params.output_layer_norm_weights());
         ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.forget_layer_norm_weights()->num_dimensions() > 1);
         ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_layer_norm_weights()->num_dimensions() > 1);
         ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.output_layer_norm_weights()->num_dimensions() > 1);
@@ -459,7 +622,7 @@ Status CLLSTMLayer::validate(const ITensorInfo *input,
     }
 
     // Check peephole optimization
-    if(lstm_params.has_peephole_opt())
+    if (lstm_params.has_peephole_opt())
     {
         ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_output_weights(), lstm_params.cell_to_forget_weights());
         ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_forget_weights()->num_dimensions() > 1);
@@ -477,36 +640,42 @@ Status CLLSTMLayer::validate(const ITensorInfo *input,
     TensorInfo cell_state_tmp = TensorInfo(TensorShape(num_cells, num_batches), 1, input->data_type());
 
     // Validate forget gate
-    ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, input_to_forget_weights, (lstm_params.use_layer_norm()) ? nullptr : forget_gate_bias, &forget_gate));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(
+        input, input_to_forget_weights, (lstm_params.use_layer_norm()) ? nullptr : forget_gate_bias, &forget_gate));
 
     std::vector<const ITensorInfo *> inputs_vector;
     inputs_vector.emplace_back(input);
     inputs_vector.emplace_back(output_state_in);
-    const TensorShape concat_shape       = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, 0);
+    const TensorShape concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, 0);
     TensorInfo forget_gate_concat = TensorInfo(concat_shape, 1, input->data_type());
 
-    ARM_COMPUTE_RETURN_ON_ERROR(CLWidthConcatenate2TensorsKernel::validate(input, output_state_in, &forget_gate_concat));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLConcatenateLayer::validate(inputs_vector, &forget_gate_concat, Window::DimX));
 
-    if(lstm_params.has_peephole_opt())
+    if (lstm_params.has_peephole_opt())
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplicationKernel::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &forget_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
-        ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE));
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            CLPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &forget_gate, 1,
+                                                ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            CLArithmeticAddition::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE));
     }
-    if(lstm_params.use_layer_norm())
+    if (lstm_params.use_layer_norm())
     {
         ARM_COMPUTE_RETURN_ON_ERROR(CLMeanStdDevNormalizationLayer::validate(&forget_gate));
-        ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplicationKernel::validate(&forget_gate, lstm_params.forget_layer_norm_weights(), &forget_gate, 1, ConvertPolicy::SATURATE,
-                                                                              RoundingPolicy::TO_NEAREST_EVEN));
-        ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&forget_gate, forget_gate_bias, &forget_gate, ConvertPolicy::SATURATE));
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            CLPixelWiseMultiplication::validate(&forget_gate, lstm_params.forget_layer_norm_weights(), &forget_gate, 1,
+                                                ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            CLArithmeticAddition::validate(&forget_gate, forget_gate_bias, &forget_gate, ConvertPolicy::SATURATE));
     }
-    ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayerKernel::validate(&forget_gate, &forget_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(
+        &forget_gate, &forget_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
 
     // Validate input gate
-    if(!lstm_params.has_cifg_opt())
+    if (!lstm_params.has_cifg_opt())
     {
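
A note on this branch: the input-gate tensors validated below exist only when the coupled input-forget gate (CIFG) optimization is disabled. With CIFG enabled the input gate carries no weights of its own and is derived from the forget gate as

    i_t = 1 - f_t

which is why configure() above only fills a tensor of ones (_ones_fill) and subtracts (_subtract_input_gate), and why the else path further down validates nothing more than a CLArithmeticSubtraction.
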
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.input_to_input_weights(), - lstm_params.recurrent_to_input_weights(), - lstm_params.input_gate_bias()); + lstm_params.recurrent_to_input_weights(), lstm_params.input_gate_bias()); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_to_input_weights()->num_dimensions() > 2); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.recurrent_to_input_weights()->num_dimensions() > 2); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_gate_bias()->num_dimensions() > 1); @@ -514,98 +683,131 @@ Status CLLSTMLayer::validate(const ITensorInfo *input, std::vector<const ITensorInfo *> lstm_weights; lstm_weights.emplace_back(lstm_params.input_to_input_weights()); lstm_weights.emplace_back(lstm_params.recurrent_to_input_weights()); - TensorShape lstm_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(lstm_weights, 0); - TensorInfo lstm_gate_concat = TensorInfo(lstm_weights_concat_shape, 1, input->data_type()); - ARM_COMPUTE_RETURN_ON_ERROR(CLWidthConcatenate2TensorsKernel::validate(lstm_params.input_to_input_weights(), lstm_params.recurrent_to_input_weights(), &lstm_gate_concat)); + TensorShape lstm_weights_concat_shape = + arm_compute::misc::shape_calculator::calculate_concatenate_shape(lstm_weights, 0); + TensorInfo lstm_gate_concat = TensorInfo(lstm_weights_concat_shape, 1, input->data_type()); + ARM_COMPUTE_RETURN_ON_ERROR(CLConcatenateLayer::validate(lstm_weights, &lstm_gate_concat, Window::DimX)); - ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, lstm_params.input_to_input_weights(), (lstm_params.use_layer_norm()) ? nullptr : lstm_params.input_gate_bias(), &input_gate)); + ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate( + input, lstm_params.input_to_input_weights(), + (lstm_params.use_layer_norm()) ? 
nullptr : lstm_params.input_gate_bias(), &input_gate)); - if(lstm_params.has_peephole_opt()) + if (lstm_params.has_peephole_opt()) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_input_weights()); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_input_weights()->num_dimensions() > 1); - ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplicationKernel::validate(cell_state_in, lstm_params.cell_to_input_weights(), &input_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN)); - ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&input_gate, &input_gate, &input_gate, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_input_weights(), &input_gate, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLArithmeticAddition::validate(&input_gate, &input_gate, &input_gate, ConvertPolicy::SATURATE)); } - if(lstm_params.use_layer_norm()) + if (lstm_params.use_layer_norm()) { ARM_COMPUTE_RETURN_ON_ERROR(CLMeanStdDevNormalizationLayer::validate(&input_gate)); - ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplicationKernel::validate(&input_gate, lstm_params.input_layer_norm_weights(), &input_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN)); - ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&input_gate, lstm_params.input_gate_bias(), &input_gate, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLPixelWiseMultiplication::validate(&input_gate, lstm_params.input_layer_norm_weights(), &input_gate, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN)); + ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&input_gate, lstm_params.input_gate_bias(), + &input_gate, ConvertPolicy::SATURATE)); } - ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayerKernel::validate(&input_gate, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); + ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate( + &input_gate, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); } else { - ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::SUB, &forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLArithmeticSubtraction::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE)); } // Validate cell state - ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, input_to_cell_weights, (lstm_params.use_layer_norm()) ? nullptr : cell_bias, &cell_state_tmp)); - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(output_state_in, &units_out_transposed_info, nullptr, &cell_state_tmp, 1.f, 0.f, GEMMInfo())); - ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&cell_state_tmp, &cell_state_tmp, &cell_state_tmp, ConvertPolicy::SATURATE)); - if(lstm_params.use_layer_norm()) + ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate( + input, input_to_cell_weights, (lstm_params.use_layer_norm()) ? 
nullptr : cell_bias, &cell_state_tmp)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLGEMM::validate(output_state_in, &units_out_transposed_info, nullptr, &cell_state_tmp, 1.f, 0.f, GEMMInfo())); + ARM_COMPUTE_RETURN_ON_ERROR( + CLArithmeticAddition::validate(&cell_state_tmp, &cell_state_tmp, &cell_state_tmp, ConvertPolicy::SATURATE)); + if (lstm_params.use_layer_norm()) { ARM_COMPUTE_RETURN_ON_ERROR(CLMeanStdDevNormalizationLayer::validate(&cell_state_tmp)); - ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplicationKernel::validate(&cell_state_tmp, lstm_params.cell_layer_norm_weights(), &cell_state_tmp, 1, ConvertPolicy::SATURATE, - RoundingPolicy::TO_NEAREST_EVEN)); - ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&cell_state_tmp, cell_bias, &cell_state_tmp, ConvertPolicy::SATURATE)); - } - ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayerKernel::validate(&cell_state_tmp, nullptr, activation_info)); - ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplicationKernel::validate(&cell_state_tmp, &input_gate, &cell_state_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN)); - ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplicationKernel::validate(&cell_state_tmp, &forget_gate, &cell_state_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN)); - ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&cell_state_tmp, &cell_state_tmp, &cell_state_tmp, ConvertPolicy::SATURATE)); - if(cell_threshold != 0.f) - { - ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayerKernel::validate(&cell_state_tmp, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -cell_threshold, - cell_threshold))); + ARM_COMPUTE_RETURN_ON_ERROR( + CLPixelWiseMultiplication::validate(&cell_state_tmp, lstm_params.cell_layer_norm_weights(), &cell_state_tmp, + 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLArithmeticAddition::validate(&cell_state_tmp, cell_bias, &cell_state_tmp, ConvertPolicy::SATURATE)); + } + ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&cell_state_tmp, nullptr, activation_info)); + ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate( + &cell_state_tmp, &input_gate, &cell_state_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN)); + ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate( + &cell_state_tmp, &forget_gate, &cell_state_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLArithmeticAddition::validate(&cell_state_tmp, &cell_state_tmp, &cell_state_tmp, ConvertPolicy::SATURATE)); + if (cell_threshold != 0.f) + { + ARM_COMPUTE_RETURN_ON_ERROR( + CLActivationLayer::validate(&cell_state_tmp, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, + cell_threshold, -cell_threshold))); } std::vector<const ITensorInfo *> in_out_weights; in_out_weights.emplace_back(input_to_output_weights); in_out_weights.emplace_back(recurrent_to_output_weights); - TensorShape in_out_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(in_out_weights, 0); - TensorInfo in_out_gate_concat = TensorInfo(in_out_weights_concat_shape, 1, input->data_type()); - ARM_COMPUTE_RETURN_ON_ERROR(CLWidthConcatenate2TensorsKernel::validate(input_to_output_weights, recurrent_to_output_weights, &in_out_gate_concat)); + TensorShape in_out_weights_concat_shape = + arm_compute::misc::shape_calculator::calculate_concatenate_shape(in_out_weights, 0); + TensorInfo in_out_gate_concat = 
TensorInfo(in_out_weights_concat_shape, 1, input->data_type()); + ARM_COMPUTE_RETURN_ON_ERROR(CLConcatenateLayer::validate(in_out_weights, &in_out_gate_concat, Window::DimX)); // Validate output gate tmp - ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, input_to_output_weights, (lstm_params.use_layer_norm()) ? nullptr : output_gate_bias, &output_gate_tmp)); + ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate( + input, input_to_output_weights, (lstm_params.use_layer_norm()) ? nullptr : output_gate_bias, &output_gate_tmp)); - if(lstm_params.has_peephole_opt()) + if (lstm_params.has_peephole_opt()) { - ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplicationKernel::validate(&cell_state_tmp, lstm_params.cell_to_output_weights(), &output_gate_tmp, 1, ConvertPolicy::SATURATE, - RoundingPolicy::TO_NEAREST_EVEN)); - ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&output_gate_tmp, &output_gate_tmp, &output_gate_tmp, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLPixelWiseMultiplication::validate(&cell_state_tmp, lstm_params.cell_to_output_weights(), &output_gate_tmp, + 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN)); + ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&output_gate_tmp, &output_gate_tmp, &output_gate_tmp, + ConvertPolicy::SATURATE)); } - if(lstm_params.use_layer_norm()) + if (lstm_params.use_layer_norm()) { ARM_COMPUTE_RETURN_ON_ERROR(CLMeanStdDevNormalizationLayer::validate(&output_gate_tmp)); - ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplicationKernel::validate(&output_gate_tmp, lstm_params.output_layer_norm_weights(), &output_gate_tmp, 1, ConvertPolicy::SATURATE, - RoundingPolicy::TO_NEAREST_EVEN)); - ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&output_gate_tmp, output_gate_bias, &output_gate_tmp, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate( + &output_gate_tmp, lstm_params.output_layer_norm_weights(), &output_gate_tmp, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_NEAREST_EVEN)); + ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&output_gate_tmp, output_gate_bias, &output_gate_tmp, + ConvertPolicy::SATURATE)); } - ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayerKernel::validate(&output_gate_tmp, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); + ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate( + &output_gate_tmp, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); // Validate output state - ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayerKernel::validate(&cell_state_tmp, &cell_state_tmp, activation_info)); - ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplicationKernel::validate(&cell_state_tmp, &output_gate_tmp, &output_gate_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN)); - if(lstm_params.has_projection()) - { - ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(&output_gate_tmp, lstm_params.projection_weights(), lstm_params.projection_bias(), output_state_out)); - if(projection_threshold != 0.f) + ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&cell_state_tmp, &cell_state_tmp, activation_info)); + ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&cell_state_tmp, &output_gate_tmp, &output_gate_tmp, + 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_NEAREST_EVEN)); + if (lstm_params.has_projection()) + { + ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(&output_gate_tmp, 
lstm_params.projection_weights(), + lstm_params.projection_bias(), output_state_out)); + if (projection_threshold != 0.f) { - ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayerKernel::validate(output_state_out, output_state_out, - ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -projection_threshold, projection_threshold))); + ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate( + output_state_out, output_state_out, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -projection_threshold, + projection_threshold))); } } // Validate copy kernel - ARM_COMPUTE_RETURN_ON_ERROR(CLCopyKernel::validate(&cell_state_tmp, cell_state_out)); - ARM_COMPUTE_RETURN_ON_ERROR(CLCopyKernel::validate(output_state_out, output)); + ARM_COMPUTE_RETURN_ON_ERROR(CLCopy::validate(&cell_state_tmp, cell_state_out)); + ARM_COMPUTE_RETURN_ON_ERROR(CLCopy::validate(output_state_out, output)); // Validate scratch concatenation - std::vector<ITensorInfo *> inputs_vector_info_raw; - if(!lstm_params.has_cifg_opt()) + std::vector<const ITensorInfo *> inputs_vector_info_raw; + if (!lstm_params.has_cifg_opt()) { inputs_vector_info_raw.push_back(&input_gate); } @@ -623,110 +825,113 @@ void CLLSTMLayer::run() MemoryGroupResourceScope scope_mg(_memory_group); - CLScheduler::get().enqueue(_concat_inputs_forget_gate); + _concat_inputs_forget_gate.run(); _fully_connected_forget_gate.run(); - if(_run_peephole_opt) + if (_run_peephole_opt) { - CLScheduler::get().enqueue(_pixelwise_mul_forget_gate); + _pixelwise_mul_forget_gate.run(); _accum_forget_gate1.run(); } - if(_is_layer_norm_lstm) + if (_is_layer_norm_lstm) { _mean_std_norm_forget_gate.run(); - CLScheduler::get().enqueue(_pixelwise_mul_forget_gate_coeff); - CLScheduler::get().enqueue(_accum_forget_gate_bias); + _pixelwise_mul_forget_gate_coeff.run(); + _accum_forget_gate_bias.run(); } - CLScheduler::get().enqueue(_activation_forget_gate); + _activation_forget_gate.run(); - if(_run_cifg_opt) + if (_run_cifg_opt) { - CLScheduler::get().enqueue(_ones_memset_kernel); - CLScheduler::get().enqueue(_subtract_input_gate); + _ones_fill.run(); + _subtract_input_gate.run(); } else { _fully_connected_input_gate.run(); - if(_run_peephole_opt) + if (_run_peephole_opt) { - CLScheduler::get().enqueue(_pixelwise_mul_input_gate); + _pixelwise_mul_input_gate.run(); _accum_input_gate1.run(); } - if(_is_layer_norm_lstm) + if (_is_layer_norm_lstm) { _mean_std_norm_input_gate.run(); - CLScheduler::get().enqueue(_pixelwise_mul_input_gate_coeff); - CLScheduler::get().enqueue(_accum_input_gate_bias); + _pixelwise_mul_input_gate_coeff.run(); + _accum_input_gate_bias.run(); } - CLScheduler::get().enqueue(_activation_input_gate); + _activation_input_gate.run(); } _fully_connected_cell_state.run(); - CLScheduler::get().enqueue(_transpose_cell_state); + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC, _recurrent_to_cell_weights); + pack.add_tensor(TensorType::ACL_DST, &_cell_state_out2); + CLScheduler::get().enqueue_op(*_transpose_cell_state, pack, false); _gemm_cell_state1.run(); - CLScheduler::get().enqueue(_accum_cell_state1); - if(_is_layer_norm_lstm) + _accum_cell_state1.run(); + if (_is_layer_norm_lstm) { _mean_std_norm_cell_gate.run(); - CLScheduler::get().enqueue(_pixelwise_mul_cell_gate_coeff); - CLScheduler::get().enqueue(_accum_cell_gate_bias); + _pixelwise_mul_cell_gate_coeff.run(); + _accum_cell_gate_bias.run(); } - CLScheduler::get().enqueue(_activation_cell_state); - CLScheduler::get().enqueue(_pixelwise_mul_cell_state1); - 
CLScheduler::get().enqueue(_pixelwise_mul_cell_state2); - CLScheduler::get().enqueue(_accum_cell_state2); + _activation_cell_state.run(); + _pixelwise_mul_cell_state1.run(); + _pixelwise_mul_cell_state2.run(); + _accum_cell_state2.run(); - if(_perform_cell_clipping) + if (_perform_cell_clipping) { - CLScheduler::get().enqueue(_cell_clip); + _cell_clip.run(); } _fully_connected_output.run(); - if(_run_peephole_opt) + if (_run_peephole_opt) { - CLScheduler::get().enqueue(_pixelwise_mul_output_state1); + _pixelwise_mul_output_state1.run(); _accum_output1.run(); } - if(_is_layer_norm_lstm) + if (_is_layer_norm_lstm) { _mean_std_norm_output_gate.run(); - CLScheduler::get().enqueue(_pixelwise_mul_output_gate_coeff); - CLScheduler::get().enqueue(_accum_output_gate_bias); + _pixelwise_mul_output_gate_coeff.run(); + _accum_output_gate_bias.run(); } - CLScheduler::get().enqueue(_activation_output); + _activation_output.run(); - CLScheduler::get().enqueue(_activation_output_state); - CLScheduler::get().enqueue(_pixelwise_mul_output_state2); + _activation_output_state.run(); + _pixelwise_mul_output_state2.run(); - if(_has_projection_weights) + if (_has_projection_weights) { _fully_connected_output_state.run(); - if(_perform_projection_clipping) + if (_perform_projection_clipping) { - CLScheduler::get().enqueue(_projection_clip); + _projection_clip.run(); } } - CLScheduler::get().enqueue(_copy_cell_state); - CLScheduler::get().enqueue(_copy_output); + _copy_cell_state.run(); + _copy_output.run(); _concat_scratch_buffer.run(); } void CLLSTMLayer::prepare() { - if(!_is_prepared) + if (!_is_prepared) { - CLScheduler::get().enqueue(_concat_weights_forget_gate); - if(!_run_cifg_opt) + _concat_weights_forget_gate.run(); + if (!_run_cifg_opt) { - CLScheduler::get().enqueue(_concat_weights_input_gate); + _concat_weights_input_gate.run(); } - CLScheduler::get().enqueue(_concat_weights_output); + _concat_weights_output.run(); _is_prepared = true; } } diff --git a/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp b/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp index c57fcc9f21..ea64eda023 100644 --- a/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp +++ b/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 ARM Limited. + * Copyright (c) 2019-2021 Arm Limited. 
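
The CLLSTMLayer::run() and prepare() hunks above all apply one dispatch change; a minimal sketch of the before/after pattern, with illustrative names (_act, _act_kernel, _transpose_kernel, src, dst are placeholders, not identifiers from this commit):

    // Before: every configured kernel was handed to the scheduler directly.
    CLScheduler::get().enqueue(_act_kernel);

    // After: the runtime function owns its kernel(s) and dispatches itself.
    _act.run();

    // Operators that no longer hold tensor state receive their tensors per
    // call via an ITensorPack, as done for the cell-state transpose above:
    ITensorPack pack;
    pack.add_tensor(TensorType::ACL_SRC, src); // src, dst: ICLTensor*
    pack.add_tensor(TensorType::ACL_DST, dst);
    CLScheduler::get().enqueue_op(*_transpose_kernel, pack, false); // false: no immediate flush
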
* * SPDX-License-Identifier: MIT * @@ -25,12 +25,14 @@ #include "arm_compute/runtime/CL/functions/CLLSTMLayerQuantized.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/Validate.h" + +#include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/helpers/AutoConfiguration.h" -#include <cmath> #include <memory> -#include <tuple> namespace arm_compute { @@ -44,44 +46,129 @@ const QuantizationInfo qsymm_0(1.f / 32768.f, 0); // qsymm16 with 0 integer bit } // namespace CLLSTMLayerQuantized::CLLSTMLayerQuantized(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _gemmlowp(), _output_stage(), _transpose_weights(), _concat_input_weights(), _concat_recurrent_weights(), _concat_weights(), _concat_inputs(), - _concat_bias(), _sigmoid_forget_gate(), _sigmoid_input_gate(), _sigmoid_output_gate(), _tanh_modulation_gate(), _tanh_output_state(), _add_cell_state_tmps(), _add2(), _mul_forget_gate_cell_state(), - _mul_input_gate_input_mod_gate(), _mul_output_state_tmp_output_gate(), _slice_input_tensor(), _slice_forget_tensor(), _slice_cell_tensor(), _slice_output_tensor(), _dequantize(), _quantize(), - _input_to_input_weights(nullptr), _input_to_forget_weights(nullptr), _input_to_cell_weights(nullptr), _input_to_output_weights(nullptr), _recurrent_to_input_weights(nullptr), - _recurrent_to_forget_weights(nullptr), _recurrent_to_cell_weights(nullptr), _recurrent_to_output_weights(nullptr), _input_gate_bias(nullptr), _forget_gate_bias(nullptr), _cell_bias(nullptr), - _output_gate_bias(nullptr), _recurrent_weights(), _input_weights(), _weights(), _input(), _weights_transposed(), _output_highp(), _output_lowp(), _bias(), _forget_gate_input(), _input_gate_input(), - _output_gate_input(), _input_modulation_gate_input(), _forget_gate_output(), _input_gate_output(), _output_gate_output(), _input_modulation_gate_output(), _cell_state_tmp1(), _cell_state_tmp2(), - _output_state_tmp(), _output_state_out_symm(), _output_state_out_f32(), _is_prepared(false) + : _memory_group(std::move(memory_manager)), + _gemmlowp(), + _output_stage(), + _transpose_weights(), + _concat_input_weights(), + _concat_recurrent_weights(), + _concat_weights(), + _concat_inputs(), + _concat_bias(), + _sigmoid_forget_gate(), + _sigmoid_input_gate(), + _sigmoid_output_gate(), + _tanh_modulation_gate(), + _tanh_output_state(), + _add_cell_state_tmps(), + _add2(), + _mul_forget_gate_cell_state(), + _mul_input_gate_input_mod_gate(), + _mul_output_state_tmp_output_gate(), + _slice_input_tensor(), + _slice_forget_tensor(), + _slice_cell_tensor(), + _slice_output_tensor(), + _dequantize(), + _quantize(), + _input_to_input_weights(nullptr), + _input_to_forget_weights(nullptr), + _input_to_cell_weights(nullptr), + _input_to_output_weights(nullptr), + _recurrent_to_input_weights(nullptr), + _recurrent_to_forget_weights(nullptr), + _recurrent_to_cell_weights(nullptr), + _recurrent_to_output_weights(nullptr), + _input_gate_bias(nullptr), + _forget_gate_bias(nullptr), + _cell_bias(nullptr), + _output_gate_bias(nullptr), + _recurrent_weights(), + _input_weights(), + _weights(), + _input(), + _weights_transposed(), + _output_highp(), + _output_lowp(), + _bias(), + _forget_gate_input(), + _input_gate_input(), + _output_gate_input(), + _input_modulation_gate_input(), + _forget_gate_output(), + _input_gate_output(), + _output_gate_output(), + 
_input_modulation_gate_output(), + _cell_state_tmp1(), + _cell_state_tmp2(), + _output_state_tmp(), + _output_state_out_symm(), + _output_state_out_f32(), + _is_prepared(false) { } void CLLSTMLayerQuantized::configure(const ICLTensor *input, - const ICLTensor *input_to_input_weights, const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights, - const ICLTensor *recurrent_to_input_weights, const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights, - const ICLTensor *input_gate_bias, const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias, - ICLTensor *cell_state_in, const ICLTensor *output_state_in, - ICLTensor *cell_state_out, ICLTensor *output_state_out) + const ICLTensor *input_to_input_weights, + const ICLTensor *input_to_forget_weights, + const ICLTensor *input_to_cell_weights, + const ICLTensor *input_to_output_weights, + const ICLTensor *recurrent_to_input_weights, + const ICLTensor *recurrent_to_forget_weights, + const ICLTensor *recurrent_to_cell_weights, + const ICLTensor *recurrent_to_output_weights, + const ICLTensor *input_gate_bias, + const ICLTensor *forget_gate_bias, + const ICLTensor *cell_bias, + const ICLTensor *output_gate_bias, + ICLTensor *cell_state_in, + const ICLTensor *output_state_in, + ICLTensor *cell_state_out, + ICLTensor *output_state_out) { - configure(CLKernelLibrary::get().get_compile_context(), input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_input_weights, - recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out, - output_state_out); + configure(CLKernelLibrary::get().get_compile_context(), input, input_to_input_weights, input_to_forget_weights, + input_to_cell_weights, input_to_output_weights, recurrent_to_input_weights, recurrent_to_forget_weights, + recurrent_to_cell_weights, recurrent_to_output_weights, input_gate_bias, forget_gate_bias, cell_bias, + output_gate_bias, cell_state_in, output_state_in, cell_state_out, output_state_out); } -void CLLSTMLayerQuantized::configure(const CLCompileContext &compile_context, const ICLTensor *input, - const ICLTensor *input_to_input_weights, const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights, - const ICLTensor *recurrent_to_input_weights, const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights, - const ICLTensor *input_gate_bias, const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias, - ICLTensor *cell_state_in, const ICLTensor *output_state_in, - ICLTensor *cell_state_out, ICLTensor *output_state_out) +void CLLSTMLayerQuantized::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *input_to_input_weights, + const ICLTensor *input_to_forget_weights, + const ICLTensor *input_to_cell_weights, + const ICLTensor *input_to_output_weights, + const ICLTensor *recurrent_to_input_weights, + const ICLTensor *recurrent_to_forget_weights, + const ICLTensor *recurrent_to_cell_weights, + const ICLTensor *recurrent_to_output_weights, + const ICLTensor *input_gate_bias, + const ICLTensor *forget_gate_bias, + const 
ICLTensor *cell_bias, + const ICLTensor *output_gate_bias, + ICLTensor *cell_state_in, + const ICLTensor *output_state_in, + ICLTensor *cell_state_out, + ICLTensor *output_state_out) { - ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, - recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, - input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out, output_state_out); - - ARM_COMPUTE_ERROR_THROW_ON(CLLSTMLayerQuantized::validate(input->info(), input_to_input_weights->info(), input_to_forget_weights->info(), input_to_cell_weights->info(), - input_to_output_weights->info(), - recurrent_to_input_weights->info(), recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(), - input_gate_bias->info(), forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), cell_state_in->info(), output_state_in->info(), cell_state_out->info(), output_state_out->info())); + ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, + input_to_output_weights, recurrent_to_input_weights, recurrent_to_forget_weights, + recurrent_to_cell_weights, recurrent_to_output_weights, input_gate_bias, + forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, + cell_state_out, output_state_out); + + ARM_COMPUTE_LOG_PARAMS(input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, + input_to_output_weights, recurrent_to_input_weights, recurrent_to_forget_weights, + recurrent_to_cell_weights, recurrent_to_output_weights, input_gate_bias, forget_gate_bias, + cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out, + output_state_out); + + ARM_COMPUTE_ERROR_THROW_ON(CLLSTMLayerQuantized::validate( + input->info(), input_to_input_weights->info(), input_to_forget_weights->info(), input_to_cell_weights->info(), + input_to_output_weights->info(), recurrent_to_input_weights->info(), recurrent_to_forget_weights->info(), + recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(), input_gate_bias->info(), + forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), cell_state_in->info(), + output_state_in->info(), cell_state_out->info(), output_state_out->info())); const int input_size = input->info()->dimension(0); const int batch_size = input->info()->dimension(1); @@ -89,8 +176,10 @@ void CLLSTMLayerQuantized::configure(const CLCompileContext &compile_context, co const QuantizationInfo qweights = input_to_input_weights->info()->quantization_info(); // Weights quantization - auto_init_if_empty(*cell_state_out->info(), TensorInfo(TensorShape(batch_size, output_size), 1, DataType::QSYMM16, qsymm_4)); - auto_init_if_empty(*output_state_out->info(), TensorInfo(TensorShape(batch_size, output_size), 1, DataType::QASYMM8, qasymm)); + auto_init_if_empty(*cell_state_out->info(), + TensorInfo(TensorShape(batch_size, output_size), 1, DataType::QSYMM16, qsymm_4)); + auto_init_if_empty(*output_state_out->info(), + TensorInfo(TensorShape(batch_size, output_size), 1, DataType::QASYMM8, qasymm)); _input_to_input_weights = input_to_input_weights; _input_to_forget_weights = input_to_forget_weights; @@ -118,17 +207,20 @@ void CLLSTMLayerQuantized::configure(const CLCompileContext &compile_context, co recurrent_weights_vector.emplace_back(recurrent_to_cell_weights); 
recurrent_weights_vector.emplace_back(recurrent_to_output_weights); - _input_weights.allocator()->init(TensorInfo(TensorShape(input_size, 4 * output_size), 1, DataType::QASYMM8, qweights)); + _input_weights.allocator()->init( + TensorInfo(TensorShape(input_size, 4 * output_size), 1, DataType::QASYMM8, qweights)); _concat_input_weights.configure(compile_context, inputs_weights_vector, &_input_weights, Window::DimY); - _recurrent_weights.allocator()->init(TensorInfo(TensorShape(output_size, 4 * output_size), 1, DataType::QASYMM8, qweights)); + _recurrent_weights.allocator()->init( + TensorInfo(TensorShape(output_size, 4 * output_size), 1, DataType::QASYMM8, qweights)); _concat_recurrent_weights.configure(compile_context, recurrent_weights_vector, &_recurrent_weights, Window::DimY); std::vector<const ICLTensor *> weights_vector; weights_vector.emplace_back(&_recurrent_weights); weights_vector.emplace_back(&_input_weights); - _weights.allocator()->init(TensorInfo(TensorShape(output_size + input_size, 4 * output_size), 1, DataType::QASYMM8, qweights)); + _weights.allocator()->init( + TensorInfo(TensorShape(output_size + input_size, 4 * output_size), 1, DataType::QASYMM8, qweights)); _concat_weights.configure(compile_context, weights_vector, &_weights, Window::DimX); _transpose_weights.configure(compile_context, &_weights, &_weights_transposed); @@ -138,7 +230,8 @@ void CLLSTMLayerQuantized::configure(const CLCompileContext &compile_context, co input_vector.emplace_back(output_state_in); _memory_group.manage(&_input); - _input.allocator()->init(TensorInfo(TensorShape(output_size + input_size, batch_size), 1, DataType::QASYMM8, qasymm)); + _input.allocator()->init( + TensorInfo(TensorShape(output_size + input_size, batch_size), 1, DataType::QASYMM8, qasymm)); _concat_inputs.configure(compile_context, input_vector, &_input, Window::DimX); // Bias concatenation @@ -153,7 +246,8 @@ void CLLSTMLayerQuantized::configure(const CLCompileContext &compile_context, co // Invert the offset for gemmlowp _input.info()->set_quantization_info(QuantizationInfo(qasymm.uniform().scale, -qasymm.uniform().offset)); - _weights_transposed.info()->set_quantization_info(QuantizationInfo(qweights.uniform().scale, -qweights.uniform().offset)); + _weights_transposed.info()->set_quantization_info( + QuantizationInfo(qweights.uniform().scale, -qweights.uniform().offset)); // Run gemmlowp _memory_group.manage(&_output_highp); @@ -163,7 +257,8 @@ void CLLSTMLayerQuantized::configure(const CLCompileContext &compile_context, co // Set the offset back _input.info()->set_quantization_info(QuantizationInfo(qasymm.uniform().scale, qasymm.uniform().offset)); - _weights_transposed.info()->set_quantization_info(QuantizationInfo(qweights.uniform().scale, qweights.uniform().offset)); + _weights_transposed.info()->set_quantization_info( + QuantizationInfo(qweights.uniform().scale, qweights.uniform().offset)); // multiplier = (input_scale * weights_scale) / output_scale (2 ^ (-12)) _output_lowp.allocator()->init(TensorInfo(_output_highp.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_3)); @@ -174,90 +269,122 @@ void CLLSTMLayerQuantized::configure(const CLCompileContext &compile_context, co quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift); _memory_group.manage(&_output_lowp); - _output_stage.configure(compile_context, &_output_highp, &_bias, &_output_lowp, output_multiplier, output_shift); + + GEMMLowpOutputStageInfo info{}; + info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT; + 
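// Sketch of the fixed-point requantization selected here (gemmlowp convention;
// not verbatim kernel code): the S32 accumulator is scaled as
//   out ~= acc * real_multiplier,
//   real_multiplier = (input_scale * weights_scale) / output_scale,
// with output_scale = 2^-12 as noted in the comment above, and
// calculate_quantized_multiplier() has already decomposed that value so that
//   real_multiplier ~= gemmlowp_multiplier * 2^(-gemmlowp_shift).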
info.gemmlowp_multiplier = output_multiplier; + info.gemmlowp_shift = output_shift; + info.output_data_type = DataType::QSYMM16; + _output_stage.configure(compile_context, &_output_highp, &_bias, &_output_lowp, info); _output_highp.allocator()->allocate(); _bias.allocator()->allocate(); // Get the gate tensors - if(batch_size > 1) + if (batch_size > 1) { _memory_group.manage(&_input_gate_input); - _slice_input_tensor.configure(compile_context, &_output_lowp, &_input_gate_input, { 0, 0 }, { output_size, batch_size }); + _slice_input_tensor.configure(compile_context, &_output_lowp, &_input_gate_input, {0, 0}, + {output_size, batch_size}); _memory_group.manage(&_forget_gate_input); - _slice_forget_tensor.configure(compile_context, &_output_lowp, &_forget_gate_input, { output_size, 0 }, { 2 * output_size, batch_size }); + _slice_forget_tensor.configure(compile_context, &_output_lowp, &_forget_gate_input, {output_size, 0}, + {2 * output_size, batch_size}); _memory_group.manage(&_input_modulation_gate_input); - _slice_cell_tensor.configure(compile_context, &_output_lowp, &_input_modulation_gate_input, { 2 * output_size, 0 }, { 3 * output_size, batch_size }); + _slice_cell_tensor.configure(compile_context, &_output_lowp, &_input_modulation_gate_input, + {2 * output_size, 0}, {3 * output_size, batch_size}); _memory_group.manage(&_output_gate_input); - _slice_output_tensor.configure(compile_context, &_output_lowp, &_output_gate_input, { 3 * output_size, 0 }, { 4 * output_size, batch_size }); + _slice_output_tensor.configure(compile_context, &_output_lowp, &_output_gate_input, {3 * output_size, 0}, + {4 * output_size, batch_size}); _output_lowp.allocator()->allocate(); } else { _memory_group.manage(&_input_gate_input); - _slice_input_tensor.configure(compile_context, &_output_lowp, &_input_gate_input, { 0 }, { output_size }); + _slice_input_tensor.configure(compile_context, &_output_lowp, &_input_gate_input, {0}, {output_size}); _memory_group.manage(&_forget_gate_input); - _slice_forget_tensor.configure(compile_context, &_output_lowp, &_forget_gate_input, { output_size }, { 2 * output_size }); + _slice_forget_tensor.configure(compile_context, &_output_lowp, &_forget_gate_input, {output_size}, + {2 * output_size}); _memory_group.manage(&_input_modulation_gate_input); - _slice_cell_tensor.configure(compile_context, &_output_lowp, &_input_modulation_gate_input, { 2 * output_size }, { 3 * output_size }); + _slice_cell_tensor.configure(compile_context, &_output_lowp, &_input_modulation_gate_input, {2 * output_size}, + {3 * output_size}); _memory_group.manage(&_output_gate_input); - _slice_output_tensor.configure(compile_context, &_output_lowp, &_output_gate_input, { 3 * output_size }, { 4 * output_size }); + _slice_output_tensor.configure(compile_context, &_output_lowp, &_output_gate_input, {3 * output_size}, + {4 * output_size}); _output_lowp.allocator()->allocate(); } // Forget gate _memory_group.manage(&_forget_gate_output); - _forget_gate_output.allocator()->init(TensorInfo(_forget_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); - _sigmoid_forget_gate.configure(compile_context, &_forget_gate_input, &_forget_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _forget_gate_output.allocator()->init( + TensorInfo(_forget_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); + _sigmoid_forget_gate.configure(compile_context, &_forget_gate_input, &_forget_gate_output, + 
ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); _forget_gate_input.allocator()->allocate(); // Input gate _memory_group.manage(&_input_gate_output); - _input_gate_output.allocator()->init(TensorInfo(_input_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); - _sigmoid_input_gate.configure(compile_context, &_input_gate_input, &_input_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _input_gate_output.allocator()->init( + TensorInfo(_input_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); + _sigmoid_input_gate.configure(compile_context, &_input_gate_input, &_input_gate_output, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); _input_gate_input.allocator()->allocate(); // Input modulation gate equation _memory_group.manage(&_input_modulation_gate_output); - _input_modulation_gate_output.allocator()->init(TensorInfo(_input_modulation_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); - _tanh_modulation_gate.configure(compile_context, &_input_modulation_gate_input, &_input_modulation_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f)); + _input_modulation_gate_output.allocator()->init( + TensorInfo(_input_modulation_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); + _tanh_modulation_gate.configure(compile_context, &_input_modulation_gate_input, &_input_modulation_gate_output, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f)); _input_modulation_gate_input.allocator()->allocate(); // Output gate _memory_group.manage(&_output_gate_output); - _output_gate_output.allocator()->init(TensorInfo(_output_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); - _sigmoid_output_gate.configure(compile_context, &_output_gate_input, &_output_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _output_gate_output.allocator()->init( + TensorInfo(_output_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); + _sigmoid_output_gate.configure(compile_context, &_output_gate_input, &_output_gate_output, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); _output_gate_input.allocator()->allocate(); // Long term memory _memory_group.manage(&_cell_state_tmp1); - _cell_state_tmp1.allocator()->init(TensorInfo(_forget_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_4)); - _mul_forget_gate_cell_state.configure(compile_context, &_forget_gate_output, cell_state_in, &_cell_state_tmp1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _cell_state_tmp1.allocator()->init( + TensorInfo(_forget_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_4)); + _mul_forget_gate_cell_state.configure(compile_context, &_forget_gate_output, cell_state_in, &_cell_state_tmp1, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); _forget_gate_output.allocator()->allocate(); _memory_group.manage(&_cell_state_tmp2); - _cell_state_tmp2.allocator()->init(TensorInfo(_input_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_4)); - _mul_input_gate_input_mod_gate.configure(compile_context, &_input_gate_output, &_input_modulation_gate_output, &_cell_state_tmp2, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _cell_state_tmp2.allocator()->init( + TensorInfo(_input_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_4)); + 
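// The multiplication configured below, together with the forget-gate product
// above and the saturating addition that follows, implements the LSTM cell
// update entirely in QSYMM16:
//   cell_state_out = forget_gate * cell_state_in + input_gate * input_modulation_gate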
_mul_input_gate_input_mod_gate.configure(compile_context, &_input_gate_output, &_input_modulation_gate_output, + &_cell_state_tmp2, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); _input_modulation_gate_output.allocator()->allocate(); _input_gate_output.allocator()->allocate(); - _add_cell_state_tmps.configure(compile_context, &_cell_state_tmp1, &_cell_state_tmp2, cell_state_out, ConvertPolicy::SATURATE); + _add_cell_state_tmps.configure(compile_context, &_cell_state_tmp1, &_cell_state_tmp2, cell_state_out, + ConvertPolicy::SATURATE); _cell_state_tmp1.allocator()->allocate(); _cell_state_tmp2.allocator()->allocate(); // Short term memory _memory_group.manage(&_output_state_tmp); - _output_state_tmp.allocator()->init(TensorInfo(cell_state_out->info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); - _tanh_output_state.configure(compile_context, cell_state_out, &_output_state_tmp, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f)); + _output_state_tmp.allocator()->init( + TensorInfo(cell_state_out->info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); + _tanh_output_state.configure(compile_context, cell_state_out, &_output_state_tmp, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f)); _memory_group.manage(&_output_state_out_symm); - _output_state_out_symm.allocator()->init(TensorInfo(_output_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); - _mul_output_state_tmp_output_gate.configure(compile_context, &_output_state_tmp, &_output_gate_output, &_output_state_out_symm, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _output_state_out_symm.allocator()->init( + TensorInfo(_output_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); + _mul_output_state_tmp_output_gate.configure(compile_context, &_output_state_tmp, &_output_gate_output, + &_output_state_out_symm, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); _output_gate_output.allocator()->allocate(); _output_state_tmp.allocator()->allocate(); // Requantize the output state from QSYMM16 to QASYMM8 _memory_group.manage(&_output_state_out_f32); - _output_state_out_f32.allocator()->init(TensorInfo(_output_state_out_symm.info()->tensor_shape(), 1, DataType::F32)); + _output_state_out_f32.allocator()->init( + TensorInfo(_output_state_out_symm.info()->tensor_shape(), 1, DataType::F32)); _dequantize.configure(compile_context, &_output_state_out_symm, &_output_state_out_f32); _output_state_out_symm.allocator()->allocate(); @@ -266,15 +393,29 @@ void CLLSTMLayerQuantized::configure(const CLCompileContext &compile_context, co } Status CLLSTMLayerQuantized::validate(const ITensorInfo *input, - const ITensorInfo *input_to_input_weights, const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights, - const ITensorInfo *recurrent_to_input_weights, const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights, - const ITensorInfo *input_gate_bias, const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias, - const ITensorInfo *cell_state_in, const ITensorInfo *output_state_in, - const ITensorInfo *cell_state_out, const ITensorInfo *output_state_out) + const ITensorInfo *input_to_input_weights, + const ITensorInfo *input_to_forget_weights, + const ITensorInfo *input_to_cell_weights, + const ITensorInfo *input_to_output_weights, + const ITensorInfo 
*recurrent_to_input_weights, + const ITensorInfo *recurrent_to_forget_weights, + const ITensorInfo *recurrent_to_cell_weights, + const ITensorInfo *recurrent_to_output_weights, + const ITensorInfo *input_gate_bias, + const ITensorInfo *forget_gate_bias, + const ITensorInfo *cell_bias, + const ITensorInfo *output_gate_bias, + const ITensorInfo *cell_state_in, + const ITensorInfo *output_state_in, + const ITensorInfo *cell_state_out, + const ITensorInfo *output_state_out) { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_input_weights, - recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, - output_state_in, cell_state_out, output_state_out); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR( + input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, + recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, + input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out, + output_state_out); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::QASYMM8); const int input_size = input->dimension(0); const int batch_size = input->dimension(1); @@ -286,29 +427,51 @@ Status CLLSTMLayerQuantized::validate(const ITensorInfo *input, ARM_COMPUTE_RETURN_ERROR_ON(input_gate_bias->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON(output_state_in->num_dimensions() > 2); - TensorInfo input_weights_info(input_to_input_weights->clone()->set_tensor_shape(TensorShape(input_size, output_size)).set_data_type(DataType::QASYMM8)); - TensorInfo recurrent_weights_info(input_to_input_weights->clone()->set_tensor_shape(TensorShape(output_size, output_size)).set_data_type(DataType::QASYMM8)); - TensorInfo bias_info(input_gate_bias->clone()->set_tensor_shape(TensorShape(output_size)).set_data_type(DataType::S32)); - TensorInfo output_state_info(cell_state_in->clone()->set_tensor_shape(TensorShape(output_size, batch_size)).set_data_type(DataType::QASYMM8).set_quantization_info(qasymm)); - TensorInfo cell_state_info(cell_state_in->clone()->set_tensor_shape(TensorShape(output_size, batch_size)).set_data_type(DataType::QSYMM16).set_quantization_info(qsymm_4)); + TensorInfo input_weights_info(input_to_input_weights->clone() + ->set_tensor_shape(TensorShape(input_size, output_size)) + .set_data_type(DataType::QASYMM8)); + TensorInfo recurrent_weights_info(input_to_input_weights->clone() + ->set_tensor_shape(TensorShape(output_size, output_size)) + .set_data_type(DataType::QASYMM8)); + TensorInfo bias_info( + input_gate_bias->clone()->set_tensor_shape(TensorShape(output_size)).set_data_type(DataType::S32)); + TensorInfo output_state_info(cell_state_in->clone() + ->set_tensor_shape(TensorShape(output_size, batch_size)) + .set_data_type(DataType::QASYMM8) + .set_quantization_info(qasymm)); + TensorInfo cell_state_info(cell_state_in->clone() + ->set_tensor_shape(TensorShape(output_size, batch_size)) + .set_data_type(DataType::QSYMM16) + .set_quantization_info(qsymm_4)); // Shape checks - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&input_weights_info, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&recurrent_weights_info, recurrent_to_input_weights, 
recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&bias_info, input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&input_weights_info, input_to_input_weights, input_to_forget_weights, + input_to_cell_weights, input_to_output_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&recurrent_weights_info, recurrent_to_input_weights, + recurrent_to_forget_weights, recurrent_to_cell_weights, + recurrent_to_output_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&bias_info, input_gate_bias, forget_gate_bias, cell_bias, + output_gate_bias); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&cell_state_info, cell_state_in); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&output_state_info, output_state_in); // Data type checks - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input_weights_info, input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&recurrent_weights_info, recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&bias_info, input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input_weights_info, input, input_to_input_weights, + input_to_forget_weights, input_to_cell_weights, + input_to_output_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&recurrent_weights_info, recurrent_to_input_weights, + recurrent_to_forget_weights, recurrent_to_cell_weights, + recurrent_to_output_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&bias_info, input_gate_bias, forget_gate_bias, cell_bias, + output_gate_bias); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&cell_state_info, cell_state_in); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&output_state_info, output_state_in); // Quantization checks - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input_to_input_weights, input_to_forget_weights, + input_to_cell_weights, input_to_output_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(recurrent_to_input_weights, recurrent_to_forget_weights, + recurrent_to_cell_weights, recurrent_to_output_weights); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&cell_state_info, cell_state_in); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&output_state_info, output_state_in); @@ -330,7 +493,8 @@ Status CLLSTMLayerQuantized::validate(const ITensorInfo *input, recurrent_weights_vector.emplace_back(recurrent_to_cell_weights); recurrent_weights_vector.emplace_back(recurrent_to_output_weights); const TensorInfo recurrent_weights(TensorShape(output_size, 4 * output_size), 1, DataType::QASYMM8, qweights); - ARM_COMPUTE_RETURN_ON_ERROR(CLConcatenateLayer::validate(recurrent_weights_vector, &recurrent_weights, Window::DimY)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLConcatenateLayer::validate(recurrent_weights_vector, &recurrent_weights, Window::DimY)); // _concat_weights 
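// The gate-wise input and recurrent matrices, already stacked along y above,
// are joined along x into a single
// (output_size + input_size) x (4 * output_size) matrix; with the input and
// the previous output state concatenated the same way, one GEMMLowp per step
// covers all four gates for both the input and the recurrent contributions.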
std::vector<const ITensorInfo *> weights_vector; @@ -340,7 +504,7 @@ Status CLLSTMLayerQuantized::validate(const ITensorInfo *input, ARM_COMPUTE_RETURN_ON_ERROR(CLConcatenateLayer::validate(weights_vector, &weights, Window::DimX)); // _transpose_weights const TensorShape weights_transposed_shape(weights.tensor_shape()[1], weights.tensor_shape()[0]); - TensorInfo weights_transposed = weights.clone()->set_is_resizable(true).set_tensor_shape(weights_transposed_shape); + TensorInfo weights_transposed = weights.clone()->set_is_resizable(true).set_tensor_shape(weights_transposed_shape); ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(&weights, &weights_transposed)); // _concat_inputs @@ -366,7 +530,8 @@ Status CLLSTMLayerQuantized::validate(const ITensorInfo *input, // _gemmlowp const TensorInfo output_highp(TensorShape(4 * output_size, batch_size), 1, DataType::S32); - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyCore::validate(&input_concatenated, &weights_transposed, nullptr, &output_highp)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLGEMMLowpMatrixMultiplyCore::validate(&input_concatenated, &weights_transposed, nullptr, &output_highp)); // Set the offset back input_concatenated.set_quantization_info(QuantizationInfo(qasymm.uniform().scale, qasymm.uniform().offset)); @@ -377,78 +542,107 @@ Status CLLSTMLayerQuantized::validate(const ITensorInfo *input, const float multiplier = 4096.f * qasymm.uniform().scale * qweights.uniform().scale; int output_multiplier = 0; int output_shift = 0; - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift)); + ARM_COMPUTE_RETURN_ON_ERROR( + quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift)); // _output_stage - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint::validate(&output_highp, &bias_concatenated, &output_lowp)); + GEMMLowpOutputStageInfo info{}; + info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT; + info.gemmlowp_multiplier = output_multiplier; + info.gemmlowp_shift = output_shift; + info.output_data_type = DataType::QSYMM16; + ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOutputStage::validate(&output_highp, &bias_concatenated, &output_lowp, info)); TensorInfo input_gate_input; TensorInfo forget_gate_input; TensorInfo input_modulation_gate_input; TensorInfo output_gate_input; - if(batch_size > 1) + if (batch_size > 1) { // _slice_input_tensor input_gate_input = TensorInfo(TensorShape(output_size, batch_size), 1, DataType::QSYMM16, qsymm_3); - ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &input_gate_input, { 0, 0 }, { output_size, batch_size })); + ARM_COMPUTE_RETURN_ON_ERROR( + CLSlice::validate(&output_lowp, &input_gate_input, {0, 0}, {output_size, batch_size})); // _slice_forget_tensor forget_gate_input = TensorInfo(TensorShape(output_size, batch_size), 1, DataType::QSYMM16, qsymm_3); - ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &forget_gate_input, { output_size, 0 }, { 2 * output_size, batch_size })); + ARM_COMPUTE_RETURN_ON_ERROR( + CLSlice::validate(&output_lowp, &forget_gate_input, {output_size, 0}, {2 * output_size, batch_size})); // _slice_cell_tensor input_modulation_gate_input = TensorInfo(TensorShape(output_size, batch_size), 1, DataType::QSYMM16, qsymm_3); - ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &input_modulation_gate_input, { 2 * output_size, 0 }, { 3 * output_size, batch_size })); + ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, 
&input_modulation_gate_input, {2 * output_size, 0}, + {3 * output_size, batch_size})); // _slice_output_tensor output_gate_input = TensorInfo(TensorShape(output_size, batch_size), 1, DataType::QSYMM16, qsymm_3); - ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &output_gate_input, { 3 * output_size, 0 }, { 4 * output_size, batch_size })); + ARM_COMPUTE_RETURN_ON_ERROR( + CLSlice::validate(&output_lowp, &output_gate_input, {3 * output_size, 0}, {4 * output_size, batch_size})); } else { // _slice_input_tensor input_gate_input = TensorInfo(TensorShape(output_size), 1, DataType::QSYMM16, qsymm_3); - ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &input_gate_input, { 0 }, { output_size })); + ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &input_gate_input, {0}, {output_size})); // _slice_forget_tensor forget_gate_input = TensorInfo(TensorShape(output_size), 1, DataType::QSYMM16, qsymm_3); - ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &forget_gate_input, { output_size }, { 2 * output_size })); + ARM_COMPUTE_RETURN_ON_ERROR( + CLSlice::validate(&output_lowp, &forget_gate_input, {output_size}, {2 * output_size})); // _slice_cell_tensor input_modulation_gate_input = TensorInfo(TensorShape(output_size), 1, DataType::QSYMM16, qsymm_3); - ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &input_modulation_gate_input, { 2 * output_size }, { 3 * output_size })); + ARM_COMPUTE_RETURN_ON_ERROR( + CLSlice::validate(&output_lowp, &input_modulation_gate_input, {2 * output_size}, {3 * output_size})); // _slice_output_tensor output_gate_input = TensorInfo(TensorShape(output_size), 1, DataType::QSYMM16, qsymm_3); - ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &output_gate_input, { 3 * output_size }, { 4 * output_size })); + ARM_COMPUTE_RETURN_ON_ERROR( + CLSlice::validate(&output_lowp, &output_gate_input, {3 * output_size}, {4 * output_size})); } // _sigmoid_forget_gate const TensorInfo forget_gate_output(forget_gate_input.tensor_shape(), 1, DataType::QSYMM16, qsymm_0); - ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&forget_gate_input, &forget_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); + ARM_COMPUTE_RETURN_ON_ERROR( + CLActivationLayer::validate(&forget_gate_input, &forget_gate_output, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); // _sigmoid_input_gate const TensorInfo input_gate_output(input_gate_input.tensor_shape(), 1, DataType::QSYMM16, qsymm_0); - ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&input_gate_input, &input_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); + ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate( + &input_gate_input, &input_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); // _tanh_modulation_gate - const TensorInfo input_modulation_gate_output(input_modulation_gate_input.tensor_shape(), 1, DataType::QSYMM16, qsymm_0); - ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&input_modulation_gate_input, &input_modulation_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f))); + const TensorInfo input_modulation_gate_output(input_modulation_gate_input.tensor_shape(), 1, DataType::QSYMM16, + qsymm_0); + ARM_COMPUTE_RETURN_ON_ERROR( + CLActivationLayer::validate(&input_modulation_gate_input, &input_modulation_gate_output, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 
1.0f))); // _sigmoid_output_gate const TensorInfo output_gate_output(output_gate_input.tensor_shape(), 1, DataType::QSYMM16, qsymm_0); - ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&output_gate_input, &output_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); + ARM_COMPUTE_RETURN_ON_ERROR( + CLActivationLayer::validate(&output_gate_input, &output_gate_output, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); // _mul_forget_gate_cell_state const TensorInfo cell_state_tmp1(forget_gate_output.tensor_shape(), 1, DataType::QSYMM16, qsymm_4); - ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&forget_gate_output, cell_state_in, &cell_state_tmp1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate( + &forget_gate_output, cell_state_in, &cell_state_tmp1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); // _mul_input_gate_input_mod_gate const TensorInfo cell_state_tmp2(input_gate_output.tensor_shape(), 1, DataType::QSYMM16, qsymm_4); - ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&input_gate_output, &input_modulation_gate_output, &cell_state_tmp2, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&input_gate_output, &input_modulation_gate_output, + &cell_state_tmp2, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO)); // _add_cell_state_tmps - ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&cell_state_tmp1, &cell_state_tmp2, cell_state_out, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLArithmeticAddition::validate(&cell_state_tmp1, &cell_state_tmp2, cell_state_out, ConvertPolicy::SATURATE)); // _tanh_modulation_gate const TensorInfo output_state_tmp(cell_state_out->tensor_shape(), 1, DataType::QSYMM16, qsymm_0); - ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(cell_state_out, &output_state_tmp, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f))); + ARM_COMPUTE_RETURN_ON_ERROR( + CLActivationLayer::validate(cell_state_out, &output_state_tmp, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f))); // _mul_output_state_tmp_output_gate const TensorInfo output_state_out_symm(output_gate_output.tensor_shape(), 1, DataType::QSYMM16, qsymm_0); - ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&output_state_tmp, &output_gate_output, &output_state_out_symm, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&output_state_tmp, &output_gate_output, + &output_state_out_symm, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO)); // _dequantize const TensorInfo output_state_out_f32(output_state_out_symm.tensor_shape(), 1, DataType::F32); @@ -457,14 +651,14 @@ Status CLLSTMLayerQuantized::validate(const ITensorInfo *input, // _quantize ARM_COMPUTE_RETURN_ON_ERROR(CLQuantizationLayer::validate(&output_state_out_f32, output_state_out)); - if(cell_state_out->total_size() != 0) + if (cell_state_out->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&cell_state_info, cell_state_out); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&cell_state_info, cell_state_out); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&cell_state_info, cell_state_out); } - if(output_state_out->total_size() != 0) + if (output_state_out->total_size() != 0) { 
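// Mirroring the cell_state_out block above: these checks run only when the
// caller passed a fully initialized TensorInfo (total_size() != 0), in which
// case it must match the QASYMM8 output_state_info derived at the top of
// validate().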
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&output_state_info, output_state_out); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&output_state_info, output_state_out); @@ -523,7 +717,7 @@ void CLLSTMLayerQuantized::run() void CLLSTMLayerQuantized::prepare() { - if(!_is_prepared) + if (!_is_prepared) { _input_weights.allocator()->allocate(); _concat_input_weights.run(); diff --git a/src/runtime/CL/functions/CLLaplacianPyramid.cpp b/src/runtime/CL/functions/CLLaplacianPyramid.cpp deleted file mode 100644 index 831f0cdcdf..0000000000 --- a/src/runtime/CL/functions/CLLaplacianPyramid.cpp +++ /dev/null @@ -1,109 +0,0 @@ -/* - * Copyright (c) 2017-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "arm_compute/runtime/CL/functions/CLLaplacianPyramid.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/IPyramid.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/runtime/CL/CLTensor.h" -#include "arm_compute/runtime/CL/functions/CLDepthConvertLayer.h" -#include "arm_compute/runtime/CL/functions/CLElementwiseOperations.h" -#include "arm_compute/runtime/CL/functions/CLGaussian5x5.h" -#include "arm_compute/runtime/CL/functions/CLGaussianPyramid.h" - -using namespace arm_compute; - -CLLaplacianPyramid::CLLaplacianPyramid() // NOLINT - : _num_levels(0), - _gaussian_pyr_function(), - _convf(), - _subf(), - _depth_function(), - _gauss_pyr(), - _conv_pyr() -{ -} - -void CLLaplacianPyramid::configure(ICLTensor *input, CLPyramid *pyramid, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, pyramid, output, border_mode, constant_border_value); -} - -void CLLaplacianPyramid::configure(const CLCompileContext &compile_context, ICLTensor *input, CLPyramid *pyramid, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value) -{ - ARM_COMPUTE_ERROR_ON(nullptr == pyramid); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S16); - ARM_COMPUTE_ERROR_ON(0 == pyramid->info()->num_levels()); - ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != pyramid->info()->width()); - ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) != pyramid->info()->height()); - ARM_COMPUTE_ERROR_ON(output->info()->dimension(0) != pyramid->get_pyramid_level(pyramid->info()->num_levels() - 1)->info()->dimension(0)); - ARM_COMPUTE_ERROR_ON(output->info()->dimension(1) != pyramid->get_pyramid_level(pyramid->info()->num_levels() - 1)->info()->dimension(1)); - - _num_levels = pyramid->info()->num_levels(); - - // Create and initialize the gaussian pyramid and the convoluted pyramid - PyramidInfo pyramid_info; - pyramid_info.init(_num_levels, 0.5f, pyramid->info()->tensor_shape(), arm_compute::Format::U8); - - _gauss_pyr.init(pyramid_info); - _conv_pyr.init(pyramid_info); - - // Create Gaussian Pyramid function - _gaussian_pyr_function.configure(compile_context, input, &_gauss_pyr, border_mode, constant_border_value); - - _convf.resize(_num_levels); - _subf.resize(_num_levels); - - for(unsigned int i = 0; i < _num_levels; ++i) - { - _convf[i].configure(compile_context, _gauss_pyr.get_pyramid_level(i), _conv_pyr.get_pyramid_level(i), border_mode, constant_border_value); - _subf[i].configure(compile_context, _gauss_pyr.get_pyramid_level(i), _conv_pyr.get_pyramid_level(i), pyramid->get_pyramid_level(i), ConvertPolicy::WRAP); - } - - _depth_function.configure(compile_context, _conv_pyr.get_pyramid_level(_num_levels - 1), output, ConvertPolicy::WRAP, 0); - - _gauss_pyr.allocate(); - _conv_pyr.allocate(); -} - -void CLLaplacianPyramid::run() -{ - ARM_COMPUTE_ERROR_ON_MSG(0 == _num_levels, "Unconfigured function"); - - _gaussian_pyr_function.run(); // compute gaussian pyramid - - for(unsigned int i = 0; i < _num_levels; ++i) - { - _convf[i].run(); // convolute gaussian pyramid - } - - for(unsigned int i = 0; i < _num_levels; ++i) - { - _subf[i].run(); // compute laplacian image - } - - _depth_function.run(); -} diff --git a/src/runtime/CL/functions/CLLaplacianReconstruct.cpp b/src/runtime/CL/functions/CLLaplacianReconstruct.cpp deleted file mode 100644 index 
ea6a3f9a98..0000000000 --- a/src/runtime/CL/functions/CLLaplacianReconstruct.cpp +++ /dev/null @@ -1,106 +0,0 @@ -/* - * Copyright (c) 2017-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/CL/functions/CLLaplacianReconstruct.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/IPyramid.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" - -#include <cstddef> - -using namespace arm_compute; - -CLLaplacianReconstruct::CLLaplacianReconstruct() // NOLINT - : _tmp_pyr(), - _addf(), - _scalef(), - _depthf() -{ -} - -void CLLaplacianReconstruct::configure(const CLPyramid *pyramid, ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value) -{ - configure(CLKernelLibrary::get().get_compile_context(), pyramid, input, output, border_mode, constant_border_value); -} - -void CLLaplacianReconstruct::configure(const CLCompileContext &compile_context, const CLPyramid *pyramid, ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value) -{ - ARM_COMPUTE_ERROR_ON(nullptr == pyramid); - ARM_COMPUTE_ERROR_ON(input == output); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S16); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON(input->info()->num_dimensions() != pyramid->get_pyramid_level(0)->info()->num_dimensions()); - ARM_COMPUTE_ERROR_ON(output->info()->num_dimensions() != pyramid->get_pyramid_level(0)->info()->num_dimensions()); - ARM_COMPUTE_ERROR_ON(output->info()->dimension(0) != pyramid->get_pyramid_level(0)->info()->dimension(0)); - ARM_COMPUTE_ERROR_ON(output->info()->dimension(1) != pyramid->get_pyramid_level(0)->info()->dimension(1)); - ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != pyramid->get_pyramid_level(pyramid->info()->num_levels() - 1)->info()->dimension(0)); - ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) != pyramid->get_pyramid_level(pyramid->info()->num_levels() - 1)->info()->dimension(1)); - - const size_t num_levels = pyramid->info()->num_levels(); - - // Create and initialize the tmp pyramid: I(n-2) = upsample( input + Laplace(n-1) ) - PyramidInfo pyramid_info; - pyramid_info.init(num_levels, 0.5f, output->info()->tensor_shape(), arm_compute::Format::S16); - _tmp_pyr.init(pyramid_info); - - // Allocate add and scale functions. 
Level 0 does not need to be scaled. - _addf.resize(num_levels); - _scalef.resize(num_levels - 1); - - const size_t last_level = num_levels - 1; - - _addf[last_level].configure(compile_context, input, pyramid->get_pyramid_level(last_level), _tmp_pyr.get_pyramid_level(last_level), ConvertPolicy::SATURATE); - - // Scale levels n-1 to 1, and add levels n-2 to 0 - for(size_t l = 0; l < last_level; ++l) - { - _scalef[l].configure(compile_context, _tmp_pyr.get_pyramid_level(l + 1), _tmp_pyr.get_pyramid_level(l), arm_compute::InterpolationPolicy::NEAREST_NEIGHBOR, border_mode, constant_border_value); - _addf[l].configure(compile_context, _tmp_pyr.get_pyramid_level(l), pyramid->get_pyramid_level(l), _tmp_pyr.get_pyramid_level(l), ConvertPolicy::SATURATE); - } - - // Convert level 0 from S16 to U8 - _depthf.configure(compile_context, _tmp_pyr.get_pyramid_level(0), output, ConvertPolicy::SATURATE, 0); - - _tmp_pyr.allocate(); -} - -void CLLaplacianReconstruct::run() -{ - ARM_COMPUTE_ERROR_ON_MSG(_addf.empty(), "Unconfigured function"); - - const size_t last_level = _tmp_pyr.info()->num_levels() - 1; - - _addf[last_level].run(); - - // Run l = [last_level - 1, 0] - for(size_t l = last_level; l-- > 0;) - { - _scalef[l].run(); - _addf[l].run(); - } - - _depthf.run(); -} diff --git a/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp b/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp deleted file mode 100644 index 950be5030f..0000000000 --- a/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp +++ /dev/null @@ -1,211 +0,0 @@ -/* - * Copyright (c) 2017-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
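The two deletions above retire the pyramid pair: CLLaplacianPyramid stored each level as the Gaussian level minus its Gaussian5x5-filtered copy, and CLLaplacianReconstruct re-added the levels from the coarsest one up to level 0. Its reverse loop relies on the "l-- > 0" idiom to walk an unsigned index down to zero; a standalone sketch, with scale_up and add_level as hypothetical stand-ins for the per-level CLScale and CLArithmeticAddition runs:

    #include <cstddef>
    #include <functional>

    // Hypothetical stand-ins for the per-level kernel launches.
    void reconstruct(size_t num_levels,
                     const std::function<void(size_t)> &scale_up,  // tmp[l] = upsample(tmp[l + 1])
                     const std::function<void(size_t)> &add_level) // tmp[l] += laplace[l]
    {
        const size_t last_level = num_levels - 1;
        // "l-- > 0" tests before decrementing, so the body runs for
        // l = last_level - 1 down to 0 and never sees a wrapped index.
        for (size_t l = last_level; l-- > 0;)
        {
            scale_up(l);
            add_level(l);
        }
    }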
- */ -#include "arm_compute/runtime/CL/functions/CLLocallyConnectedLayer.h" - -#include "arm_compute/core/PixelValue.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/runtime/CL/CLScheduler.h" - -#include <cmath> -#include <tuple> - -using namespace arm_compute; - -namespace -{ -void calculate_shapes(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - TensorShape &shape_wr, TensorShape &shape_im2col, TensorShape &shape_gemm) -{ - ARM_COMPUTE_UNUSED(output); - - const unsigned int kernel_width = weights->dimension(0); - const unsigned int kernel_height = weights->dimension(1); - - bool has_bias = (biases != nullptr); - - // Get convolved dimensions - unsigned int conv_w = 0; - unsigned int conv_h = 0; - std::tie(conv_w, conv_h) = scaled_dimensions(input->dimension(0), - input->dimension(1), - kernel_width, - kernel_height, - conv_info); - - const size_t mat_weights_cols = weights->dimension(3); - const size_t mat_weights_rows = weights->dimension(0) * weights->dimension(1) * weights->dimension(2) + ((has_bias) ? 1 : 0); - const size_t mat_weights_num = weights->dimension(4); - - shape_wr = TensorShape(mat_weights_cols, mat_weights_rows, mat_weights_num); - - const size_t mat_input_cols = mat_weights_rows; - const size_t mat_input_rows = conv_w * conv_h; - - shape_im2col = input->tensor_shape(); - if(shape_im2col.num_dimensions() >= 3) - { - shape_im2col.remove_dimension(2); - } - shape_im2col.set(0, mat_input_cols); - shape_im2col.set(1, mat_input_rows); - - shape_gemm = shape_im2col; - shape_gemm.set(0, mat_weights_cols); - shape_gemm.set(1, mat_input_rows); -} -} // namespace - -CLLocallyConnectedLayer::CLLocallyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _input_im2col_kernel(), _weights_reshape_kernel(), _mm_kernel(), _output_col2im_kernel(), _input_im2col_reshaped(), _weights_reshaped(), _gemm_output(), - _is_prepared(false), _original_weights(nullptr) -{ -} - -Status CLLocallyConnectedLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); - ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(2) != input->dimension(2)); - ARM_COMPUTE_RETURN_ERROR_ON(!conv_info.padding_is_symmetric()); - - bool has_bias = (biases != nullptr); - - if(has_bias) - { - ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(3)); - ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 2); - } - - const unsigned int kernel_width = weights->dimension(0); - const unsigned int kernel_height = weights->dimension(1); - - // Get convolved dimensions - unsigned int conv_w = 0; - unsigned int conv_h = 0; - std::tie(conv_w, conv_h) = scaled_dimensions(input->dimension(0), input->dimension(1), kernel_width, kernel_height, - conv_info); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG((output->dimension(0) != conv_w) || (output->dimension(1) != conv_h), "Output shape does not match the expected one"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(4) != (conv_w * conv_h), "Weights shape does not match the expected one"); - - // Calculate intermediate buffer shapes - TensorShape shape_wr; - TensorShape shape_im2col; - TensorShape shape_gemm; - calculate_shapes(input, weights, biases, output, conv_info, shape_wr, shape_im2col, shape_gemm); - - 
TensorInfo weights_reshaped_info(shape_wr, 1, weights->data_type()); - TensorInfo input_im2col_reshaped_info(shape_im2col, 1, input->data_type()); - TensorInfo gemm_output_info(shape_gemm, 1, input->data_type()); - - ARM_COMPUTE_RETURN_ON_ERROR(CLIm2ColKernel::validate(input, &input_im2col_reshaped_info, Size2D(kernel_width, kernel_height), conv_info, has_bias)); - ARM_COMPUTE_RETURN_ON_ERROR(CLWeightsReshapeKernel::validate(weights, biases, &weights_reshaped_info)); - ARM_COMPUTE_RETURN_ON_ERROR(CLLocallyConnectedMatrixMultiplyKernel::validate(&input_im2col_reshaped_info, &weights_reshaped_info, &gemm_output_info)); - ARM_COMPUTE_RETURN_ON_ERROR(CLCol2ImKernel::validate(&gemm_output_info, output, Size2D(conv_w, conv_h))); - - return Status{}; -} - -void CLLocallyConnectedLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info); -} - -void CLLocallyConnectedLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, - const PadStrideInfo &conv_info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); - ARM_COMPUTE_ERROR_THROW_ON(CLLocallyConnectedLayer::validate(input->info(), weights->info(), biases == nullptr ? nullptr : biases->info(), output->info(), conv_info)); - - bool _has_bias = (biases != nullptr); - _original_weights = weights; - _is_prepared = false; - - const unsigned int kernel_width = weights->info()->dimension(0); - const unsigned int kernel_height = weights->info()->dimension(1); - - // Get convolved dimensions - unsigned int conv_w = 0; - unsigned int conv_h = 0; - std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), kernel_width, kernel_height, - conv_info); - - // Calculate intermediate buffer shapes - TensorShape shape_wr; - TensorShape shape_im2col; - TensorShape shape_gemm; - calculate_shapes(input->info(), weights->info(), biases == nullptr ? 
nullptr : biases->info(), output->info(), conv_info, shape_wr, shape_im2col, shape_gemm); - - _weights_reshaped.allocator()->init(TensorInfo(shape_wr, 1, weights->info()->data_type())); - _input_im2col_reshaped.allocator()->init(TensorInfo(shape_im2col, 1, input->info()->data_type())); - _gemm_output.allocator()->init(TensorInfo(shape_gemm, 1, input->info()->data_type())); - - // Manage intermediate buffers - _memory_group.manage(&_input_im2col_reshaped); - _memory_group.manage(&_gemm_output); - - // Configure kernels - _input_im2col_kernel.configure(compile_context, input, &_input_im2col_reshaped, Size2D(kernel_width, kernel_height), conv_info, _has_bias); - _weights_reshape_kernel.configure(compile_context, weights, biases, &_weights_reshaped); - _mm_kernel.configure(compile_context, &_input_im2col_reshaped, &_weights_reshaped, &_gemm_output); - _output_col2im_kernel.configure(compile_context, &_gemm_output, output, Size2D(conv_w, conv_h)); - - // Allocate intermediate tensors - _input_im2col_reshaped.allocator()->allocate(); - _gemm_output.allocator()->allocate(); - - CLScheduler::get().tune_kernel_static(_input_im2col_kernel); -} - -void CLLocallyConnectedLayer::run() -{ - prepare(); - - MemoryGroupResourceScope scope_mg(_memory_group); - - // Run input reshaping - CLScheduler::get().enqueue(_input_im2col_kernel); - - // Runs vector matrix multiply on reshaped matrices - CLScheduler::get().enqueue(_mm_kernel); - - // Reshape output matrix - CLScheduler::get().enqueue(_output_col2im_kernel, false); -} - -void CLLocallyConnectedLayer::prepare() -{ - if(!_is_prepared) - { - ARM_COMPUTE_ERROR_ON(!_original_weights->is_used()); - - // Run weights reshaping and mark original weights tensor as unused - _weights_reshaped.allocator()->allocate(); - CLScheduler::get().enqueue(_weights_reshape_kernel); - _original_weights->mark_as_unused(); - - CLScheduler::get().queue().finish(); - _is_prepared = true; - } -} diff --git a/src/runtime/CL/functions/CLLogicalAnd.cpp b/src/runtime/CL/functions/CLLogicalAnd.cpp new file mode 100644 index 0000000000..ea21c54bc3 --- /dev/null +++ b/src/runtime/CL/functions/CLLogicalAnd.cpp @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2020-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
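The locally-connected deletion above lowered the convolution to im2col plus a per-output-position matrix multiply, with calculate_shapes() deriving every intermediate buffer from the kernel and input geometry. The same arithmetic as a standalone sketch, assuming symmetric padding and floor rounding (an illustration, not the library's scaled_dimensions()):

    #include <cstddef>

    struct Im2ColShape
    {
        size_t cols; // one column per kernel element, +1 when a bias is fused in
        size_t rows; // one row per output position
    };

    Im2ColShape im2col_shape(size_t in_w, size_t in_h, size_t in_c,
                             size_t k_w, size_t k_h,
                             size_t pad, size_t stride, bool has_bias)
    {
        const size_t conv_w = (in_w + 2 * pad - k_w) / stride + 1;
        const size_t conv_h = (in_h + 2 * pad - k_h) / stride + 1;
        return {k_w * k_h * in_c + (has_bias ? 1u : 0u), conv_w * conv_h};
    }

    // e.g. im2col_shape(32, 32, 3, 5, 5, 2, 1, true) yields {76, 1024}:
    // 5*5*3 + 1 = 76 columns and 32*32 = 1024 output positions.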
+ */ +#include "arm_compute/runtime/CL/functions/CLLogicalAnd.h" + +#include "arm_compute/core/CL/ICLTensor.h" + +#include "src/common/utils/Log.h" +#include "src/gpu/cl/kernels/ClElementwiseKernel.h" + +#include <utility> + +namespace arm_compute +{ +namespace experimental +{ +void CLLogicalAnd::configure(const CLCompileContext &compile_context, + ITensorInfo *input1, + ITensorInfo *input2, + ITensorInfo *output) +{ + ARM_COMPUTE_LOG_PARAMS(input1, input2, output); + auto k = std::make_unique<arm_compute::opencl::kernels::ClLogicalBinaryKernel>(); + k->configure(compile_context, LogicalOperation::And, input1, input2, output); + _kernel = std::move(k); +} + +Status CLLogicalAnd::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output) +{ + return arm_compute::opencl::kernels::ClLogicalBinaryKernel::validate(LogicalOperation::And, input1, input2, output); +} + +void CLLogicalAnd::run(ITensorPack &tensors) +{ + ICLOperator::run(tensors); +} +} // namespace experimental + +struct CLLogicalAnd::Impl +{ + const ICLTensor *src0{nullptr}; + const ICLTensor *src1{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<experimental::CLLogicalAnd> op{nullptr}; +}; + +CLLogicalAnd::CLLogicalAnd() : _impl(std::make_unique<Impl>()) +{ +} +CLLogicalAnd::CLLogicalAnd(CLLogicalAnd &&) = default; +CLLogicalAnd &CLLogicalAnd::operator=(CLLogicalAnd &&) = default; +CLLogicalAnd::~CLLogicalAnd() = default; + +void CLLogicalAnd::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output) +{ + configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output); +} + +void CLLogicalAnd::configure(const CLCompileContext &compile_context, + ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output) +{ + _impl->src0 = input1; + _impl->src1 = input2; + _impl->dst = output; + _impl->op = std::make_unique<experimental::CLLogicalAnd>(); + _impl->op->configure(compile_context, input1->info(), input2->info(), output->info()); +} + +Status CLLogicalAnd::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output) +{ + return experimental::CLLogicalAnd::validate(input1, input2, output); +} + +void CLLogicalAnd::run() +{ + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC_0, _impl->src0); + pack.add_tensor(TensorType::ACL_SRC_1, _impl->src1); + pack.add_tensor(TensorType::ACL_DST, _impl->dst); + + _impl->op->run(pack); +} +} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLLogicalNot.cpp b/src/runtime/CL/functions/CLLogicalNot.cpp new file mode 100644 index 0000000000..71f9cce54f --- /dev/null +++ b/src/runtime/CL/functions/CLLogicalNot.cpp @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2020-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/CL/functions/CLLogicalNot.h" + +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" + +#include "src/core/CL/ICLKernel.h" +#include "src/gpu/cl/operators/ClLogicalNot.h" + +namespace arm_compute +{ +struct CLLogicalNot::Impl +{ + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<opencl::ClLogicalNot> op{nullptr}; +}; + +CLLogicalNot::CLLogicalNot() : _impl(std::make_unique<Impl>()) +{ +} +CLLogicalNot::CLLogicalNot(CLLogicalNot &&) = default; +CLLogicalNot &CLLogicalNot::operator=(CLLogicalNot &&) = default; +CLLogicalNot::~CLLogicalNot() = default; + +void CLLogicalNot::configure(const ICLTensor *input, ICLTensor *output) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, output); +} + +void CLLogicalNot::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output) +{ + _impl->src = input; + _impl->dst = output; + _impl->op = std::make_unique<opencl::ClLogicalNot>(); + _impl->op->configure(compile_context, input->info(), output->info()); +} + +Status CLLogicalNot::validate(const ITensorInfo *input, const ITensorInfo *output) +{ + return opencl::ClLogicalNot::validate(input, output); +} + +void CLLogicalNot::run() +{ + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC, _impl->src); + pack.add_tensor(TensorType::ACL_DST, _impl->dst); + + _impl->op->run(pack); +} + +} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLLogicalOr.cpp b/src/runtime/CL/functions/CLLogicalOr.cpp new file mode 100644 index 0000000000..3db4fdae84 --- /dev/null +++ b/src/runtime/CL/functions/CLLogicalOr.cpp @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2020-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
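CLLogicalAnd and CLLogicalNot above (and CLLogicalOr below) all follow the same split: an operator configured purely on ITensorInfo objects, wrapped by a public function whose Impl captures the ICLTensors and rebinds them through an ITensorPack on every run(). Keeping tensor memory out of the operator is what lets the same kernels serve both the function API and the operator API. A minimal usage sketch, assuming a device with a working OpenCL runtime (the logical functions take U8 tensors):

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLLogicalAnd.h"

    using namespace arm_compute;

    int main()
    {
        CLScheduler::get().default_init(); // create context, queue and kernel library

        const TensorInfo info(TensorShape(16U, 16U), 1, DataType::U8);
        CLTensor a, b, out;
        a.allocator()->init(info);
        b.allocator()->init(info);
        out.allocator()->init(info);

        CLLogicalAnd land;
        land.configure(&a, &b, &out); // picks up the default compile context

        a.allocator()->allocate();
        b.allocator()->allocate();
        out.allocator()->allocate();

        land.run();                // packs src0/src1/dst and runs the operator
        CLScheduler::get().sync(); // wait for the queue to drain
        return 0;
    }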
+ */ +#include "arm_compute/runtime/CL/functions/CLLogicalOr.h" + +#include "arm_compute/core/CL/ICLTensor.h" + +#include "src/common/utils/Log.h" +#include "src/gpu/cl/kernels/ClElementwiseKernel.h" + +#include <utility> + +namespace arm_compute +{ +namespace experimental +{ +void CLLogicalOr::configure(const CLCompileContext &compile_context, + ITensorInfo *input1, + ITensorInfo *input2, + ITensorInfo *output) +{ + ARM_COMPUTE_LOG_PARAMS(input1, input2, output); + auto k = std::make_unique<arm_compute::opencl::kernels::ClLogicalBinaryKernel>(); + k->configure(compile_context, LogicalOperation::Or, input1, input2, output); + _kernel = std::move(k); +} + +Status CLLogicalOr::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output) +{ + return arm_compute::opencl::kernels::ClLogicalBinaryKernel::validate(LogicalOperation::Or, input1, input2, output); +} + +void CLLogicalOr::run(ITensorPack &tensors) +{ + ICLOperator::run(tensors); +} +} // namespace experimental + +struct CLLogicalOr::Impl +{ + const ICLTensor *src0{nullptr}; + const ICLTensor *src1{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<experimental::CLLogicalOr> op{nullptr}; +}; + +CLLogicalOr::CLLogicalOr() : _impl(std::make_unique<Impl>()) +{ +} +CLLogicalOr::CLLogicalOr(CLLogicalOr &&) = default; +CLLogicalOr &CLLogicalOr::operator=(CLLogicalOr &&) = default; +CLLogicalOr::~CLLogicalOr() = default; + +void CLLogicalOr::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output) +{ + configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output); +} + +void CLLogicalOr::configure(const CLCompileContext &compile_context, + ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output) +{ + _impl->src0 = input1; + _impl->src1 = input2; + _impl->dst = output; + _impl->op = std::make_unique<experimental::CLLogicalOr>(); + _impl->op->configure(compile_context, input1->info(), input2->info(), output->info()); +} + +Status CLLogicalOr::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output) +{ + return experimental::CLLogicalOr::validate(input1, input2, output); +} + +void CLLogicalOr::run() +{ + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC_0, _impl->src0); + pack.add_tensor(TensorType::ACL_SRC_1, _impl->src1); + pack.add_tensor(TensorType::ACL_DST, _impl->dst); + + _impl->op->run(pack); +} +} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLMagnitude.cpp b/src/runtime/CL/functions/CLMagnitude.cpp deleted file mode 100644 index a267952d4a..0000000000 --- a/src/runtime/CL/functions/CLMagnitude.cpp +++ /dev/null @@ -1,43 +0,0 @@ -/* - * Copyright (c) 2016-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/CL/functions/CLMagnitude.h" - -#include "arm_compute/core/CL/kernels/CLMagnitudePhaseKernel.h" -#include "support/MemorySupport.h" - -#include <utility> - -using namespace arm_compute; - -void CLMagnitude::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, MagnitudeType mag_type) -{ - configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, mag_type); -} - -void CLMagnitude::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, MagnitudeType mag_type) -{ - auto k = arm_compute::support::cpp14::make_unique<CLMagnitudePhaseKernel>(); - k->configure(compile_context, input1, input2, output, nullptr, mag_type); - _kernel = std::move(k); -} diff --git a/src/runtime/CL/functions/CLMatMul.cpp b/src/runtime/CL/functions/CLMatMul.cpp new file mode 100644 index 0000000000..e8bdad706b --- /dev/null +++ b/src/runtime/CL/functions/CLMatMul.cpp @@ -0,0 +1,87 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
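The removed CLMagnitude was a one-kernel function: it instantiated CLMagnitudePhaseKernel with the phase output disabled (the nullptr argument). As a scalar model of the per-pixel computation (an illustration assuming saturation to the S16 output format, not the OpenCL source):

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // MagnitudeType::L1NORM -> |gx| + |gy|; MagnitudeType::L2NORM -> sqrt(gx^2 + gy^2).
    int16_t magnitude(int16_t gx, int16_t gy, bool l2norm)
    {
        const float fx = static_cast<float>(gx);
        const float fy = static_cast<float>(gy);
        const float m  = l2norm ? std::sqrt(fx * fx + fy * fy) : std::fabs(fx) + std::fabs(fy);
        return static_cast<int16_t>(std::min(m, 32767.0f)); // saturate to S16
    }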
+ */ +#include "arm_compute/runtime/CL/functions/CLMatMul.h" + +#include "arm_compute/runtime/CL/CLTensor.h" +#include "arm_compute/runtime/CL/CLTypes.h" + +#include "src/gpu/cl/operators/ClMatMul.h" + +namespace arm_compute +{ +using OperatorType = opencl::ClMatMul; + +struct CLMatMul::Impl +{ + std::unique_ptr<OperatorType> op{nullptr}; + ITensorPack run_pack{}; +}; +CLMatMul::CLMatMul() : _impl(std::make_unique<Impl>()) +{ +} + +CLMatMul::~CLMatMul() = default; + +void CLMatMul::configure(ICLTensor *lhs, + ICLTensor *rhs, + ICLTensor *output, + const MatMulInfo &matmul_info, + const GpuMatMulSettings &settings, + const ActivationLayerInfo &act_info) +{ + ARM_COMPUTE_UNUSED(settings); + configure(CLKernelLibrary::get().get_compile_context(), lhs, rhs, output, matmul_info, settings, act_info); +} + +void CLMatMul::configure(const CLCompileContext &compile_context, + ICLTensor *lhs, + ICLTensor *rhs, + ICLTensor *output, + const MatMulInfo &matmul_info, + const GpuMatMulSettings &settings, + const ActivationLayerInfo &act_info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, output); + ARM_COMPUTE_UNUSED(settings); + + _impl->op = std::make_unique<OperatorType>(); + _impl->op->configure(compile_context, lhs->info(), rhs->info(), output->info(), matmul_info, act_info); + _impl->run_pack = {{ACL_SRC_0, lhs}, {ACL_SRC_1, rhs}, {ACL_DST, output}}; +} + +Status CLMatMul::validate(const ITensorInfo *lhs, + const ITensorInfo *rhs, + const ITensorInfo *output, + const MatMulInfo &matmul_info, + const ActivationLayerInfo &act_info) +{ + return OperatorType::validate(lhs, rhs, output, matmul_info, act_info); +} + +void CLMatMul::run() +{ + _impl->op->run(_impl->run_pack); +} + +} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLMaxUnpoolingLayer.cpp b/src/runtime/CL/functions/CLMaxUnpoolingLayer.cpp new file mode 100644 index 0000000000..7494f379b9 --- /dev/null +++ b/src/runtime/CL/functions/CLMaxUnpoolingLayer.cpp @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2020-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
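CLMatMul above is the new batched matrix-multiply entry point: configure() hands the tensor infos to the opencl::ClMatMul operator and keeps an ITensorPack that run() replays. A usage sketch, assuming the header defaults the trailing activation argument, and using square 64x64 operands to sidestep the (x, y) = (columns, rows) shape convention:

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/CLTypes.h"
    #include "arm_compute/runtime/CL/functions/CLMatMul.h"

    using namespace arm_compute;

    int main()
    {
        CLScheduler::get().default_init();

        const TensorInfo info(TensorShape(64U, 64U), 1, DataType::F32);
        CLTensor lhs, rhs, dst;
        lhs.allocator()->init(info);
        rhs.allocator()->init(info);
        dst.allocator()->init(info);

        CLMatMul mm;
        mm.configure(&lhs, &rhs, &dst, MatMulInfo(), GpuMatMulSettings());

        lhs.allocator()->allocate();
        rhs.allocator()->allocate();
        dst.allocator()->allocate();

        mm.run();
        CLScheduler::get().sync();
        return 0;
    }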
+ */ +#include "arm_compute/runtime/CL/functions/CLMaxUnpoolingLayer.h" + +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/PixelValue.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/CL/CLScheduler.h" + +#include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLMaxUnpoolingLayerKernel.h" + +namespace arm_compute +{ +CLMaxUnpoolingLayer::CLMaxUnpoolingLayer() + : _fill(), _unpooling_layer_kernel(std::make_unique<CLMaxUnpoolingLayerKernel>()) +{ +} + +CLMaxUnpoolingLayer::~CLMaxUnpoolingLayer() = default; + +void CLMaxUnpoolingLayer::configure(ICLTensor *input, + ICLTensor *indices, + ICLTensor *output, + const PoolingLayerInfo &pool_info) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, indices, output, pool_info); +} + +void CLMaxUnpoolingLayer::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *indices, + ICLTensor *output, + const PoolingLayerInfo &pool_info) +{ + ARM_COMPUTE_LOG_PARAMS(input, indices, output, pool_info); + const PixelValue zero_value(0.f); + _fill.configure(output, zero_value); + + _unpooling_layer_kernel->configure(compile_context, input, indices, output, pool_info); +} + +Status CLMaxUnpoolingLayer::validate(const ITensorInfo *input, + const ITensorInfo *indices, + const ITensorInfo *output, + const PoolingLayerInfo &pool_info) +{ + return CLMaxUnpoolingLayerKernel::validate(input, indices, output, pool_info); +} + +void CLMaxUnpoolingLayer::run() +{ + // Run fill + _fill.run(); + + // Run max unpooling layer + CLScheduler::get().enqueue(*_unpooling_layer_kernel); +} +} /* namespace arm_compute */ diff --git a/src/runtime/CL/functions/CLMeanStdDev.cpp b/src/runtime/CL/functions/CLMeanStdDev.cpp deleted file mode 100644 index e3ce704bfb..0000000000 --- a/src/runtime/CL/functions/CLMeanStdDev.cpp +++ /dev/null @@ -1,172 +0,0 @@ -/* - * Copyright (c) 2016-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
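CLMaxUnpoolingLayer above is a two-stage function: a CLFill zeroes the destination, then the kernel scatters every pooled value back to the position recorded by the matching pooling layer's indices output. A scalar model of the data movement, assuming the indices hold flat offsets into the destination:

    #include <algorithm>
    #include <cstddef>
    #include <cstdint>

    // Stage 1: zero-fill (every non-max location stays 0).
    // Stage 2: scatter each pooled value to where its max came from.
    void max_unpool(const float *src, const uint32_t *indices, size_t n_src,
                    float *dst, size_t n_dst)
    {
        std::fill(dst, dst + n_dst, 0.0f);
        for (size_t i = 0; i < n_src; ++i)
        {
            dst[indices[i]] = src[i];
        }
    }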
- */ -#include "arm_compute/core/TensorInfo.h" - -#include "arm_compute/runtime/CL/CLScheduler.h" -#include "arm_compute/runtime/CL/functions/CLMeanStdDev.h" - -using namespace arm_compute; - -CLMeanStdDev::CLMeanStdDev(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT - : _memory_group(std::move(memory_manager)), - _data_type(), - _num_pixels(), - _run_stddev(), - _reduction_operation_mean(), - _reduction_operation_stddev(), - _reduction_output_mean(), - _reduction_output_stddev(), - _mean(nullptr), - _stddev(nullptr), - _mean_stddev_kernel(), - _fill_border_kernel(), - _global_sum(), - _global_sum_squared() -{ -} - -Status CLMeanStdDev::validate(ITensorInfo *input, float *mean, float *stddev) -{ - ARM_COMPUTE_RETURN_ERROR_ON_TENSOR_NOT_2D(input); - if(is_data_type_float(input->data_type())) - { - ARM_COMPUTE_UNUSED(mean); - ARM_COMPUTE_UNUSED(stddev); - - TensorShape output_shape = TensorShape{ 1, input->dimension(1) }; - TensorInfo output_shape_info = TensorInfo(output_shape, 1, DataType::U8); - return CLReductionOperation::validate(input, &output_shape_info, 0, ReductionOperation::SUM); - } - else - { - return CLMeanStdDevKernel::validate(input, mean, nullptr, stddev, nullptr); - } -} - -void CLMeanStdDev::configure(ICLImage *input, float *mean, float *stddev) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, mean, stddev); -} - -void CLMeanStdDev::configure(const CLCompileContext &compile_context, ICLImage *input, float *mean, float *stddev) -{ - // In the case of F16/F32 we call reduction operation for calculating CLMeanStdDev - _data_type = input->info()->data_type(); - - if(is_data_type_float(_data_type)) - { - _num_pixels = input->info()->dimension(0) * input->info()->dimension(1); - - _memory_group.manage(&_reduction_output_mean); - _reduction_operation_mean.configure(compile_context, input, &_reduction_output_mean, 0, ReductionOperation::SUM); - _reduction_output_mean.allocator()->allocate(); - _mean = mean; - - if(stddev != nullptr) - { - _memory_group.manage(&_reduction_output_stddev); - _reduction_operation_stddev.configure(compile_context, input, &_reduction_output_stddev, 0, ReductionOperation::SUM_SQUARE); - _reduction_output_stddev.allocator()->allocate(); - _stddev = stddev; - _run_stddev = true; - } - } - else - { - _global_sum = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_ulong)); - - if(stddev != nullptr) - { - _global_sum_squared = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_ulong)); - } - - _mean_stddev_kernel.configure(compile_context, input, mean, &_global_sum, stddev, &_global_sum_squared); - _fill_border_kernel.configure(compile_context, input, _mean_stddev_kernel.border_size(), BorderMode::CONSTANT, PixelValue(static_cast<uint8_t>(0))); - } -} - -template <typename T> -void CLMeanStdDev::run_float() -{ - MemoryGroupResourceScope scope_mg(_memory_group); - - // Perform reduction on x-axis - _reduction_operation_mean.run(); - if(_run_stddev) - { - _reduction_operation_stddev.run(); - _reduction_output_stddev.map(true); - } - - _reduction_output_mean.map(true); - - auto mean = static_cast<T>(0); - - // Calculate final result for mean - for(unsigned int i = 0; i < _reduction_output_mean.info()->dimension(1); ++i) - { - mean += *reinterpret_cast<T *>(_reduction_output_mean.buffer() + _reduction_output_mean.info()->offset_element_in_bytes(Coordinates(0, i))); - } - - mean /= _num_pixels; - *_mean = mean; - - if(_run_stddev) - { - auto stddev = 
static_cast<T>(0); - // Calculate final result for stddev - for(unsigned int i = 0; i < _reduction_output_stddev.info()->dimension(1); ++i) - { - stddev += *reinterpret_cast<T *>(_reduction_output_stddev.buffer() + _reduction_output_stddev.info()->offset_element_in_bytes(Coordinates(0, i))); - } - *_stddev = std::sqrt((stddev / _num_pixels) - (mean * mean)); - - _reduction_output_stddev.unmap(); - } - _reduction_output_mean.unmap(); -} - -void CLMeanStdDev::run_int() -{ - CLScheduler::get().enqueue(_fill_border_kernel); - CLScheduler::get().enqueue(_mean_stddev_kernel); -} - -void CLMeanStdDev::run() -{ - switch(_data_type) - { - case DataType::F16: - run_float<half>(); - break; - case DataType::F32: - run_float<float>(); - break; - case DataType::U8: - run_int(); - break; - default: - ARM_COMPUTE_ERROR_ON("Not supported"); - } -} diff --git a/src/runtime/CL/functions/CLMeanStdDevNormalizationLayer.cpp b/src/runtime/CL/functions/CLMeanStdDevNormalizationLayer.cpp index 3dbab76c72..5892c0e840 100644 --- a/src/runtime/CL/functions/CLMeanStdDevNormalizationLayer.cpp +++ b/src/runtime/CL/functions/CLMeanStdDevNormalizationLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 ARM Limited. + * Copyright (c) 2019-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,9 +23,10 @@ */ #include "arm_compute/runtime/CL/functions/CLMeanStdDevNormalizationLayer.h" -#include "arm_compute/core/CL/kernels/CLMeanStdDevNormalizationKernel.h" #include "arm_compute/core/Types.h" -#include "support/MemorySupport.h" + +#include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLMeanStdDevNormalizationKernel.h" namespace arm_compute { @@ -34,9 +35,13 @@ void CLMeanStdDevNormalizationLayer::configure(ICLTensor *input, ICLTensor *outp configure(CLKernelLibrary::get().get_compile_context(), input, output, epsilon); } -void CLMeanStdDevNormalizationLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, float epsilon) +void CLMeanStdDevNormalizationLayer::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output, + float epsilon) { - auto k = arm_compute::support::cpp14::make_unique<CLMeanStdDevNormalizationKernel>(); + ARM_COMPUTE_LOG_PARAMS(input, output, epsilon); + auto k = std::make_unique<CLMeanStdDevNormalizationKernel>(); k->configure(compile_context, input, output, epsilon); _kernel = std::move(k); } diff --git a/src/runtime/CL/functions/CLMedian3x3.cpp b/src/runtime/CL/functions/CLMedian3x3.cpp deleted file mode 100644 index dc53240f79..0000000000 --- a/src/runtime/CL/functions/CLMedian3x3.cpp +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Copyright (c) 2016-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/CL/functions/CLMedian3x3.h" - -#include "arm_compute/core/CL/kernels/CLMedian3x3Kernel.h" -#include "arm_compute/core/PixelValue.h" -#include "support/MemorySupport.h" - -#include <utility> - -using namespace arm_compute; - -void CLMedian3x3::configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output, border_mode, constant_border_value); -} - -void CLMedian3x3::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value) -{ - auto k = arm_compute::support::cpp14::make_unique<CLMedian3x3Kernel>(); - k->configure(compile_context, input, output, border_mode == BorderMode::UNDEFINED); - _kernel = std::move(k); - _border_handler.configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); -} diff --git a/src/runtime/CL/functions/CLMinMaxLocation.cpp b/src/runtime/CL/functions/CLMinMaxLocation.cpp deleted file mode 100644 index 15b28330b5..0000000000 --- a/src/runtime/CL/functions/CLMinMaxLocation.cpp +++ /dev/null @@ -1,106 +0,0 @@ -/* - * Copyright (c) 2017-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
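The removed CLMedian3x3 paired CLMedian3x3Kernel with a border handler sized to the kernel's border_size(), then selected the middle of the nine neighbourhood values per pixel. A scalar reference model:

    #include <algorithm>
    #include <array>
    #include <cstdint>

    // Median of a 3x3 window: nth_element is enough, a full sort is not needed.
    uint8_t median3x3(std::array<uint8_t, 9> window)
    {
        std::nth_element(window.begin(), window.begin() + 4, window.end());
        return window[4];
    }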
- */ -#include "arm_compute/runtime/CL/functions/CLMinMaxLocation.h" - -#include "arm_compute/core/CL/CLHelpers.h" - -namespace arm_compute -{ -CLMinMaxLocation::CLMinMaxLocation() - : _min_max_kernel(), - _min_max_loc_kernel(), - _min_max_vals(), - _min_max_count_vals(), - _min(nullptr), - _max(nullptr), - _min_count(nullptr), - _max_count(nullptr), - _min_loc(nullptr), - _max_loc(nullptr) -{ -} - -void CLMinMaxLocation::configure(const ICLImage *input, void *min, void *max, CLCoordinates2DArray *min_loc, CLCoordinates2DArray *max_loc, uint32_t *min_count, uint32_t *max_count) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, min, max, min_loc, max_loc, min_count, max_count); -} - -void CLMinMaxLocation::configure(const CLCompileContext &compile_context, const ICLImage *input, void *min, void *max, CLCoordinates2DArray *min_loc, CLCoordinates2DArray *max_loc, - uint32_t *min_count, - uint32_t *max_count) -{ - ARM_COMPUTE_ERROR_ON(nullptr == min); - ARM_COMPUTE_ERROR_ON(nullptr == max); - - _min_max_vals = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, 2 * sizeof(int32_t)); - _min_max_count_vals = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, 2 * sizeof(uint32_t)); - _min = min; - _max = max; - _min_count = min_count; - _max_count = max_count; - _min_loc = min_loc; - _max_loc = max_loc; - - _min_max_kernel.configure(compile_context, input, &_min_max_vals); - _min_max_loc_kernel.configure(compile_context, input, &_min_max_vals, &_min_max_count_vals, _min_loc, _max_loc); -} - -void CLMinMaxLocation::run() -{ - cl::CommandQueue q = CLScheduler::get().queue(); - - CLScheduler::get().enqueue(_min_max_kernel, false); - CLScheduler::get().enqueue(_min_max_loc_kernel, false); - - // Update min and max - q.enqueueReadBuffer(_min_max_vals, CL_FALSE, 0 * sizeof(int32_t), sizeof(int32_t), static_cast<int32_t *>(_min)); - q.enqueueReadBuffer(_min_max_vals, CL_FALSE, 1 * sizeof(int32_t), sizeof(int32_t), static_cast<int32_t *>(_max)); - - // Update min and max count - if(_min_count != nullptr) - { - q.enqueueReadBuffer(_min_max_count_vals, CL_FALSE, 0 * sizeof(uint32_t), sizeof(uint32_t), _min_count); - } - if(_max_count != nullptr) - { - q.enqueueReadBuffer(_min_max_count_vals, CL_FALSE, 1 * sizeof(uint32_t), sizeof(uint32_t), _max_count); - } - - // Update min/max point arrays (Makes the kernel blocking) - if(_min_loc != nullptr) - { - unsigned int min_count = 0; - q.enqueueReadBuffer(_min_max_count_vals, CL_TRUE, 0 * sizeof(uint32_t), sizeof(uint32_t), &min_count); - size_t min_corner_size = std::min(static_cast<size_t>(min_count), _min_loc->max_num_values()); - _min_loc->resize(min_corner_size); - } - if(_max_loc != nullptr) - { - unsigned int max_count = 0; - q.enqueueReadBuffer(_min_max_count_vals, CL_TRUE, 1 * sizeof(uint32_t), sizeof(uint32_t), &max_count); - size_t max_corner_size = std::min(static_cast<size_t>(max_count), _max_loc->max_num_values()); - _max_loc->resize(max_corner_size); - } -} -} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLNonLinearFilter.cpp b/src/runtime/CL/functions/CLNonLinearFilter.cpp deleted file mode 100644 index 96912a21cd..0000000000 --- a/src/runtime/CL/functions/CLNonLinearFilter.cpp +++ /dev/null @@ -1,46 +0,0 @@ -/* - * Copyright (c) 2017-2020 ARM Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/CL/functions/CLNonLinearFilter.h" - -#include "arm_compute/core/CL/kernels/CLNonLinearFilterKernel.h" -#include "support/MemorySupport.h" - -#include <utility> - -using namespace arm_compute; - -void CLNonLinearFilter::configure(ICLTensor *input, ICLTensor *output, NonLinearFilterFunction function, unsigned int mask_size, MatrixPattern pattern, const uint8_t *mask, - BorderMode border_mode, uint8_t constant_border_value) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output, function, mask_size, pattern, mask, border_mode, constant_border_value); -} - -void CLNonLinearFilter::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, NonLinearFilterFunction function, unsigned int mask_size, MatrixPattern pattern, - const uint8_t *mask, BorderMode border_mode, uint8_t constant_border_value) -{ - auto k = arm_compute::support::cpp14::make_unique<CLNonLinearFilterKernel>(); - k->configure(compile_context, input, output, function, mask_size, pattern, mask, border_mode == BorderMode::UNDEFINED); - _kernel = std::move(k); - _border_handler.configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); -} diff --git a/src/runtime/CL/functions/CLNonMaximaSuppression3x3.cpp b/src/runtime/CL/functions/CLNonMaximaSuppression3x3.cpp deleted file mode 100644 index 6d4a28db26..0000000000 --- a/src/runtime/CL/functions/CLNonMaximaSuppression3x3.cpp +++ /dev/null @@ -1,52 +0,0 @@ -/* - * Copyright (c) 2016-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/CL/functions/CLNonMaximaSuppression3x3.h" - -#include "arm_compute/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.h" -#include "support/MemorySupport.h" - -#include <utility> - -using namespace arm_compute; - -void CLNonMaximaSuppression3x3::configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output, border_mode); -} - -void CLNonMaximaSuppression3x3::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, BorderMode border_mode) -{ - auto k = arm_compute::support::cpp14::make_unique<CLNonMaximaSuppression3x3Kernel>(); - k->configure(compile_context, input, output, border_mode == BorderMode::UNDEFINED); - _kernel = std::move(k); - - if(border_mode != BorderMode::UNDEFINED) - { - _border_handler.configure(compile_context, input, _kernel->border_size(), BorderMode::CONSTANT); - } - else - { - _border_handler.configure(compile_context, input, _kernel->border_size(), BorderMode::UNDEFINED); - } -} diff --git a/src/runtime/CL/functions/CLNormalizationLayer.cpp b/src/runtime/CL/functions/CLNormalizationLayer.cpp index f59a4ca959..f93f82f1a2 100644 --- a/src/runtime/CL/functions/CLNormalizationLayer.cpp +++ b/src/runtime/CL/functions/CLNormalizationLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 ARM Limited. + * Copyright (c) 2017-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -25,44 +25,66 @@ #include "arm_compute/runtime/CL/functions/CLNormalizationLayer.h" #include "arm_compute/core/Error.h" +#include "arm_compute/core/PixelValue.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" -using namespace arm_compute; +#include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLNormalizationLayerKernel.h" +namespace arm_compute +{ CLNormalizationLayer::CLNormalizationLayer() - : _norm_kernel(), _border_handler() + : _norm_kernel(std::make_unique<CLNormalizationLayerKernel>()), + _border_handler(std::make_unique<CLFillBorderKernel>()) { } +CLNormalizationLayer::~CLNormalizationLayer() = default; + void CLNormalizationLayer::configure(ICLTensor *input, ICLTensor *output, const NormalizationLayerInfo &norm_info) { configure(CLKernelLibrary::get().get_compile_context(), input, output, norm_info); } -void CLNormalizationLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const NormalizationLayerInfo &norm_info) +void CLNormalizationLayer::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output, + const NormalizationLayerInfo &norm_info) { ARM_COMPUTE_ERROR_ON(input == nullptr); + ARM_COMPUTE_LOG_PARAMS(input, output, norm_info); // Configure normalization kernel - _norm_kernel.configure(compile_context, input, output, norm_info); + _norm_kernel->configure(compile_context, input, output, norm_info); - // Fill the border by 3 elements since we need vload4 in the IN_MAP normalization kernel - _border_handler.configure(compile_context, input, _norm_kernel.border_size(), BorderMode::CONSTANT, PixelValue()); + if 
(!_norm_kernel->border_size().empty()) + { + // Fill the border by 3 elements since we need vload4 in the IN_MAP normalization kernel + _border_handler->configure(compile_context, input, _norm_kernel->border_size(), BorderMode::CONSTANT, + PixelValue()); + } } -Status CLNormalizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const NormalizationLayerInfo &norm_info) +Status CLNormalizationLayer::validate(const ITensorInfo *input, + const ITensorInfo *output, + const NormalizationLayerInfo &norm_info) { return CLNormalizationLayerKernel::validate(input, output, norm_info); } void CLNormalizationLayer::run() { - // Run border handler - CLScheduler::get().enqueue(_border_handler, false); + if (!_norm_kernel->border_size().empty()) + { + // Run border handler + CLScheduler::get().enqueue(*_border_handler, false); + } // Run normalization kernel - CLScheduler::get().enqueue(_norm_kernel); + CLScheduler::get().enqueue(*_norm_kernel); } +} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLNormalizePlanarYUVLayer.cpp b/src/runtime/CL/functions/CLNormalizePlanarYUVLayer.cpp index b03de6475b..939c95bd45 100644 --- a/src/runtime/CL/functions/CLNormalizePlanarYUVLayer.cpp +++ b/src/runtime/CL/functions/CLNormalizePlanarYUVLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 ARM Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,27 +24,37 @@ #include "arm_compute/runtime/CL/functions/CLNormalizePlanarYUVLayer.h" -#include "arm_compute/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h" -#include "support/MemorySupport.h" +#include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h" #include <utility> namespace arm_compute { -void CLNormalizePlanarYUVLayer::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *std) +void CLNormalizePlanarYUVLayer::configure(const ICLTensor *input, + ICLTensor *output, + const ICLTensor *mean, + const ICLTensor *std) { configure(CLKernelLibrary::get().get_compile_context(), input, output, mean, std); } -void CLNormalizePlanarYUVLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *std) +void CLNormalizePlanarYUVLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const ICLTensor *mean, + const ICLTensor *std) { - auto k = arm_compute::support::cpp14::make_unique<CLNormalizePlanarYUVLayerKernel>(); + ARM_COMPUTE_LOG_PARAMS(input, output, mean, std); + auto k = std::make_unique<CLNormalizePlanarYUVLayerKernel>(); k->configure(compile_context, input, output, mean, std); _kernel = std::move(k); } -Status CLNormalizePlanarYUVLayer::validate(const ITensorInfo *input, const ITensorInfo *output, - const ITensorInfo *mean, const ITensorInfo *std) +Status CLNormalizePlanarYUVLayer::validate(const ITensorInfo *input, + const ITensorInfo *output, + const ITensorInfo *mean, + const ITensorInfo *std) { return CLNormalizePlanarYUVLayerKernel::validate(input, output, mean, std); } diff --git a/src/runtime/CL/functions/CLOpticalFlow.cpp b/src/runtime/CL/functions/CLOpticalFlow.cpp deleted file mode 100644 index 5f7c1704ee..0000000000 --- a/src/runtime/CL/functions/CLOpticalFlow.cpp +++ /dev/null @@ -1,179 +0,0 @@ -/* - * Copyright (c) 2017-2020 ARM Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/CL/functions/CLOpticalFlow.h" - -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLLKTrackerKernel.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Window.h" -#include "arm_compute/runtime/CL/CLPyramid.h" -#include "arm_compute/runtime/CL/CLScheduler.h" -#include "arm_compute/runtime/CL/CLTensor.h" -#include "arm_compute/runtime/CL/CLTensorAllocator.h" -#include "arm_compute/runtime/CL/functions/CLScharr3x3.h" -#include "support/MemorySupport.h" - -using namespace arm_compute; - -CLOpticalFlow::CLOpticalFlow(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT - : _memory_group(std::move(memory_manager)), - _tracker_init_kernel(), - _tracker_stage0_kernel(), - _tracker_stage1_kernel(), - _tracker_finalize_kernel(), - _func_scharr(), - _scharr_gx(), - _scharr_gy(), - _old_points(nullptr), - _new_points_estimates(nullptr), - _new_points(nullptr), - _old_points_internal(), - _new_points_internal(), - _coefficient_table(), - _old_values(), - _num_levels(0) -{ -} - -void CLOpticalFlow::configure(const CLPyramid *old_pyramid, const CLPyramid *new_pyramid, - const ICLKeyPointArray *old_points, const ICLKeyPointArray *new_points_estimates, ICLKeyPointArray *new_points, - Termination termination, float epsilon, size_t num_iterations, size_t window_dimension, bool use_initial_estimate, - BorderMode border_mode, uint8_t constant_border_value) -{ - configure(CLKernelLibrary::get().get_compile_context(), old_pyramid, new_pyramid, old_points, new_points_estimates, new_points, termination, epsilon, num_iterations, window_dimension, - use_initial_estimate, border_mode, constant_border_value); -} - -void CLOpticalFlow::configure(const CLCompileContext &compile_context, const CLPyramid *old_pyramid, const CLPyramid *new_pyramid, - const ICLKeyPointArray *old_points, const ICLKeyPointArray *new_points_estimates, ICLKeyPointArray *new_points, - Termination termination, float epsilon, size_t num_iterations, size_t window_dimension, bool use_initial_estimate, - BorderMode border_mode, uint8_t constant_border_value) -{ - ARM_COMPUTE_ERROR_ON(nullptr == old_pyramid); - ARM_COMPUTE_ERROR_ON(nullptr == new_pyramid); - ARM_COMPUTE_ERROR_ON(nullptr == old_points); - ARM_COMPUTE_ERROR_ON(nullptr == new_points_estimates); - ARM_COMPUTE_ERROR_ON(nullptr == new_points); - 
ARM_COMPUTE_ERROR_ON(old_pyramid->info()->num_levels() != new_pyramid->info()->num_levels()); - ARM_COMPUTE_ERROR_ON(0 == old_pyramid->info()->num_levels()); - ARM_COMPUTE_ERROR_ON(old_pyramid->info()->width() != new_pyramid->info()->width()); - ARM_COMPUTE_ERROR_ON(old_pyramid->info()->height() != new_pyramid->info()->height()); - ARM_COMPUTE_ERROR_ON(use_initial_estimate && old_points->num_values() != new_points_estimates->num_values()); - - // Set member variables - _old_points = old_points; - _new_points_estimates = new_points_estimates; - _new_points = new_points; - _num_levels = old_pyramid->info()->num_levels(); - - const float pyr_scale = old_pyramid->info()->scale(); - const int list_length = old_points->num_values(); - const int old_values_list_length = list_length * window_dimension * window_dimension; - - // Create kernels and tensors - _tracker_init_kernel.resize(_num_levels); - _tracker_stage0_kernel.resize(_num_levels); - _tracker_stage1_kernel.resize(_num_levels); - _func_scharr.resize(_num_levels); - _scharr_gx.resize(_num_levels); - _scharr_gy.resize(_num_levels); - - // Create internal keypoint arrays - _old_points_internal = arm_compute::support::cpp14::make_unique<CLLKInternalKeypointArray>(list_length); - _old_points_internal->resize(list_length); - _new_points_internal = arm_compute::support::cpp14::make_unique<CLLKInternalKeypointArray>(list_length); - _new_points_internal->resize(list_length); - _coefficient_table = arm_compute::support::cpp14::make_unique<CLCoefficientTableArray>(list_length); - _coefficient_table->resize(list_length); - _old_values = arm_compute::support::cpp14::make_unique<CLOldValueArray>(old_values_list_length); - _old_values->resize(old_values_list_length); - _new_points->resize(list_length); - - for(size_t i = 0; i < _num_levels; ++i) - { - // Get images from the ith level of old and right pyramid - ICLImage *old_ith_input = old_pyramid->get_pyramid_level(i); - ICLImage *new_ith_input = new_pyramid->get_pyramid_level(i); - - // Get width and height of images - const unsigned int width_ith = old_ith_input->info()->dimension(0); - const unsigned int height_ith = new_ith_input->info()->dimension(1); - - // Initialize Scharr tensors - TensorInfo tensor_info(TensorShape(width_ith, height_ith), 1, DataType::S16); - _scharr_gx[i].allocator()->init(tensor_info); - _scharr_gy[i].allocator()->init(tensor_info); - - // Manage intermediate buffers - _memory_group.manage(&_scharr_gx[i]); - _memory_group.manage(&_scharr_gy[i]); - - // Init Scharr kernel - _func_scharr[i].configure(compile_context, old_ith_input, &_scharr_gx[i], &_scharr_gy[i], border_mode, constant_border_value); - - // Init Lucas-Kanade init kernel - _tracker_init_kernel[i].configure(compile_context, old_points, new_points_estimates, _old_points_internal.get(), _new_points_internal.get(), use_initial_estimate, i, _num_levels, pyr_scale); - - // Init Lucas-Kanade stage0 kernel - _tracker_stage0_kernel[i].configure(compile_context, old_ith_input, &_scharr_gx[i], &_scharr_gy[i], - _old_points_internal.get(), _new_points_internal.get(), _coefficient_table.get(), _old_values.get(), - window_dimension, i); - - // Init Lucas-Kanade stage1 kernel - _tracker_stage1_kernel[i].configure(compile_context, new_ith_input, _new_points_internal.get(), _coefficient_table.get(), _old_values.get(), - termination, epsilon, num_iterations, window_dimension, i); - - // Allocate intermediate buffers - _scharr_gx[i].allocator()->allocate(); - _scharr_gy[i].allocator()->allocate(); - } - - // Finalize 
Lucas-Kanade - _tracker_finalize_kernel.configure(compile_context, _new_points_internal.get(), new_points); -} - -void CLOpticalFlow::run() -{ - ARM_COMPUTE_ERROR_ON_MSG(_num_levels == 0, "Unconfigured function"); - - MemoryGroupResourceScope scope_mg(_memory_group); - - for(unsigned int level = _num_levels; level > 0; --level) - { - // Run Scharr kernel - _func_scharr[level - 1].run(); - - // Run Lucas-Kanade init kernel - CLScheduler::get().enqueue(_tracker_init_kernel[level - 1]); - - // Run Lucas-Kanade stage0 kernel - CLScheduler::get().enqueue(_tracker_stage0_kernel[level - 1]); - - // Run Lucas-Kanade stage1 kernel - CLScheduler::get().enqueue(_tracker_stage1_kernel[level - 1]); - } - - CLScheduler::get().enqueue(_tracker_finalize_kernel, true); -} diff --git a/src/runtime/CL/functions/CLPReluLayer.cpp b/src/runtime/CL/functions/CLPReluLayer.cpp index 6543ab922e..ce6d285ebe 100644 --- a/src/runtime/CL/functions/CLPReluLayer.cpp +++ b/src/runtime/CL/functions/CLPReluLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 ARM Limited. + * Copyright (c) 2019-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,45 +21,63 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/CL/kernels/CLElementwiseOperationKernel.h" +#include "arm_compute/runtime/CL/functions/CLPReluLayer.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/runtime/CL/functions/CLPReluLayer.h" -#include "support/MemorySupport.h" + +#include "src/gpu/cl/IClKernel.h" +#include "src/gpu/cl/operators/ClPRelu.h" namespace arm_compute { -namespace -{ -void configure_border_handler(const CLCompileContext &compile_context, CLFillBorderKernel &border_handler, BorderSize border_size, ICLTensor *input1, ICLTensor *input2, const ICLTensor *output) +using OperatorType = opencl::ClPRelu; + +struct CLPReluLayer::Impl { - if(output->info()->dimension(0) > 1) - { - ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? 
input1 : input2; + const ICLTensor *src_0{nullptr}; + const ICLTensor *src_1{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<OperatorType> op{nullptr}; +}; - if(broadcasted_info->info()->dimension(0) == 1) - { - border_handler.configure(compile_context, broadcasted_info, border_size, BorderMode::REPLICATE); - } - } +CLPReluLayer::CLPReluLayer() : _impl(std::make_unique<Impl>()) +{ } -} // namespace +CLPReluLayer::CLPReluLayer(CLPReluLayer &&) = default; +CLPReluLayer &CLPReluLayer::operator=(CLPReluLayer &&) = default; +CLPReluLayer::~CLPReluLayer() = default; void CLPReluLayer::configure(ICLTensor *input, ICLTensor *alpha, ICLTensor *output) { configure(CLKernelLibrary::get().get_compile_context(), input, alpha, output); } -void CLPReluLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *alpha, ICLTensor *output) +void CLPReluLayer::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *alpha, + ICLTensor *output) { - auto k = arm_compute::support::cpp14::make_unique<CLArithmeticOperationKernel>(); - k->configure(compile_context, ArithmeticOperation::PRELU, input, alpha, output); - _kernel = std::move(k); - configure_border_handler(compile_context, _border_handler, _kernel->border_size(), input, alpha, output); + _impl->src_0 = input; + _impl->src_1 = alpha; + _impl->dst = output; + _impl->op = std::make_unique<OperatorType>(); + _impl->op->configure(compile_context, input->info(), alpha->info(), + (output == nullptr ? input->info() : output->info())); } Status CLPReluLayer::validate(const ITensorInfo *input, const ITensorInfo *alpha, const ITensorInfo *output) { - return CLArithmeticOperationKernel::validate(ArithmeticOperation::PRELU, input, alpha, output); + return OperatorType::validate(input, alpha, output); +} + +void CLPReluLayer::run() +{ + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC_0, _impl->src_0); + pack.add_tensor(TensorType::ACL_SRC_1, _impl->src_1); + pack.add_tensor(TensorType::ACL_DST, _impl->dst); + + _impl->op->run(pack); } } // namespace arm_compute diff --git a/src/runtime/CL/functions/CLPadLayer.cpp b/src/runtime/CL/functions/CLPadLayer.cpp index 078bdbc51f..e788ded512 100644 --- a/src/runtime/CL/functions/CLPadLayer.cpp +++ b/src/runtime/CL/functions/CLPadLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 ARM Limited. + * Copyright (c) 2019-2021 Arm Limited. 
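
CLPReluLayer above is the first of many functions in this section rewritten around a pImpl struct plus a stateless opencl::Cl* operator: configure() captures the tensors and configures the operator from their ITensorInfo metadata, and run() rebuilds an ITensorPack on every call. A self-contained sketch of that split, with simplified stand-ins (Slot, Pack, SimpleOp) in place of TensorType, ITensorPack and ClPRelu; this is the pattern, not the real API:

#include <cassert>
#include <cstddef>
#include <map>
#include <memory>

enum class Slot
{
    Src0,
    Src1,
    Dst
};
using Pack = std::map<Slot, float *>; // stands in for ITensorPack

// Stateless operator: configured once from metadata, run against a pack.
class SimpleOp
{
public:
    void configure(std::size_t n)
    {
        _n = n;
    }
    void run(Pack &pack)
    {
        const float *a = pack.at(Slot::Src0);
        const float *w = pack.at(Slot::Src1);
        float       *d = pack.at(Slot::Dst);
        for (std::size_t i = 0; i < _n; ++i)
        {
            d[i] = a[i] > 0 ? a[i] : a[i] * w[i]; // PReLU-style rule
        }
    }

private:
    std::size_t _n{0};
};

// Function: owns tensor identity in an Impl, re-packs the tensors on run().
class SimpleFunction
{
public:
    void configure(float *a, float *w, float *d, std::size_t n)
    {
        _impl->src_0 = a;
        _impl->src_1 = w;
        _impl->dst   = d;
        _impl->op    = std::make_unique<SimpleOp>();
        _impl->op->configure(n);
    }
    void run()
    {
        Pack pack{{Slot::Src0, _impl->src_0}, {Slot::Src1, _impl->src_1}, {Slot::Dst, _impl->dst}};
        _impl->op->run(pack);
    }

private:
    struct Impl
    {
        float                    *src_0{nullptr};
        float                    *src_1{nullptr};
        float                    *dst{nullptr};
        std::unique_ptr<SimpleOp> op{nullptr};
    };
    std::unique_ptr<Impl> _impl{std::make_unique<Impl>()};
};

int main()
{
    float a[3] = {-1.f, 2.f, -3.f};
    float w[3] = {0.5f, 0.5f, 0.5f};
    float out[3];
    SimpleFunction prelu;
    prelu.configure(a, w, out, 3);
    prelu.run();
    assert(out[0] == -0.5f && out[1] == 2.f && out[2] == -1.5f);
    return 0;
}

Keeping the operator stateless and re-packing on every run() is what lets the same configured operator be shared across workloads; the function layer only preserves the tensor bindings.
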
* * SPDX-License-Identifier: MIT * @@ -23,65 +23,74 @@ */ #include "arm_compute/runtime/CL/functions/CLPadLayer.h" +#include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLPadLayerKernel.h" + namespace arm_compute { -CLPadLayer::CLPadLayer() - : _pad_kernel(), _copy_kernel(), _perform_pad(false) +CLPadLayer::CLPadLayer() : _pad_kernel(std::make_unique<CLPadLayerKernel>()), _copy(), _perform_pad(false) { } -void CLPadLayer::configure(ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode) +CLPadLayer::~CLPadLayer() = default; + +void CLPadLayer::configure( + ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode) { configure(CLKernelLibrary::get().get_compile_context(), input, output, padding, constant_value, mode); } -void CLPadLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode) +void CLPadLayer::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output, + const PaddingList &padding, + PixelValue constant_value, + PaddingMode mode) { ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), padding, constant_value, mode)); + ARM_COMPUTE_LOG_PARAMS(input, output, padding, constant_value, mode); - _perform_pad = std::any_of(padding.begin(), padding.end(), [](PaddingInfo info) - { - return info.first > 0 || info.second > 0; - }); + _perform_pad = + std::any_of(padding.begin(), padding.end(), [](PaddingInfo info) { return info.first > 0 || info.second > 0; }); - if(_perform_pad) + if (_perform_pad) { - _pad_kernel.configure(compile_context, input, output, padding, constant_value, mode); + _pad_kernel->configure(compile_context, input, output, padding, constant_value, mode); } else { // Copy the input to the whole output if no padding is applied - _copy_kernel.configure(compile_context, input, output); + _copy.configure(compile_context, input, output); } } -Status CLPadLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode) +Status CLPadLayer::validate(const ITensorInfo *input, + const ITensorInfo *output, + const PaddingList &padding, + PixelValue constant_value, + PaddingMode mode) { - bool perform_pad = std::any_of(padding.begin(), padding.end(), [](PaddingInfo info) - { - return info.first > 0 || info.second > 0; - }); + bool perform_pad = + std::any_of(padding.begin(), padding.end(), [](PaddingInfo info) { return info.first > 0 || info.second > 0; }); - if(perform_pad) + if (perform_pad) { ARM_COMPUTE_RETURN_ON_ERROR(CLPadLayerKernel::validate(input, output, padding, constant_value, mode)); } else { - Window copy_window = Window(); - copy_window.use_tensor_dimensions(output->tensor_shape()); - ARM_COMPUTE_RETURN_ON_ERROR(CLCopyKernel::validate(input, output, PaddingList(), ©_window)); + ARM_COMPUTE_RETURN_ON_ERROR(CLCopy::validate(input, output)); } return Status{}; } void CLPadLayer::run() { - if(_perform_pad) + if (_perform_pad) { - CLScheduler::get().enqueue(_pad_kernel); + CLScheduler::get().enqueue(*_pad_kernel); } else { - CLScheduler::get().enqueue(_copy_kernel); + _copy.run(); } } -} // namespace arm_compute
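
The CLPadLayer rewrite above keeps the same dispatch decision, now routed through a CLCopy function instead of a raw CLCopyKernel: padding that is all zeros degenerates to a plain copy. The test itself is the std::any_of lambda visible in the diff; a standalone version follows, with PaddingList simplified to a vector of (before, after) pairs, a stand-in for the real type in ACL's Types.h:

#include <algorithm>
#include <cstdint>
#include <cassert>
#include <utility>
#include <vector>

using PaddingInfo = std::pair<uint32_t, uint32_t>; // (elements before, elements after)
using PaddingList = std::vector<PaddingInfo>;      // one entry per tensor dimension

bool needs_pad(const PaddingList &padding)
{
    // Any dimension with non-zero before/after padding forces the pad kernel;
    // otherwise the layer degenerates to a copy of input into output.
    return std::any_of(padding.begin(), padding.end(),
                       [](PaddingInfo info) { return info.first > 0 || info.second > 0; });
}

int main()
{
    assert(!needs_pad({{0, 0}, {0, 0}}));
    assert(needs_pad({{0, 1}}));
}
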
\ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLPermute.cpp b/src/runtime/CL/functions/CLPermute.cpp index e6323ce504..7f97eed98a 100644 --- a/src/runtime/CL/functions/CLPermute.cpp +++ b/src/runtime/CL/functions/CLPermute.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 ARM Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,28 +23,60 @@ */ #include "arm_compute/runtime/CL/functions/CLPermute.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLPermuteKernel.h" -#include "arm_compute/core/Error.h" -#include "support/MemorySupport.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" + +#include "src/common/utils/Log.h" +#include "src/core/CL/ICLKernel.h" +#include "src/gpu/cl/operators/ClPermute.h" namespace arm_compute { +struct CLPermute::Impl +{ + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<opencl::ClPermute> op{nullptr}; +}; + +CLPermute::CLPermute() : _impl(std::make_unique<Impl>()) +{ +} + +CLPermute::~CLPermute() = default; + void CLPermute::configure(const ICLTensor *input, ICLTensor *output, const PermutationVector &perm) { configure(CLKernelLibrary::get().get_compile_context(), input, output, perm); } -void CLPermute::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const PermutationVector &perm) +void CLPermute::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const PermutationVector &perm) { - auto k = arm_compute::support::cpp14::make_unique<CLPermuteKernel>(); - k->configure(compile_context, input, output, perm); - _kernel = std::move(k); + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_LOG_PARAMS(input, output, perm); + + _impl->src = input; + _impl->dst = output; + + _impl->op = std::make_unique<opencl::ClPermute>(); + _impl->op->configure(compile_context, _impl->src->info(), _impl->dst->info(), perm); } Status CLPermute::validate(const ITensorInfo *input, const ITensorInfo *output, const PermutationVector &perm) { - ARM_COMPUTE_RETURN_ON_ERROR(CLPermuteKernel::validate(input, output, perm)); - return Status{}; + return opencl::ClPermute::validate(input, output, perm); +} + +void CLPermute::run() +{ + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC, _impl->src); + pack.add_tensor(TensorType::ACL_DST, _impl->dst); + _impl->op->run(pack); } } // namespace arm_compute diff --git a/src/runtime/CL/functions/CLPhase.cpp b/src/runtime/CL/functions/CLPhase.cpp deleted file mode 100644 index b915104f38..0000000000 --- a/src/runtime/CL/functions/CLPhase.cpp +++ /dev/null @@ -1,43 +0,0 @@ -/* - * Copyright (c) 2016-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/CL/functions/CLPhase.h" - -#include "arm_compute/core/CL/kernels/CLMagnitudePhaseKernel.h" -#include "support/MemorySupport.h" - -#include <utility> - -using namespace arm_compute; - -void CLPhase::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, PhaseType phase_type) -{ - configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, phase_type); -} - -void CLPhase::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, PhaseType phase_type) -{ - auto k = arm_compute::support::cpp14::make_unique<CLMagnitudePhaseKernel>(); - k->configure(compile_context, input1, input2, nullptr, output, MagnitudeType::L1NORM, phase_type); - _kernel = std::move(k); -} diff --git a/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp b/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp index 3c1a7de76d..6aa9d9cbb3 100644 --- a/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp +++ b/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020 ARM Limited. + * Copyright (c) 2016-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,67 +24,132 @@ #include "arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLPixelWiseMultiplicationKernel.h" -#include "support/MemorySupport.h" +#include "arm_compute/runtime/CL/CLScheduler.h" + +#include "src/core/CL/ICLKernel.h" +#include "src/gpu/cl/operators/ClMul.h" #include <utility> namespace arm_compute { -void CLPixelWiseMultiplication::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, float scale, - ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info) +struct CLPixelWiseMultiplication::Impl +{ + const ICLTensor *src_0{nullptr}; + const ICLTensor *src_1{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<opencl::ClMul> op{nullptr}; +}; + +CLPixelWiseMultiplication::CLPixelWiseMultiplication() : _impl(std::make_unique<Impl>()) +{ +} +CLPixelWiseMultiplication::CLPixelWiseMultiplication(CLPixelWiseMultiplication &&) = default; +CLPixelWiseMultiplication &CLPixelWiseMultiplication::operator=(CLPixelWiseMultiplication &&) = default; +CLPixelWiseMultiplication::~CLPixelWiseMultiplication() = default; + +void CLPixelWiseMultiplication::configure(ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy, + const ActivationLayerInfo &act_info) +{ + configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, scale, overflow_policy, + rounding_policy, act_info); +} + +void CLPixelWiseMultiplication::configure(const CLCompileContext &compile_context, + ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy, + const ActivationLayerInfo &act_info) { - 
configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, scale, overflow_policy, rounding_policy, act_info); + _impl->src_0 = input1; + _impl->src_1 = input2; + _impl->dst = output; + _impl->op = std::make_unique<opencl::ClMul>(); + _impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), scale, overflow_policy, + rounding_policy, act_info); } -void CLPixelWiseMultiplication::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, float scale, - ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info) +Status CLPixelWiseMultiplication::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy, + const ActivationLayerInfo &act_info) { - auto k = arm_compute::support::cpp14::make_unique<CLPixelWiseMultiplicationKernel>(); - k->configure(compile_context, input1, input2, output, scale, overflow_policy, rounding_policy, act_info); - _kernel = std::move(k); - - if(output->info()->dimension(0) > 1) - { - ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2; - - if(broadcasted_info->info()->dimension(0) == 1) - { - _border_handler.configure(compile_context, broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE); - } - } + return opencl::ClMul::validate(input1, input2, output, scale, overflow_policy, rounding_policy, act_info); } -Status CLPixelWiseMultiplication::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale, - ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info) +void CLPixelWiseMultiplication::run() { - return CLPixelWiseMultiplicationKernel::validate(input1, input2, output, scale, overflow_policy, rounding_policy, act_info); + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC_0, _impl->src_0); + pack.add_tensor(TensorType::ACL_SRC_1, _impl->src_1); + pack.add_tensor(TensorType::ACL_DST, _impl->dst); + + _impl->op->run(pack); } -void CLComplexPixelWiseMultiplication::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info) +struct CLComplexPixelWiseMultiplication::Impl +{ + const ICLTensor *src_0{nullptr}; + const ICLTensor *src_1{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<opencl::ClComplexMul> op{nullptr}; +}; + +CLComplexPixelWiseMultiplication::CLComplexPixelWiseMultiplication() : _impl(std::make_unique<Impl>()) +{ +} +CLComplexPixelWiseMultiplication::CLComplexPixelWiseMultiplication(CLComplexPixelWiseMultiplication &&) = default; +CLComplexPixelWiseMultiplication & +CLComplexPixelWiseMultiplication::operator=(CLComplexPixelWiseMultiplication &&) = default; +CLComplexPixelWiseMultiplication::~CLComplexPixelWiseMultiplication() = default; + +void CLComplexPixelWiseMultiplication::configure(ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output, + const ActivationLayerInfo &act_info) { configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, act_info); } -void CLComplexPixelWiseMultiplication::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info) +void CLComplexPixelWiseMultiplication::configure(const CLCompileContext &compile_context, + ICLTensor *input1, + ICLTensor *input2, + ICLTensor *output, + const 
ActivationLayerInfo &act_info) { - auto k = arm_compute::support::cpp14::make_unique<CLComplexPixelWiseMultiplicationKernel>(); - k->configure(compile_context, input1, input2, output, act_info); - _kernel = std::move(k); - - if(output->info()->dimension(0) > 1) - { - ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2; - - if(broadcasted_info->info()->dimension(0) == 1) - { - _border_handler.configure(compile_context, broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE); - } - } + _impl->src_0 = input1; + _impl->src_1 = input2; + _impl->dst = output; + _impl->op = std::make_unique<opencl::ClComplexMul>(); + _impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), act_info); } -Status CLComplexPixelWiseMultiplication::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info) +Status CLComplexPixelWiseMultiplication::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const ActivationLayerInfo &act_info) { - return CLComplexPixelWiseMultiplicationKernel::validate(input1, input2, output, act_info); + return opencl::ClComplexMul::validate(input1, input2, output, act_info); +} + +void CLComplexPixelWiseMultiplication::run() +{ + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC_0, _impl->src_0); + pack.add_tensor(TensorType::ACL_SRC_1, _impl->src_1); + pack.add_tensor(TensorType::ACL_DST, _impl->dst); + + _impl->op->run(pack); } } // namespace arm_compute diff --git a/src/runtime/CL/functions/CLPooling3dLayer.cpp b/src/runtime/CL/functions/CLPooling3dLayer.cpp new file mode 100644 index 0000000000..ce1092a7cc --- /dev/null +++ b/src/runtime/CL/functions/CLPooling3dLayer.cpp @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/runtime/CL/functions/CLPooling3dLayer.h" + +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" + +#include "src/core/CL/ICLKernel.h" +#include "src/gpu/cl/operators/ClPool3d.h" + +namespace arm_compute +{ +struct CLPooling3dLayer::Impl +{ + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + ICLTensor *indices{nullptr}; + std::unique_ptr<opencl::ClPool3d> op{nullptr}; +}; + +CLPooling3dLayer::CLPooling3dLayer() : _impl(std::make_unique<Impl>()) +{ +} +CLPooling3dLayer::~CLPooling3dLayer() = default; + +void CLPooling3dLayer::configure(const ICLTensor *input, ICLTensor *output, const Pooling3dLayerInfo &pool_info) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, output, pool_info); +} + +void CLPooling3dLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const Pooling3dLayerInfo &pool_info) +{ + _impl->src = input; + _impl->dst = output; + + _impl->op = std::make_unique<opencl::ClPool3d>(); + _impl->op->configure(compile_context, input->info(), output->info(), pool_info); +} + +Status +CLPooling3dLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const Pooling3dLayerInfo &pool_info) +{ + return opencl::ClPool3d::validate(input, output, pool_info); +} + +void CLPooling3dLayer::run() +{ + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC, _impl->src); + pack.add_tensor(TensorType::ACL_DST_0, _impl->dst); + _impl->op->run(pack); +} +} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLPoolingLayer.cpp b/src/runtime/CL/functions/CLPoolingLayer.cpp index e7735b00df..65e53b9be3 100644 --- a/src/runtime/CL/functions/CLPoolingLayer.cpp +++ b/src/runtime/CL/functions/CLPoolingLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 ARM Limited. + * Copyright (c) 2017-2021 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -23,70 +23,64 @@ */ #include "arm_compute/runtime/CL/functions/CLPoolingLayer.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLPoolingLayerKernel.h" -#include "arm_compute/runtime/CL/CLScheduler.h" -#include "support/MemorySupport.h" + +#include "src/core/CL/ICLKernel.h" +#include "src/gpu/cl/operators/ClPool2d.h" namespace arm_compute { -void CLPoolingLayer::configure(ICLTensor *input, ICLTensor *output, const PoolingLayerInfo &pool_info, ICLTensor *indices) +struct CLPoolingLayer::Impl { - configure(CLKernelLibrary::get().get_compile_context(), input, output, pool_info, indices); -} + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + ICLTensor *indices{nullptr}; + std::unique_ptr<opencl::ClPool2d> op{nullptr}; +}; -void CLPoolingLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const PoolingLayerInfo &pool_info, ICLTensor *indices) +CLPoolingLayer::CLPoolingLayer() : _impl(std::make_unique<Impl>()) { - ARM_COMPUTE_ERROR_ON_NULLPTR(input); - // Configure pooling kernel - auto k = arm_compute::support::cpp14::make_unique<CLPoolingLayerKernel>(); - k->set_target(CLScheduler::get().target()); - k->configure(compile_context, input, output, pool_info, indices); - _kernel = std::move(k); - - const DataType data_type = input->info()->data_type(); +} +CLPoolingLayer::~CLPoolingLayer() = default; - // Configure border depending on operation required (quantize border in case of asymmetric data_type) - BorderMode border_mode{}; - PixelValue pixel_value(0.f); - if(is_data_type_quantized_asymmetric(data_type) && !pool_info.exclude_padding) - { - pixel_value = PixelValue(0, data_type, input->info()->quantization_info()); - } +void CLPoolingLayer::configure(ICLTensor *input, + ICLTensor *output, + const PoolingLayerInfo &pool_info, + ICLTensor *indices) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, output, pool_info, indices); +} - // Data layout - const auto data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? input->info()->data_layout() : pool_info.data_layout; +void CLPoolingLayer::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output, + const PoolingLayerInfo &pool_info, + ICLTensor *indices) +{ + _impl->src = input; + _impl->dst = output; + _impl->indices = indices; - switch(data_layout) - { - case DataLayout::NCHW: - border_mode = (PoolingType::MAX == pool_info.pool_type) ? BorderMode::REPLICATE : BorderMode::CONSTANT; - break; - case DataLayout::NHWC: - border_mode = BorderMode::CONSTANT; - if(PoolingType::MAX == pool_info.pool_type) - { - if(is_data_type_quantized(data_type)) - { - std::tie(pixel_value, std::ignore) = get_min_max(data_type); - } - else - { - pixel_value = PixelValue(std::numeric_limits<float>::lowest()); - } - } - break; - default: - ARM_COMPUTE_ERROR("Data layout not supported"); - } - _border_handler.configure(compile_context, input, _kernel->border_size(), border_mode, pixel_value); + _impl->op = std::make_unique<opencl::ClPool2d>(); + _impl->op->configure(compile_context, input->info(), output->info(), pool_info, + (indices) ? 
indices->info() : nullptr); +} - // Tune kernels - CLScheduler::get().tune_kernel_static(*_kernel); +Status CLPoolingLayer::validate(const ITensorInfo *input, + const ITensorInfo *output, + const PoolingLayerInfo &pool_info, + const ITensorInfo *indices) +{ + return opencl::ClPool2d::validate(input, output, pool_info, indices); } -Status CLPoolingLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, const ITensorInfo *indices) +void CLPoolingLayer::run() { - return CLPoolingLayerKernel::validate(input, output, pool_info, indices); + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC, _impl->src); + pack.add_tensor(TensorType::ACL_DST_0, _impl->dst); + pack.add_tensor(TensorType::ACL_DST_1, _impl->indices); + _impl->op->run(pack); } } // namespace arm_compute diff --git a/src/runtime/CL/functions/CLPriorBoxLayer.cpp b/src/runtime/CL/functions/CLPriorBoxLayer.cpp index d01b4c711b..cfd0ec4fbf 100644 --- a/src/runtime/CL/functions/CLPriorBoxLayer.cpp +++ b/src/runtime/CL/functions/CLPriorBoxLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 ARM Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,40 +24,56 @@ #include "arm_compute/runtime/CL/functions/CLPriorBoxLayer.h" -#include "arm_compute/core/CL/kernels/CLPriorBoxLayerKernel.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLPriorBoxLayerKernel.h" + using namespace arm_compute; -CLPriorBoxLayer::CLPriorBoxLayer() - : _min(nullptr), _max(nullptr), _aspect_ratios(nullptr) +CLPriorBoxLayer::CLPriorBoxLayer() : _min(nullptr), _max(nullptr), _aspect_ratios(nullptr) { } -void CLPriorBoxLayer::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const PriorBoxLayerInfo &info) +void CLPriorBoxLayer::configure(const ICLTensor *input1, + const ICLTensor *input2, + ICLTensor *output, + const PriorBoxLayerInfo &info) { configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, info); } -void CLPriorBoxLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const PriorBoxLayerInfo &info) +void CLPriorBoxLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input1, + const ICLTensor *input2, + ICLTensor *output, + const PriorBoxLayerInfo &info) { - _min = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, info.min_sizes().size() * sizeof(float)); - _aspect_ratios = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, info.aspect_ratios().size() * sizeof(float)); - if(!info.max_sizes().empty()) + ARM_COMPUTE_LOG_PARAMS(input1, input2, output, info); + _min = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, + info.min_sizes().size() * sizeof(float)); + _aspect_ratios = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, + info.aspect_ratios().size() * sizeof(float)); + if (!info.max_sizes().empty()) { - _max = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, info.max_sizes().size() * sizeof(float)); + _max = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | 
CL_MEM_READ_WRITE, + info.max_sizes().size() * sizeof(float)); } - auto k = arm_compute::support::cpp14::make_unique<CLPriorBoxLayerKernel>(); + auto k = std::make_unique<CLPriorBoxLayerKernel>(); k->configure(compile_context, input1, input2, output, info, &_min, &_max, &_aspect_ratios); _kernel = std::move(k); } -Status CLPriorBoxLayer::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const PriorBoxLayerInfo &info) +Status CLPriorBoxLayer::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const PriorBoxLayerInfo &info) { return CLPriorBoxLayerKernel::validate(input1, input2, output, info); -}
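
The CLQLSTMLayer diff that follows funnels every per-gate rescale factor through quantization::calculate_quantized_multiplier. As a reference for what that helper produces, here is a hedged sketch of the usual gemmlowp-style decomposition of a positive real scale into a Q0.31 fixed-point multiplier and a power-of-two shift; ACL's exact signature and shift-sign convention may differ:

#include <cassert>
#include <cmath>
#include <cstdint>

// Decompose real_multiplier as quantized_multiplier * 2^(shift - 31),
// with quantized_multiplier in [2^30, 2^31).
void quantize_multiplier(double real_multiplier, int32_t &quantized_multiplier, int &shift)
{
    assert(real_multiplier > 0.0);
    const double q       = std::frexp(real_multiplier, &shift); // real = q * 2^shift, q in [0.5, 1)
    auto         q_fixed = static_cast<int64_t>(std::llround(q * (1LL << 31)));
    if (q_fixed == (1LL << 31)) // q rounded up to exactly 1.0
    {
        q_fixed /= 2;
        ++shift;
    }
    quantized_multiplier = static_cast<int32_t>(q_fixed);
    // Downstream, an int32 accumulator v is then rescaled as roughly:
    //   result = rounding_right_shift(v * quantized_multiplier, 31 - shift)
}

int main()
{
    int32_t m = 0;
    int     s = 0;
    quantize_multiplier(0.00784313771, m, s); // e.g. 2/255, a typical scale product
    const double rebuilt = static_cast<double>(m) * std::ldexp(1.0, s - 31);
    assert(std::abs(rebuilt - 0.00784313771) < 1e-9);
}
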
\ No newline at end of file +} diff --git a/src/runtime/CL/functions/CLQLSTMLayer.cpp b/src/runtime/CL/functions/CLQLSTMLayer.cpp index 524c7b3aae..12f6f89290 100644 --- a/src/runtime/CL/functions/CLQLSTMLayer.cpp +++ b/src/runtime/CL/functions/CLQLSTMLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020 ARM Limited. + * Copyright (c) 2020-2022 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -26,22 +26,36 @@ #include "arm_compute/core/KernelDescriptors.h" #include "arm_compute/core/QuantizationInfo.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/InfoHelpers.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/gpu/cl/kernels/ClGemmLowpReductionKernel.h" + namespace arm_compute { using namespace arm_compute::utils::info_helpers; +using namespace arm_compute::opencl::kernels; namespace { -Status validate_mm(GEMMLowpOutputStageInfo &gemmlowp_info, const ITensorInfo *mm_input, const ITensorInfo *mm_weights, const ITensorInfo *bias, - float gemmlowp_scale, const TensorInfo *mm_res_info, const TensorInfo *outstage_tensor_info) +Status validate_mm(GEMMLowpOutputStageInfo &gemmlowp_info, + const ITensorInfo *mm_input, + const ITensorInfo *mm_weights, + const ITensorInfo *bias, + float gemmlowp_scale, + const TensorInfo *mm_res_info, + const TensorInfo *outstage_tensor_info) { ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyCore::validate(mm_input, mm_weights, nullptr, mm_res_info)); - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(gemmlowp_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift)); - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOutputStage::validate(mm_res_info, bias, outstage_tensor_info, gemmlowp_info)); + ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier( + gemmlowp_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLGEMMLowpOutputStage::validate(mm_res_info, bias, outstage_tensor_info, gemmlowp_info)); return Status{}; } } // namespace @@ -71,28 +85,71 @@ void CLQLSTMLayer::TensorCopyKernel::run() _src->map(q, true); _dst->map(q, true); - Iterator input_iter{ _src, _window }; - Iterator output_iter{ _dst, _window }; + Iterator input_iter{_src, _window}; + Iterator output_iter{_dst, _window}; - execute_window_loop(_window, [&](const Coordinates &) - { - memcpy(output_iter.ptr(), input_iter.ptr(), _row_size); - }, - input_iter, output_iter); + execute_window_loop( + _window, [&](const Coordinates &) { memcpy(output_iter.ptr(), input_iter.ptr(), _row_size); }, input_iter, + output_iter); _src->unmap(q); _dst->unmap(q); } CLQLSTMLayer::CLQLSTMLayer(std::shared_ptr<IMemoryManager> memory_manager) + : _input_to_input_reduction(std::make_unique<ClGemmLowpMatrixAReductionKernel>()), + _recurrent_to_input_reduction(std::make_unique<ClGemmLowpMatrixAReductionKernel>()), + _input_to_forget_reduction(std::make_unique<ClGemmLowpMatrixAReductionKernel>()), + _recurrent_to_forget_reduction(std::make_unique<ClGemmLowpMatrixAReductionKernel>()), + _input_to_cell_reduction(std::make_unique<ClGemmLowpMatrixAReductionKernel>()), + 
_recurrent_to_cell_reduction(std::make_unique<ClGemmLowpMatrixAReductionKernel>()), + _input_to_output_reduction(std::make_unique<ClGemmLowpMatrixAReductionKernel>()), + _recurrent_to_output_reduction(std::make_unique<ClGemmLowpMatrixAReductionKernel>()), + _projection_reduction(std::make_unique<ClGemmLowpMatrixAReductionKernel>()), + _layer_norms(), + _copy_output() { + for (auto &norm : _layer_norms) + { + norm = std::make_unique<CLQLSTMLayerNormalizationKernel>(); + } + _memory_group = MemoryGroup(std::move(memory_manager)); } -void CLQLSTMLayer::configure_mm(const CLCompileContext &compile_context, CLGEMMLowpMatrixMultiplyCore &mm, CLGEMMLowpOutputStage &outstage, GEMMLowpOutputStageInfo &gemmlowp_info, - const ICLTensor *mm_input, const ICLTensor *mm_weights, const ICLTensor *bias, - CLTensor *mm_res, CLTensor *outstage_res, float gemmlowp_scale, - const TensorInfo &mm_res_info, const TensorInfo &outstage_tensor_info) +CLQLSTMLayer::~CLQLSTMLayer() = default; + +void CLQLSTMLayer::configure_layer_norm(LayerNormGate g, const ICLTensor *in) +{ + ARM_COMPUTE_ERROR_ON(!_has_layer_norm); + + CLTensor *out = &get_layer_norm_output(g); + _memory_group.manage(out); + out->allocator()->init(*(in->info())); + + get_layer_norm(g).configure(in, out, get_layer_norm_weight(g), get_layer_norm_bias(g)); +} + +Status CLQLSTMLayer::validate_layer_norm(const ITensorInfo &in, const ITensorInfo &weight, const ITensorInfo &bias) +{ + // Output quantization scale will be different, but ignored here + // since it will be configured at configure() stage. + const TensorInfo out{in}; + return CLQLSTMLayerNormalizationKernel::validate(&in, &out, &weight, &bias); +} + +void CLQLSTMLayer::configure_mm(const CLCompileContext &compile_context, + CLGEMMLowpMatrixMultiplyCore &mm, + CLGEMMLowpOutputStage &outstage, + GEMMLowpOutputStageInfo &gemmlowp_info, + const ICLTensor *mm_input, + const ICLTensor *mm_weights, + const ICLTensor *bias, + CLTensor *mm_res, + CLTensor *outstage_res, + float gemmlowp_scale, + const TensorInfo &mm_res_info, + const TensorInfo &outstage_tensor_info) { _memory_group.manage(mm_res); _memory_group.manage(outstage_res); @@ -104,30 +161,51 @@ void CLQLSTMLayer::configure_mm(const CLCompileContext &compile_context, CLGEMML mm.configure(compile_context, mm_input, mm_weights, nullptr, mm_res); // Configure output stage - quantization::calculate_quantized_multiplier(gemmlowp_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift); + quantization::calculate_quantized_multiplier(gemmlowp_scale, &gemmlowp_info.gemmlowp_multiplier, + &gemmlowp_info.gemmlowp_shift); outstage.configure(compile_context, mm_res, bias, outstage_res, gemmlowp_info); mm_res->allocator()->allocate(); } -void CLQLSTMLayer::configure(const ICLTensor *input, - const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights, - const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights, - const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias, - const ICLTensor *cell_state_in, const ICLTensor *output_state_in, - ICLTensor *cell_state_out, ICLTensor *output_state_out, ICLTensor *output, +void CLQLSTMLayer::configure(const ICLTensor *input, + const ICLTensor *input_to_forget_weights, + const ICLTensor *input_to_cell_weights, + const ICLTensor *input_to_output_weights, + const ICLTensor *recurrent_to_forget_weights, + const ICLTensor 
*recurrent_to_cell_weights, + const ICLTensor *recurrent_to_output_weights, + const ICLTensor *forget_gate_bias, + const ICLTensor *cell_bias, + const ICLTensor *output_gate_bias, + ICLTensor *cell_state_in, + ICLTensor *output_state_in, + ICLTensor *cell_state_out, + ICLTensor *output_state_out, + ICLTensor *output, const LSTMParams<ICLTensor> &lstm_params) { - configure(CLKernelLibrary::get().get_compile_context(), input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, - recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, - cell_state_in, output_state_in, cell_state_out, output_state_out, output, lstm_params); + configure(CLKernelLibrary::get().get_compile_context(), input, input_to_forget_weights, input_to_cell_weights, + input_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, + recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, + output_state_in, cell_state_out, output_state_out, output, lstm_params); } -void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, - const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights, - const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights, - const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias, - const ICLTensor *cell_state_in, const ICLTensor *output_state_in, - ICLTensor *cell_state_out, ICLTensor *output_state_out, ICLTensor *output, +void CLQLSTMLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *input_to_forget_weights, + const ICLTensor *input_to_cell_weights, + const ICLTensor *input_to_output_weights, + const ICLTensor *recurrent_to_forget_weights, + const ICLTensor *recurrent_to_cell_weights, + const ICLTensor *recurrent_to_output_weights, + const ICLTensor *forget_gate_bias, + const ICLTensor *cell_bias, + const ICLTensor *output_gate_bias, + ICLTensor *cell_state_in, + ICLTensor *output_state_in, + ICLTensor *cell_state_out, + ICLTensor *output_state_out, + ICLTensor *output, const LSTMParams<ICLTensor> &lstm_params) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, @@ -135,16 +213,20 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out, output_state_out, output); + ARM_COMPUTE_LOG_PARAMS(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, + recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, + forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, + cell_state_out, output_state_out, output, lstm_params); // Set lstm parameters LSTMParams<ITensorInfo> lstm_params_info{}; build_lstm_params_tensor_info(lstm_params, &lstm_params_info); // Validate - ARM_COMPUTE_ERROR_THROW_ON(CLQLSTMLayer::validate(input->info(), input_to_forget_weights->info(), input_to_cell_weights->info(), input_to_output_weights->info(), - recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(), - forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), - cell_state_in->info(), 
output_state_in->info(), cell_state_out->info(), output_state_out->info(), output->info(), - lstm_params_info)); + ARM_COMPUTE_ERROR_THROW_ON(CLQLSTMLayer::validate( + input->info(), input_to_forget_weights->info(), input_to_cell_weights->info(), input_to_output_weights->info(), + recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(), + forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), cell_state_in->info(), + output_state_in->info(), cell_state_out->info(), output_state_out->info(), output->info(), lstm_params_info)); const int batch_size = input->info()->dimension(1); const int num_units = input_to_output_weights->info()->dimension(1); @@ -165,7 +247,7 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT // Layer normalization _has_layer_norm = lstm_params.use_layer_norm(); - if(_has_layer_norm) + if (_has_layer_norm) { set_layer_norm_weight(lstm_params.forget_layer_norm_weights(), LayerNormGate::Forget); set_layer_norm_weight(lstm_params.cell_layer_norm_weights(), LayerNormGate::Cell); @@ -187,45 +269,75 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT // Calculate quantized parameters for clipping. int16_t quantized_cell_clip = 0; - if(lstm_params.cell_clip() > 0.0f) + if (lstm_params.cell_clip() > 0.0f) { quantized_cell_clip = quantize_qsymm16(lstm_params.cell_clip(), qcell_state_in); } _has_cell_clipping = quantized_cell_clip > 0; // Precompute effective bias for optimizing the matmul computations. - if(!_has_cifg) + if (!_has_cifg) { _input_to_input_weights = lstm_params.input_to_input_weights(); _recurrent_to_input_weights = lstm_params.recurrent_to_input_weights(); - _input_to_input_reduction.configure(compile_context, _input_to_input_weights, &_input_to_input_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); - _recurrent_to_input_reduction.configure(compile_context, _recurrent_to_input_weights, &_recurrent_to_input_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); + _input_to_input_reduction->configure(compile_context, _input_to_input_weights->info(), + _input_to_input_eff_bias.info(), + GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); + _recurrent_to_input_reduction->configure( + compile_context, _recurrent_to_input_weights->info(), _recurrent_to_input_eff_bias.info(), + GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); } - _input_to_forget_reduction.configure(compile_context, input_to_forget_weights, &_input_to_forget_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); - _recurrent_to_forget_reduction.configure(compile_context, recurrent_to_forget_weights, &_recurrent_to_forget_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); - _input_to_cell_reduction.configure(compile_context, input_to_cell_weights, &_input_to_cell_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); - _recurrent_to_cell_reduction.configure(compile_context, recurrent_to_cell_weights, &_recurrent_to_cell_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); - _input_to_output_reduction.configure(compile_context, input_to_output_weights, &_input_to_output_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); - _recurrent_to_output_reduction.configure(compile_context, recurrent_to_output_weights, 
&_recurrent_to_output_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); - if(_has_projection) + _input_to_forget_reduction->configure(compile_context, input_to_forget_weights->info(), + _input_to_forget_eff_bias.info(), + GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); + _recurrent_to_forget_reduction->configure( + compile_context, recurrent_to_forget_weights->info(), _recurrent_to_forget_eff_bias.info(), + GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); + _input_to_cell_reduction->configure(compile_context, input_to_cell_weights->info(), _input_to_cell_eff_bias.info(), + GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); + _recurrent_to_cell_reduction->configure( + compile_context, recurrent_to_cell_weights->info(), _recurrent_to_cell_eff_bias.info(), + GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); + _input_to_output_reduction->configure(compile_context, input_to_output_weights->info(), + _input_to_output_eff_bias.info(), + GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); + _recurrent_to_output_reduction->configure( + compile_context, recurrent_to_output_weights->info(), _recurrent_to_output_eff_bias.info(), + GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); + if (_has_projection) { - _projection_reduction.configure(compile_context, _projection_weights, &_projection_eff_bias, GEMMLowpReductionKernelInfo(output_size, false, lstm_params.hidden_state_zero(), true)); + _projection_reduction->configure( + compile_context, _projection_weights->info(), _projection_eff_bias.info(), + GEMMLowpReductionKernelInfo(output_size, false, lstm_params.hidden_state_zero(), true)); + if (_projection_bias != nullptr) + { + _projection_bias_add.configure(compile_context, _projection_bias, &_projection_eff_bias, + &_projection_eff_bias, ConvertPolicy::SATURATE); + } } // Pre-transpose weights to be used in GEMM. 
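
The block of *_reduction->configure(...) calls above (ending just before the weight transposes) precomputes "effective biases": with an asymmetric quantized input x_q and zero point z, the product (x_q - z) * W expands to x_q * W - z * rowsum(W), so the second term is a constant per output unit that can be folded into the bias once, which is what the -qinput.offset and -qoutput_state_in.offset scale factors in the GEMMLowpReductionKernelInfo arguments express. A plain-C++ sketch of that fold, illustrative only and not the reduction kernel itself:

#include <cassert>
#include <cstdint>
#include <vector>

// weights is row-major [units][depth]; returns one folded bias per unit.
std::vector<int32_t>
effective_bias(const std::vector<int8_t> &weights, int units, int depth, int32_t input_zero_point)
{
    std::vector<int32_t> bias(units, 0);
    for (int u = 0; u < units; ++u)
    {
        int32_t row_sum = 0;
        for (int d = 0; d < depth; ++d)
        {
            row_sum += weights[u * depth + d];
        }
        bias[u] = -input_zero_point * row_sum; // matches the -offset scale passed above
    }
    return bias;
}

int main()
{
    const std::vector<int8_t> w{1, 2, 3, -1, -2, -3}; // 2 units x 3 depth
    const auto                bias = effective_bias(w, 2, 3, /*input_zero_point=*/10);
    assert(bias[0] == -60 && bias[1] == 60);
}

Precomputing this once at configure time is what makes the per-run GEMMs pure x_q * W products; the projection path additionally accumulates the real projection bias into the same tensor, as the _projection_bias_add call above shows.
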
- _transpose_input_to_forget_weights.configure(compile_context, input_to_forget_weights, &_input_to_forget_weights_transposed); - _transpose_input_to_cell_weights.configure(compile_context, input_to_cell_weights, &_input_to_cell_weights_transposed); - _transpose_input_to_output_weights.configure(compile_context, input_to_output_weights, &_input_to_output_weights_transposed); - _transpose_recurrent_to_forget_weights.configure(compile_context, recurrent_to_forget_weights, &_recurrent_to_forget_weights_transposed); - _transpose_recurrent_to_cell_weights.configure(compile_context, recurrent_to_cell_weights, &_recurrent_to_cell_weights_transposed); - _transpose_recurrent_to_output_weights.configure(compile_context, recurrent_to_output_weights, &_recurrent_to_output_weights_transposed); - if(!_has_cifg) + _transpose_input_to_forget_weights.configure(compile_context, input_to_forget_weights, + &_input_to_forget_weights_transposed); + _transpose_input_to_cell_weights.configure(compile_context, input_to_cell_weights, + &_input_to_cell_weights_transposed); + _transpose_input_to_output_weights.configure(compile_context, input_to_output_weights, + &_input_to_output_weights_transposed); + _transpose_recurrent_to_forget_weights.configure(compile_context, recurrent_to_forget_weights, + &_recurrent_to_forget_weights_transposed); + _transpose_recurrent_to_cell_weights.configure(compile_context, recurrent_to_cell_weights, + &_recurrent_to_cell_weights_transposed); + _transpose_recurrent_to_output_weights.configure(compile_context, recurrent_to_output_weights, + &_recurrent_to_output_weights_transposed); + if (!_has_cifg) { - _transpose_input_to_input_weights.configure(compile_context, lstm_params.input_to_input_weights(), &_input_to_input_weights_transposed); - _transpose_recurrent_to_input_weights.configure(compile_context, lstm_params.recurrent_to_input_weights(), &_recurrent_to_input_weights_transposed); + _transpose_input_to_input_weights.configure(compile_context, lstm_params.input_to_input_weights(), + &_input_to_input_weights_transposed); + _transpose_recurrent_to_input_weights.configure(compile_context, lstm_params.recurrent_to_input_weights(), + &_recurrent_to_input_weights_transposed); } - if(_has_projection) + if (_has_projection) { _transpose_projection_weights.configure(compile_context, _projection_weights, &_projection_weights_transposed); } @@ -238,42 +350,55 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT const TensorInfo mm_out_info(TensorShape(num_units, batch_size), 1, DataType::S32); // Forget gate. 
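
Each gate configuration below follows the same requantization arithmetic: the int32 GEMM accumulator carries the scale weight_scale * input_scale, and mapping it onto the gate's QSYMM16 intermediate scale therefore requires the ratio of the two, which is exactly the value handed to calculate_quantized_multiplier. Spelled out as a trivial helper (an illustration, not an ACL API):

float gate_rescale(float weight_scale, float input_scale, float intermediate_scale)
{
    // Scale of the (input * weight) accumulator, re-expressed in the gate's
    // intermediate quantization scale, e.g. input_to_forget_scale below.
    return weight_scale * input_scale / intermediate_scale;
}
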
- const TensorInfo forget_gate_outstage_info(mm_out_info.tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.forget_intermediate_scale(), 0)); - const float input_to_forget_scale = input_to_forget_weights->info()->quantization_info().uniform().scale * qinput.scale / lstm_params.forget_intermediate_scale(); - configure_mm(compile_context, _mm_input_to_forget, _input_to_forget_outstage, gemmlowp_info, - input, &_input_to_forget_weights_transposed, &_input_to_forget_eff_bias, - &_mm_input_to_forget_res, &_input_to_forget_outstage_res, input_to_forget_scale, - mm_out_info, forget_gate_outstage_info); - - const float recurrent_to_forget_scale = recurrent_to_forget_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.forget_intermediate_scale(); + const TensorInfo forget_gate_outstage_info(mm_out_info.tensor_shape(), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.forget_intermediate_scale(), 0)); + const float input_to_forget_scale = input_to_forget_weights->info()->quantization_info().uniform().scale * + qinput.scale / lstm_params.forget_intermediate_scale(); + configure_mm(compile_context, _mm_input_to_forget, _input_to_forget_outstage, gemmlowp_info, input, + &_input_to_forget_weights_transposed, &_input_to_forget_eff_bias, &_mm_input_to_forget_res, + &_input_to_forget_outstage_res, input_to_forget_scale, mm_out_info, forget_gate_outstage_info); + + const float recurrent_to_forget_scale = recurrent_to_forget_weights->info()->quantization_info().uniform().scale * + qoutput_state_in.scale / lstm_params.forget_intermediate_scale(); configure_mm(compile_context, _mm_recurrent_to_forget, _recurrent_to_forget_outstage, gemmlowp_info, output_state_in, &_recurrent_to_forget_weights_transposed, &_recurrent_to_forget_eff_bias, &_mm_recurrent_to_forget_res, &_recurrent_to_forget_outstage_res, recurrent_to_forget_scale, mm_out_info, forget_gate_outstage_info); - _accumulate_input_recurrent_forget.configure(compile_context, ArithmeticOperation::ADD, &_input_to_forget_outstage_res, &_recurrent_to_forget_outstage_res, &_recurrent_to_forget_outstage_res, + _accumulate_input_recurrent_forget.configure(compile_context, &_input_to_forget_outstage_res, + &_recurrent_to_forget_outstage_res, &_recurrent_to_forget_outstage_res, ConvertPolicy::SATURATE); _input_to_forget_outstage_res.allocator()->allocate(); - if(_has_peephole) + if (_has_peephole) { _mul_cell_to_forget_res.allocator()->init(TensorInfo(cell_state_in->info()->tensor_shape(), 1, DataType::S32)); _memory_group.manage(&_mul_cell_to_forget_res); - _pixelwise_mul_cell_to_forget.configure(compile_context, cell_state_in, lstm_params.cell_to_forget_weights(), &_mul_cell_to_forget_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); - _cell_to_forget_outstage_res.allocator()->init(TensorInfo(_mul_cell_to_forget_res.info()->tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.forget_intermediate_scale(), 0))); + _pixelwise_mul_cell_to_forget.configure(compile_context, cell_state_in, lstm_params.cell_to_forget_weights(), + &_mul_cell_to_forget_res, 1.f, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); + _cell_to_forget_outstage_res.allocator()->init( + TensorInfo(_mul_cell_to_forget_res.info()->tensor_shape(), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.forget_intermediate_scale(), 0))); _memory_group.manage(&_cell_to_forget_outstage_res); - const float cell_to_forget_scale = std::pow(2, cell_shift) * 
lstm_params.cell_to_forget_weights()->info()->quantization_info().uniform().scale / lstm_params.forget_intermediate_scale(); - quantization::calculate_quantized_multiplier(cell_to_forget_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift); - _cell_to_forget_outstage.configure(compile_context, &_mul_cell_to_forget_res, nullptr, &_cell_to_forget_outstage_res, gemmlowp_info); + const float cell_to_forget_scale = + std::pow(2, cell_shift) * + lstm_params.cell_to_forget_weights()->info()->quantization_info().uniform().scale / + lstm_params.forget_intermediate_scale(); + quantization::calculate_quantized_multiplier(cell_to_forget_scale, &gemmlowp_info.gemmlowp_multiplier, + &gemmlowp_info.gemmlowp_shift); + _cell_to_forget_outstage.configure(compile_context, &_mul_cell_to_forget_res, nullptr, + &_cell_to_forget_outstage_res, gemmlowp_info); _mul_cell_to_forget_res.allocator()->allocate(); - _accumulate_cell_forget.configure(compile_context, ArithmeticOperation::ADD, &_recurrent_to_forget_outstage_res, &_cell_to_forget_outstage_res, &_recurrent_to_forget_outstage_res, + _accumulate_cell_forget.configure(compile_context, &_recurrent_to_forget_outstage_res, + &_cell_to_forget_outstage_res, &_recurrent_to_forget_outstage_res, ConvertPolicy::SATURATE); _cell_to_forget_outstage_res.allocator()->allocate(); } CLTensor *forget_activation_input = &_recurrent_to_forget_outstage_res; - if(_has_layer_norm) + if (_has_layer_norm) { configure_layer_norm(LayerNormGate::Forget, &_recurrent_to_forget_outstage_res); _recurrent_to_forget_outstage_res.allocator()->allocate(); @@ -286,30 +411,33 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT const TensorInfo forget_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo); _memory_group.manage(&_forget_gate); _forget_gate.allocator()->init(forget_gate_info); - _forget_gate_sigmoid.configure(compile_context, forget_activation_input, &_forget_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _forget_gate_sigmoid.configure(compile_context, forget_activation_input, &_forget_gate, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); forget_activation_input->allocator()->allocate(); // Modulation gate. 
- const TensorInfo cell_outstage_info(mm_out_info.tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.cell_intermediate_scale(), 0)); - const float input_to_cell_scale = input_to_cell_weights->info()->quantization_info().uniform().scale * qinput.scale / lstm_params.cell_intermediate_scale(); - configure_mm(compile_context, _mm_input_to_cell, _input_to_cell_outstage, gemmlowp_info, - input, &_input_to_cell_weights_transposed, &_input_to_cell_eff_bias, - &_mm_input_to_cell_res, &_input_to_cell_outstage_res, input_to_cell_scale, - mm_out_info, cell_outstage_info); - - const float recurrent_to_cell_scale = recurrent_to_cell_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.cell_intermediate_scale(); - configure_mm(compile_context, _mm_recurrent_to_cell, _recurrent_to_cell_outstage, gemmlowp_info, - output_state_in, &_recurrent_to_cell_weights_transposed, &_recurrent_to_cell_eff_bias, - &_mm_recurrent_to_cell_res, &_recurrent_to_cell_outstage_res, recurrent_to_cell_scale, - mm_out_info, cell_outstage_info); - - _accumulate_input_recurrent_modulation.configure(compile_context, ArithmeticOperation::ADD, &_input_to_cell_outstage_res, &_recurrent_to_cell_outstage_res, &_recurrent_to_cell_outstage_res, + const TensorInfo cell_outstage_info(mm_out_info.tensor_shape(), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.cell_intermediate_scale(), 0)); + const float input_to_cell_scale = input_to_cell_weights->info()->quantization_info().uniform().scale * + qinput.scale / lstm_params.cell_intermediate_scale(); + configure_mm(compile_context, _mm_input_to_cell, _input_to_cell_outstage, gemmlowp_info, input, + &_input_to_cell_weights_transposed, &_input_to_cell_eff_bias, &_mm_input_to_cell_res, + &_input_to_cell_outstage_res, input_to_cell_scale, mm_out_info, cell_outstage_info); + + const float recurrent_to_cell_scale = recurrent_to_cell_weights->info()->quantization_info().uniform().scale * + qoutput_state_in.scale / lstm_params.cell_intermediate_scale(); + configure_mm(compile_context, _mm_recurrent_to_cell, _recurrent_to_cell_outstage, gemmlowp_info, output_state_in, + &_recurrent_to_cell_weights_transposed, &_recurrent_to_cell_eff_bias, &_mm_recurrent_to_cell_res, + &_recurrent_to_cell_outstage_res, recurrent_to_cell_scale, mm_out_info, cell_outstage_info); + + _accumulate_input_recurrent_modulation.configure(compile_context, &_input_to_cell_outstage_res, + &_recurrent_to_cell_outstage_res, &_recurrent_to_cell_outstage_res, ConvertPolicy::SATURATE); _input_to_cell_outstage_res.allocator()->allocate(); CLTensor *cell_activation_input = &_recurrent_to_cell_outstage_res; - if(_has_layer_norm) + if (_has_layer_norm) { configure_layer_norm(LayerNormGate::Cell, &_recurrent_to_cell_outstage_res); _recurrent_to_cell_outstage_res.allocator()->allocate(); @@ -319,122 +447,158 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT const TensorInfo cell_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo); _memory_group.manage(&_cell_gate); _cell_gate.allocator()->init(cell_gate_info); - _cell_gate_tanh.configure(compile_context, cell_activation_input, &_cell_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)); + _cell_gate_tanh.configure(compile_context, cell_activation_input, &_cell_gate, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)); cell_activation_input->allocator()->allocate(); // Input gate. 
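The accumulation step repeated for each gate above sums the input-path and recurrent-path outstage results in place with saturating QSYMM16 arithmetic; the second operand doubles as the destination, so no extra tensor is needed. A sketch under the assumption that both tensors are already allocated with identical quantization:

#include "arm_compute/runtime/CL/functions/CLElementwiseOperations.h"

using namespace arm_compute;

// Illustrative: recurrent_path += input_path, saturating, in place,
// mirroring the _accumulate_input_recurrent_* calls in the diff.
void accumulate_gate(const CLCompileContext &ctx, ICLTensor *input_path, ICLTensor *recurrent_path)
{
    CLArithmeticAddition acc;
    acc.configure(ctx, input_path, recurrent_path, recurrent_path, ConvertPolicy::SATURATE);
    acc.run();
}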
const TensorInfo input_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo); _input_gate.allocator()->init(input_gate_info); _memory_group.manage(&_input_gate); - if(_has_cifg) + if (_has_cifg) { _ones.allocator()->init(*_forget_gate.info()); - _input_gate_sub.configure(compile_context, ArithmeticOperation::SUB, &_ones, &_forget_gate, &_input_gate, ConvertPolicy::SATURATE); + _input_gate_sub.configure(compile_context, &_ones, &_forget_gate, &_input_gate, ConvertPolicy::SATURATE); _ones.allocator()->allocate(); } else { - const TensorInfo input_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.input_intermediate_scale(), 0)); - const float input_to_input_scale = _input_to_input_weights->info()->quantization_info().uniform().scale * qinput.scale / lstm_params.input_intermediate_scale(); - configure_mm(compile_context, _mm_input_to_input, _input_to_input_outstage, gemmlowp_info, - input, &_input_to_input_weights_transposed, &_input_to_input_eff_bias, - &_mm_input_to_input_res, &_input_to_input_outstage_res, input_to_input_scale, - mm_out_info, input_outstage_info); - - const float recurrent_to_input_scale = _recurrent_to_input_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.input_intermediate_scale(); + const TensorInfo input_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.input_intermediate_scale(), 0)); + const float input_to_input_scale = _input_to_input_weights->info()->quantization_info().uniform().scale * + qinput.scale / lstm_params.input_intermediate_scale(); + configure_mm(compile_context, _mm_input_to_input, _input_to_input_outstage, gemmlowp_info, input, + &_input_to_input_weights_transposed, &_input_to_input_eff_bias, &_mm_input_to_input_res, + &_input_to_input_outstage_res, input_to_input_scale, mm_out_info, input_outstage_info); + + const float recurrent_to_input_scale = + _recurrent_to_input_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / + lstm_params.input_intermediate_scale(); configure_mm(compile_context, _mm_recurrent_to_input, _recurrent_to_input_outstage, gemmlowp_info, output_state_in, &_recurrent_to_input_weights_transposed, &_recurrent_to_input_eff_bias, &_mm_recurrent_to_input_res, &_recurrent_to_input_outstage_res, recurrent_to_input_scale, mm_out_info, input_outstage_info); - _accumulate_input_recurrent_input.configure(compile_context, ArithmeticOperation::ADD, &_input_to_input_outstage_res, &_recurrent_to_input_outstage_res, &_recurrent_to_input_outstage_res, - ConvertPolicy::SATURATE); + _accumulate_input_recurrent_input.configure(compile_context, &_input_to_input_outstage_res, + &_recurrent_to_input_outstage_res, + &_recurrent_to_input_outstage_res, ConvertPolicy::SATURATE); _input_to_input_outstage_res.allocator()->allocate(); - if(_has_peephole) + if (_has_peephole) { - _mul_cell_to_input_res.allocator()->init(TensorInfo(cell_state_in->info()->tensor_shape(), 1, DataType::S32)); + _mul_cell_to_input_res.allocator()->init( + TensorInfo(cell_state_in->info()->tensor_shape(), 1, DataType::S32)); _memory_group.manage(&_mul_cell_to_input_res); - _pixelwise_mul_cell_to_input.configure(compile_context, cell_state_in, lstm_params.cell_to_input_weights(), &_mul_cell_to_input_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); - const float cell_to_input_scale = std::pow(2, cell_shift) * 
lstm_params.cell_to_input_weights()->info()->quantization_info().uniform().scale / lstm_params.input_intermediate_scale(); - quantization::calculate_quantized_multiplier(cell_to_input_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift); - _cell_to_input_outstage_res.allocator()->init(TensorInfo(_mul_cell_to_input_res.info()->tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.input_intermediate_scale(), 0))); + _pixelwise_mul_cell_to_input.configure(compile_context, cell_state_in, lstm_params.cell_to_input_weights(), + &_mul_cell_to_input_res, 1.f, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); + const float cell_to_input_scale = + std::pow(2, cell_shift) * + lstm_params.cell_to_input_weights()->info()->quantization_info().uniform().scale / + lstm_params.input_intermediate_scale(); + quantization::calculate_quantized_multiplier(cell_to_input_scale, &gemmlowp_info.gemmlowp_multiplier, + &gemmlowp_info.gemmlowp_shift); + _cell_to_input_outstage_res.allocator()->init( + TensorInfo(_mul_cell_to_input_res.info()->tensor_shape(), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.input_intermediate_scale(), 0))); _memory_group.manage(&_cell_to_input_outstage_res); - _cell_to_input_outstage.configure(compile_context, &_mul_cell_to_input_res, nullptr, &_cell_to_input_outstage_res, gemmlowp_info); + _cell_to_input_outstage.configure(compile_context, &_mul_cell_to_input_res, nullptr, + &_cell_to_input_outstage_res, gemmlowp_info); _mul_cell_to_input_res.allocator()->allocate(); - _accumulate_cell_input.configure(ArithmeticOperation::ADD, &_recurrent_to_input_outstage_res, &_cell_to_input_outstage_res, &_recurrent_to_input_outstage_res, ConvertPolicy::SATURATE); + _accumulate_cell_input.configure(&_recurrent_to_input_outstage_res, &_cell_to_input_outstage_res, + &_recurrent_to_input_outstage_res, ConvertPolicy::SATURATE); _cell_to_input_outstage_res.allocator()->allocate(); } CLTensor *input_activation_input = &_recurrent_to_input_outstage_res; - if(_has_layer_norm) + if (_has_layer_norm) { configure_layer_norm(LayerNormGate::Input, &_recurrent_to_input_outstage_res); _recurrent_to_input_outstage_res.allocator()->allocate(); input_activation_input = &get_layer_norm_output(LayerNormGate::Input); } - _input_gate_sigmoid.configure(compile_context, input_activation_input, &_input_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _input_gate_sigmoid.configure(compile_context, input_activation_input, &_input_gate, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); input_activation_input->allocator()->allocate(); } // Cell. 
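In the CIFG branch above, the input gate is never computed from its own weights; it is coupled to the forget gate as i_t = 1 - f_t. In the QSYMM16 gate format (scale 1/32768, per sigmoid_tanh_outqinfo) the value 1.0 saturates to 32767, so the subtraction from the _ones tensor stays entirely in the quantized domain. A sketch mirroring _input_gate_sub:

#include "arm_compute/runtime/CL/functions/CLElementwiseOperations.h"

using namespace arm_compute;

// Illustrative CIFG coupling: input_gate = ones - forget_gate, saturating.
void couple_cifg_gates(const CLCompileContext &ctx, ICLTensor *ones, ICLTensor *forget_gate, ICLTensor *input_gate)
{
    CLArithmeticSubtraction sub;
    sub.configure(ctx, ones, forget_gate, input_gate, ConvertPolicy::SATURATE);
    sub.run();
}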
- // TODO(COMPMID-3396): Perform multiplication in the quantized domain in CLPixelWiseMultiplicationKernel - _pixelwise_mul_forget_cell.configure(compile_context, &_forget_gate, cell_state_in, &_forget_gate, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + // TODO(COMPMID-3396): Perform multiplication in the quantized domain in CLPixelWiseMultiplication + _pixelwise_mul_forget_cell.configure(compile_context, &_forget_gate, cell_state_in, &_forget_gate, 1.f, + ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); const float cell_gate_scale = _cell_gate.info()->quantization_info().uniform().scale; const float mul_input_cell_scale = cell_gate_scale * std::pow(2, 15 + cell_shift); - const TensorInfo mul_input_cell_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(mul_input_cell_scale, 0)); + const TensorInfo mul_input_cell_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, + QuantizationInfo(mul_input_cell_scale, 0)); _memory_group.manage(&_mul_input_cell_res); _mul_input_cell_res.allocator()->init(mul_input_cell_info); - _pixelwise_mul_input_cell.configure(compile_context, &_input_gate, &_cell_gate, &_mul_input_cell_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _pixelwise_mul_input_cell.configure(compile_context, &_input_gate, &_cell_gate, &_mul_input_cell_res, 1.f, + ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); _cell_gate.allocator()->allocate(); - _add_forget_cell.configure(compile_context, ArithmeticOperation::ADD, &_forget_gate, &_mul_input_cell_res, cell_state_out, ConvertPolicy::SATURATE); + _add_forget_cell.configure(compile_context, &_forget_gate, &_mul_input_cell_res, cell_state_out, + ConvertPolicy::SATURATE); _mul_input_cell_res.allocator()->allocate(); _forget_gate.allocator()->allocate(); - if(_has_cell_clipping) + if (_has_cell_clipping) { - _cell_clip.configure(compile_context, cell_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_cell_clip, quantized_cell_clip)); + _cell_clip.configure(compile_context, cell_state_out, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, + -quantized_cell_clip, quantized_cell_clip)); } // Output gate. 
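The cell block above is the textbook LSTM update c_t = f_t * c_{t-1} + i_t * g_t, realized as two pixel-wise multiplications and one saturating addition, with an optional symmetric clip expressed as a lower/upper bounded ReLU. A sketch of the clip step, keeping the same argument order as the _cell_clip call above:

#include "arm_compute/runtime/CL/functions/CLActivationLayer.h"

using namespace arm_compute;

// Illustrative: clamp the updated cell state in place (nullptr output means
// the activation writes back into its input).
void clip_cell_state(const CLCompileContext &ctx, ICLTensor *cell_state_out, int16_t quantized_cell_clip)
{
    CLActivationLayer clip;
    clip.configure(ctx, cell_state_out, nullptr,
                   ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
                                       -quantized_cell_clip, quantized_cell_clip));
    clip.run();
}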
- const TensorInfo output_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.output_intermediate_scale(), 0)); - const float input_to_output_scale = input_to_output_weights->info()->quantization_info().uniform().scale * qinput.scale / lstm_params.output_intermediate_scale(); - configure_mm(compile_context, _mm_input_to_output, _input_to_output_outstage, gemmlowp_info, - input, &_input_to_output_weights_transposed, &_input_to_output_eff_bias, - &_mm_input_to_output_res, &_input_to_output_outstage_res, input_to_output_scale, - mm_out_info, output_outstage_info); - - const float recurrent_to_output_scale = recurrent_to_output_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.output_intermediate_scale(); + const TensorInfo output_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.output_intermediate_scale(), 0)); + const float input_to_output_scale = input_to_output_weights->info()->quantization_info().uniform().scale * + qinput.scale / lstm_params.output_intermediate_scale(); + configure_mm(compile_context, _mm_input_to_output, _input_to_output_outstage, gemmlowp_info, input, + &_input_to_output_weights_transposed, &_input_to_output_eff_bias, &_mm_input_to_output_res, + &_input_to_output_outstage_res, input_to_output_scale, mm_out_info, output_outstage_info); + + const float recurrent_to_output_scale = recurrent_to_output_weights->info()->quantization_info().uniform().scale * + qoutput_state_in.scale / lstm_params.output_intermediate_scale(); configure_mm(compile_context, _mm_recurrent_to_output, _recurrent_to_output_outstage, gemmlowp_info, output_state_in, &_recurrent_to_output_weights_transposed, &_recurrent_to_output_eff_bias, &_mm_recurrent_to_output_res, &_recurrent_to_output_outstage_res, recurrent_to_output_scale, mm_out_info, output_outstage_info); - _accumulate_input_recurrent_output.configure(compile_context, ArithmeticOperation::ADD, &_recurrent_to_output_outstage_res, &_input_to_output_outstage_res, &_recurrent_to_output_outstage_res, + _accumulate_input_recurrent_output.configure(compile_context, &_recurrent_to_output_outstage_res, + &_input_to_output_outstage_res, &_recurrent_to_output_outstage_res, ConvertPolicy::SATURATE); _input_to_output_outstage_res.allocator()->allocate(); - if(_has_peephole) + if (_has_peephole) { - // TODO(COMPMID-3396): Perform multiplication in the quantized domain in CLPixelWiseMultiplicationKernel + // TODO(COMPMID-3396): Perform multiplication in the quantized domain in CLPixelWiseMultiplication // Here we are not using the output stage because all operations are done in float _mul_cell_to_output_res.allocator()->init(TensorInfo(cell_state_out->info()->tensor_shape(), 1, DataType::S32)); _memory_group.manage(&_mul_cell_to_output_res); - _pixelwise_mul_cell_to_output.configure(compile_context, cell_state_out, lstm_params.cell_to_output_weights(), &_mul_cell_to_output_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); - - const float cell_to_output_scale = std::pow(2, cell_shift) * lstm_params.cell_to_output_weights()->info()->quantization_info().uniform().scale / lstm_params.output_intermediate_scale(); - quantization::calculate_quantized_multiplier(cell_to_output_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift); - _cell_to_output_outstage_res.allocator()->init(TensorInfo(_mul_cell_to_output_res.info()->tensor_shape(), 1, DataType::QSYMM16, 
QuantizationInfo(lstm_params.output_intermediate_scale(), 0))); + _pixelwise_mul_cell_to_output.configure(compile_context, cell_state_out, lstm_params.cell_to_output_weights(), + &_mul_cell_to_output_res, 1.f, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); + + const float cell_to_output_scale = + std::pow(2, cell_shift) * + lstm_params.cell_to_output_weights()->info()->quantization_info().uniform().scale / + lstm_params.output_intermediate_scale(); + quantization::calculate_quantized_multiplier(cell_to_output_scale, &gemmlowp_info.gemmlowp_multiplier, + &gemmlowp_info.gemmlowp_shift); + _cell_to_output_outstage_res.allocator()->init( + TensorInfo(_mul_cell_to_output_res.info()->tensor_shape(), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.output_intermediate_scale(), 0))); _memory_group.manage(&_cell_to_output_outstage_res); - _cell_to_output_outstage.configure(compile_context, &_mul_cell_to_output_res, nullptr, &_cell_to_output_outstage_res, gemmlowp_info); + _cell_to_output_outstage.configure(compile_context, &_mul_cell_to_output_res, nullptr, + &_cell_to_output_outstage_res, gemmlowp_info); _mul_cell_to_output_res.allocator()->allocate(); - _accumulate_cell_to_output.configure(compile_context, ArithmeticOperation::ADD, &_recurrent_to_output_outstage_res, &_cell_to_output_outstage_res, &_recurrent_to_output_outstage_res, + _accumulate_cell_to_output.configure(compile_context, &_recurrent_to_output_outstage_res, + &_cell_to_output_outstage_res, &_recurrent_to_output_outstage_res, ConvertPolicy::SATURATE); _cell_to_output_outstage_res.allocator()->allocate(); } CLTensor *output_activation_input = &_recurrent_to_output_outstage_res; - if(_has_layer_norm) + if (_has_layer_norm) { configure_layer_norm(LayerNormGate::Output, &_recurrent_to_output_outstage_res); _recurrent_to_output_outstage_res.allocator()->allocate(); @@ -444,20 +608,24 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT const TensorInfo output_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo); _memory_group.manage(&_output_gate); _output_gate.allocator()->init(output_gate_info); - _output_gate_sigmoid.configure(compile_context, output_activation_input, &_output_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _output_gate_sigmoid.configure(compile_context, output_activation_input, &_output_gate, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); output_activation_input->allocator()->allocate(); // Hidden. 
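In the peephole paths above, the element-wise product lands in S32 and is then pushed through a GEMMLowp output stage with a null bias pointer, which skips the bias addition and only applies the multiplier/shift requantization into the QSYMM16 intermediate. Sketch:

#include "arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h"

using namespace arm_compute;

// Illustrative: requantize an S32 peephole product into QSYMM16; nullptr
// bias means the stage performs only the fixed-point rescale, as the diff
// does for every cell_to_* outstage.
void requantize_peephole(const CLCompileContext &ctx, const ICLTensor *mul_res_s32,
                         ICLTensor *outstage_res_q16, const GEMMLowpOutputStageInfo &info)
{
    CLGEMMLowpOutputStage outstage;
    outstage.configure(ctx, mul_res_s32, /* bias */ nullptr, outstage_res_q16, info);
    outstage.run();
}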
- _hidden_tanh.configure(compile_context, cell_state_out, &_input_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)); - // TODO(COMPMID-3396): Perform multiplication in the quantized domain in CLPixelWiseMultiplicationKernel + _hidden_tanh.configure(compile_context, cell_state_out, &_input_gate, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)); + // TODO(COMPMID-3396): Perform multiplication in the quantized domain in CLPixelWiseMultiplication _memory_group.manage(&_hidden_mul_res); const TensorInfo hidden_mul_res(_input_gate.info()->tensor_shape(), 1, DataType::S32); _hidden_mul_res.allocator()->init(hidden_mul_res); - _pixelwise_mul_hidden.configure(compile_context, &_output_gate, &_input_gate, &_hidden_mul_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _pixelwise_mul_hidden.configure(compile_context, &_output_gate, &_input_gate, &_hidden_mul_res, 1.f, + ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); _output_gate.allocator()->allocate(); _input_gate.allocator()->allocate(); const float hidden_state_scale = std::pow(2, -15) / lstm_params.hidden_state_scale() * std::pow(2, -15); - quantization::calculate_quantized_multiplier(hidden_state_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift, /* ignore_epsilon */ true); + quantization::calculate_quantized_multiplier(hidden_state_scale, &gemmlowp_info.gemmlowp_multiplier, + &gemmlowp_info.gemmlowp_shift, /* ignore_epsilon */ true); gemmlowp_info.gemmlowp_offset = lstm_params.hidden_state_zero(); gemmlowp_info.output_data_type = output_state_in->info()->data_type(); @@ -466,7 +634,7 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT _memory_group.manage(&_hidden_gate); - if(_projection_tensor_copy_required) + if (_projection_tensor_copy_required) { _hidden_gate.allocator()->init(*output_state_out->info()); _hidden_gate.info()->set_tensor_shape(_hidden_mul_res.info()->tensor_shape()); @@ -477,60 +645,62 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT _hidden_mul_res.allocator()->allocate(); // Projection. 
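The hidden block above computes h = o_t * tanh(c_t). Both operands are QSYMM16 with an implicit 2^-15 scale, so their S32 product carries scale 2^-30, and the requantization factor to the requested hidden-state scale is 2^-30 / hidden_state_scale; that is exactly what the two pow(2, -15) factors express, and ignore_epsilon lets the helper accept the resulting very small multiplier. The relevant lines, repeated with that reading spelled out:

const float hidden_state_scale = std::pow(2, -15) / lstm_params.hidden_state_scale() * std::pow(2, -15);
// i.e. effective_scale = 2^-30 / hidden_state_scale for the S32 product of two Q15 tensors.
quantization::calculate_quantized_multiplier(hidden_state_scale, &gemmlowp_info.gemmlowp_multiplier,
                                             &gemmlowp_info.gemmlowp_shift, /* ignore_epsilon */ true);
gemmlowp_info.gemmlowp_offset = lstm_params.hidden_state_zero();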
- if(_has_projection) + if (_has_projection) { const TensorInfo projection_outstage_info(*output_state_out->info()); - const UniformQuantizationInfo qprojection = _projection_weights->info()->quantization_info().uniform(); - const float projection_scale = qprojection.scale * lstm_params.hidden_state_scale() / qoutput_state_in.scale; - gemmlowp_info.gemmlowp_offset = qoutput_state_in.offset; - gemmlowp_info.gemmlowp_min_bound = std::numeric_limits<int8_t>::lowest(); - gemmlowp_info.gemmlowp_max_bound = std::numeric_limits<int8_t>::max(); - gemmlowp_info.output_data_type = DataType::QASYMM8_SIGNED; - - TensorInfo projection_mm_out_info{ mm_out_info }; + const UniformQuantizationInfo qprojection = _projection_weights->info()->quantization_info().uniform(); + const float projection_scale = qprojection.scale * lstm_params.hidden_state_scale() / qoutput_state_in.scale; + gemmlowp_info.gemmlowp_offset = qoutput_state_in.offset; + gemmlowp_info.gemmlowp_min_bound = std::numeric_limits<int8_t>::lowest(); + gemmlowp_info.gemmlowp_max_bound = std::numeric_limits<int8_t>::max(); + gemmlowp_info.output_data_type = DataType::QASYMM8_SIGNED; + + TensorInfo projection_mm_out_info{mm_out_info}; projection_mm_out_info.set_tensor_shape(TensorShape(output_size, batch_size)); - configure_mm(compile_context, _mm_projection, _projection_outstage, gemmlowp_info, - hidden_gate_result, &_projection_weights_transposed, &_projection_eff_bias, - &_mm_projection_res, &_projection_outstage_res, projection_scale, - projection_mm_out_info, projection_outstage_info); + configure_mm(compile_context, _mm_projection, _projection_outstage, gemmlowp_info, hidden_gate_result, + &_projection_weights_transposed, &_projection_eff_bias, &_mm_projection_res, + &_projection_outstage_res, projection_scale, projection_mm_out_info, projection_outstage_info); ICLTensor *accumulate_destination = output_state_out; - if(_projection_tensor_copy_required) + if (_projection_tensor_copy_required) { _hidden_gate.allocator()->allocate(); - _projection_accumulate_res.allocator()->init(*output_state_out->info()); + _projection_accumulate_res.allocator()->init(*output_state_in->info()); _projection_accumulate_res.info()->set_tensor_shape(_projection_outstage_res.info()->tensor_shape()); - _projection_output_to_accumulate_copy.configure(*output_state_out, _projection_accumulate_res); + _projection_output_to_accumulate_copy.configure(*output_state_in, _projection_accumulate_res); accumulate_destination = &_projection_accumulate_res; } - _accumulate_projection.configure(compile_context, ArithmeticOperation::ADD, &_projection_outstage_res, accumulate_destination, accumulate_destination, ConvertPolicy::SATURATE); + _accumulate_projection.configure(compile_context, &_projection_outstage_res, accumulate_destination, + accumulate_destination, ConvertPolicy::SATURATE); _projection_outstage_res.allocator()->allocate(); - if(_projection_tensor_copy_required) + if (_projection_tensor_copy_required) { _projection_accumulate_to_output_copy.configure(_projection_accumulate_res, *output_state_out); _projection_accumulate_res.allocator()->allocate(); } - int8_t quantized_projection_clip{ 0 }; - if(lstm_params.projection_clip() > 0.0f) + int8_t quantized_projection_clip{0}; + if (lstm_params.projection_clip() > 0.0f) { - quantized_projection_clip = utility::clamp<int8_t>(lstm_params.projection_clip() / qprojection.scale, -128, 127); + quantized_projection_clip = + utility::clamp<int8_t>(lstm_params.projection_clip() / qprojection.scale, -128, 127); } - 
if(quantized_projection_clip > 0) + if (quantized_projection_clip > 0) { - _projection_clip.configure(compile_context, output_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_projection_clip, - quantized_projection_clip)); + _projection_clip.configure(compile_context, output_state_out, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, + -quantized_projection_clip, quantized_projection_clip)); _has_projection_clipping = true; } } else { - if(_projection_tensor_copy_required) + if (_projection_tensor_copy_required) { _hidden_to_output_copy.configure(_hidden_gate, *output_state_out); _hidden_gate.allocator()->allocate(); @@ -541,17 +711,27 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT _copy_output.configure(compile_context, output_state_out, output); } -Status CLQLSTMLayer::validate(const ITensorInfo *input, - const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights, - const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights, - const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias, - const ITensorInfo *cell_state_in, const ITensorInfo *output_state_in, - const ITensorInfo *cell_state_out, const ITensorInfo *output_state_out, const ITensorInfo *output, +Status CLQLSTMLayer::validate(const ITensorInfo *input, + const ITensorInfo *input_to_forget_weights, + const ITensorInfo *input_to_cell_weights, + const ITensorInfo *input_to_output_weights, + const ITensorInfo *recurrent_to_forget_weights, + const ITensorInfo *recurrent_to_cell_weights, + const ITensorInfo *recurrent_to_output_weights, + const ITensorInfo *forget_gate_bias, + const ITensorInfo *cell_bias, + const ITensorInfo *output_gate_bias, + const ITensorInfo *cell_state_in, + const ITensorInfo *output_state_in, + const ITensorInfo *cell_state_out, + const ITensorInfo *output_state_out, + const ITensorInfo *output, const LSTMParams<ITensorInfo> &lstm_params) { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, - recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, - cell_state_out, output_state_out, output); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, + recurrent_to_forget_weights, recurrent_to_cell_weights, + recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, + cell_state_in, output_state_in, cell_state_out, output_state_out, output); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8_SIGNED); ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() != 2, "Input must have exactly 2 dimensions"); @@ -563,13 +743,16 @@ Status CLQLSTMLayer::validate(const ITensorInfo *input, ARM_COMPUTE_RETURN_ERROR_ON(input_to_output_weights->num_dimensions() != 2); ARM_COMPUTE_RETURN_ERROR_ON(input_to_output_weights->dimension(0) != input_size); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input_to_output_weights, input_to_forget_weights, input_to_cell_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input_to_output_weights, input_to_forget_weights, + input_to_cell_weights); 
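The reflowed validate() above takes one ITensorInfo* for every ICLTensor* that configure() takes, so a caller can reject an ill-formed graph before any OpenCL buffers exist. A sketch of the intended call pattern; every tensor name here is a placeholder for objects assumed to be initialised elsewhere:

#include "arm_compute/runtime/CL/functions/CLQLSTMLayer.h"

using namespace arm_compute;

// Illustrative, inside some setup function: validate first, configure only
// on success. All tensors (CLTensor) and lstm_params are placeholders.
const Status s = CLQLSTMLayer::validate(input.info(), in2f_w.info(), in2c_w.info(), in2o_w.info(),
                                        rec2f_w.info(), rec2c_w.info(), rec2o_w.info(),
                                        f_bias.info(), c_bias.info(), o_bias.info(),
                                        cell_in.info(), state_in.info(),
                                        cell_out.info(), state_out.info(), out.info(), lstm_params);
ARM_COMPUTE_ERROR_THROW_ON(s);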
ARM_COMPUTE_RETURN_ERROR_ON(recurrent_to_output_weights->num_dimensions() != 2); ARM_COMPUTE_RETURN_ERROR_ON(recurrent_to_output_weights->dimension(1) != num_units); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(recurrent_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(recurrent_to_output_weights, recurrent_to_forget_weights, + recurrent_to_cell_weights); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_to_forget_weights, 1, DataType::QSYMM8); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_to_forget_weights, input_to_cell_weights, input_to_output_weights, - recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_to_forget_weights, input_to_cell_weights, + input_to_output_weights, recurrent_to_forget_weights, + recurrent_to_cell_weights, recurrent_to_output_weights); ARM_COMPUTE_RETURN_ERROR_ON(forget_gate_bias->num_dimensions() != 1); ARM_COMPUTE_RETURN_ERROR_ON(forget_gate_bias->dimension(0) != num_units); @@ -588,20 +771,25 @@ Status CLQLSTMLayer::validate(const ITensorInfo *input, ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output_state_in); // Check whether peephole weights are all there or none - if(lstm_params.has_peephole_opt()) + if (lstm_params.has_peephole_opt()) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_output_weights()); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_forget_weights(), 1, DataType::QSYMM16); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_forget_weights(), 1, + DataType::QSYMM16); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_forget_weights()->num_dimensions() != 1); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_forget_weights()->dimension(0) != num_units); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_output_weights()); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_output_weights()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lstm_params.cell_to_forget_weights(), + lstm_params.cell_to_output_weights()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(lstm_params.cell_to_forget_weights(), + lstm_params.cell_to_output_weights()); - if(!lstm_params.has_cifg_opt()) + if (!lstm_params.has_cifg_opt()) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_input_weights()); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_input_weights()); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_input_weights()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lstm_params.cell_to_forget_weights(), + lstm_params.cell_to_input_weights()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(lstm_params.cell_to_forget_weights(), + lstm_params.cell_to_input_weights()); } } @@ -615,7 +803,7 @@ Status CLQLSTMLayer::validate(const ITensorInfo *input, // Calculate quantized parameters for clipping. 
int16_t quantized_cell_clip = 0; - if(lstm_params.cell_clip() > 0.0f) + if (lstm_params.cell_clip() > 0.0f) { quantized_cell_clip = quantize_qsymm16(lstm_params.cell_clip(), qcell_state_in); } @@ -623,27 +811,50 @@ Status CLQLSTMLayer::validate(const ITensorInfo *input, // Precompute effective bias for optimizing the matmul computations. const TensorInfo eff_bias_info(TensorShape(num_units), 1, DataType::S32); const TensorInfo projection_eff_bias_info(TensorShape(output_size), 1, DataType::S32); - if(!lstm_params.has_cifg_opt()) + if (!lstm_params.has_cifg_opt()) { - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(lstm_params.input_to_input_weights(), &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true))); - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(lstm_params.recurrent_to_input_weights(), &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, - true))); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate( + lstm_params.input_to_input_weights(), &eff_bias_info, + GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true))); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate( + lstm_params.recurrent_to_input_weights(), &eff_bias_info, + GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true))); } - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(input_to_forget_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true))); - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(recurrent_to_forget_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true))); - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(input_to_cell_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true))); - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(recurrent_to_cell_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true))); - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(input_to_output_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true))); - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(recurrent_to_output_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true))); - if(lstm_params.has_projection()) + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate( + input_to_forget_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true))); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate( + recurrent_to_forget_weights, &eff_bias_info, + GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true))); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate( + input_to_cell_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true))); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate( + recurrent_to_cell_weights, &eff_bias_info, + GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true))); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate( + input_to_output_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, 
true))); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate( + recurrent_to_output_weights, &eff_bias_info, + GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true))); + if (lstm_params.has_projection()) { - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(lstm_params.projection_weights(), &projection_eff_bias_info, GEMMLowpReductionKernelInfo(output_size, false, - lstm_params.hidden_state_zero(), - true))); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate( + lstm_params.projection_weights(), &projection_eff_bias_info, + GEMMLowpReductionKernelInfo(output_size, false, lstm_params.hidden_state_zero(), true))); + if (lstm_params.projection_bias() != nullptr) + { + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.projection_bias(), 1, DataType::S32); + ARM_COMPUTE_RETURN_ON_ERROR( + CLArithmeticAddition::validate(lstm_params.projection_bias(), &projection_eff_bias_info, + &projection_eff_bias_info, ConvertPolicy::SATURATE)); + } } - const TensorInfo input_weights_transposed(TensorShape(num_units, input_size), 1, input_to_forget_weights->data_type(), input_to_forget_weights->quantization_info()); - const TensorInfo recurrent_weights_transposed(TensorShape(num_units, output_size), 1, recurrent_to_forget_weights->data_type(), recurrent_to_forget_weights->quantization_info()); + const TensorInfo input_weights_transposed(TensorShape(num_units, input_size), 1, + input_to_forget_weights->data_type(), + input_to_forget_weights->quantization_info()); + const TensorInfo recurrent_weights_transposed(TensorShape(num_units, output_size), 1, + recurrent_to_forget_weights->data_type(), + recurrent_to_forget_weights->quantization_info()); // Validate weights transpose ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(input_to_forget_weights, &input_weights_transposed)); @@ -652,15 +863,20 @@ Status CLQLSTMLayer::validate(const ITensorInfo *input, ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(recurrent_to_forget_weights, &recurrent_weights_transposed)); ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(recurrent_to_cell_weights, &recurrent_weights_transposed)); ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(recurrent_to_output_weights, &recurrent_weights_transposed)); - if(!lstm_params.has_cifg_opt()) + if (!lstm_params.has_cifg_opt()) { - ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(lstm_params.input_to_input_weights(), &input_weights_transposed)); - ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(lstm_params.recurrent_to_input_weights(), &recurrent_weights_transposed)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLTranspose::validate(lstm_params.input_to_input_weights(), &input_weights_transposed)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLTranspose::validate(lstm_params.recurrent_to_input_weights(), &recurrent_weights_transposed)); } - if(lstm_params.has_projection()) + if (lstm_params.has_projection()) { - const TensorInfo projection_weights_transposed(TensorShape(output_size, num_units), 1, lstm_params.projection_weights()->data_type(), lstm_params.projection_weights()->quantization_info()); - ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(lstm_params.projection_weights(), &projection_weights_transposed)); + const TensorInfo projection_weights_transposed(TensorShape(output_size, num_units), 1, + lstm_params.projection_weights()->data_type(), + lstm_params.projection_weights()->quantization_info()); + ARM_COMPUTE_RETURN_ON_ERROR( + CLTranspose::validate(lstm_params.projection_weights(), 
&projection_weights_transposed)); } GEMMLowpOutputStageInfo gemmlowp_info; @@ -673,28 +889,42 @@ Status CLQLSTMLayer::validate(const ITensorInfo *input, // Forget gate. ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.forget_intermediate_scale() == 0); - const TensorInfo forget_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.forget_intermediate_scale(), 0)); + const TensorInfo forget_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.forget_intermediate_scale(), 0)); const TensorInfo mm_out_info(TensorShape(num_units, batch_size), 1, DataType::S32); - const float input_to_forget_scale = input_to_forget_weights->quantization_info().uniform().scale * qinput.scale / lstm_params.forget_intermediate_scale(); - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, input_to_forget_scale, &mm_out_info, &forget_outstage_info)); + const float input_to_forget_scale = input_to_forget_weights->quantization_info().uniform().scale * qinput.scale / + lstm_params.forget_intermediate_scale(); + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, + input_to_forget_scale, &mm_out_info, &forget_outstage_info)); - const float recurrent_to_forget_scale = recurrent_to_forget_weights->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.forget_intermediate_scale(); - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, &eff_bias_info, recurrent_to_forget_scale, &mm_out_info, &forget_outstage_info)); + const float recurrent_to_forget_scale = recurrent_to_forget_weights->quantization_info().uniform().scale * + qoutput_state_in.scale / lstm_params.forget_intermediate_scale(); + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, + &eff_bias_info, recurrent_to_forget_scale, &mm_out_info, + &forget_outstage_info)); - ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::ADD, &forget_outstage_info, &forget_outstage_info, &forget_outstage_info, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&forget_outstage_info, &forget_outstage_info, + &forget_outstage_info, ConvertPolicy::SATURATE)); - if(lstm_params.has_peephole_opt()) + if (lstm_params.has_peephole_opt()) { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_forget_weights(), 1, DataType::QSYMM16); - ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplicationKernel::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &mm_out_info, 1.f, ConvertPolicy::SATURATE, - RoundingPolicy::TO_ZERO)); - const float cell_to_forget_scale = std::pow(2, cell_shift) * lstm_params.cell_to_forget_weights()->quantization_info().uniform().scale / lstm_params.forget_intermediate_scale(); - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(cell_to_forget_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift)); - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOutputStage::validate(&mm_out_info, nullptr, &forget_outstage_info, gemmlowp_info)); - ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::ADD, &forget_outstage_info, &forget_outstage_info, &forget_outstage_info, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_forget_weights(), 1, + 
DataType::QSYMM16); + ARM_COMPUTE_RETURN_ON_ERROR( + CLPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &mm_out_info, 1.f, + ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + const float cell_to_forget_scale = std::pow(2, cell_shift) * + lstm_params.cell_to_forget_weights()->quantization_info().uniform().scale / + lstm_params.forget_intermediate_scale(); + ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier( + cell_to_forget_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLGEMMLowpOutputStage::validate(&mm_out_info, nullptr, &forget_outstage_info, gemmlowp_info)); + ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&forget_outstage_info, &forget_outstage_info, + &forget_outstage_info, ConvertPolicy::SATURATE)); } - if(has_layer_norm) + if (has_layer_norm) { const ITensorInfo *w_info = lstm_params.forget_layer_norm_weights(); const ITensorInfo *b_info = forget_gate_bias; @@ -705,20 +935,29 @@ Status CLQLSTMLayer::validate(const ITensorInfo *input, const QuantizationInfo sigmoid_tanh_outqinfo(1.f / 32768.f, 0); const TensorInfo forget_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo); - ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&forget_outstage_info, &forget_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); + ARM_COMPUTE_RETURN_ON_ERROR( + CLActivationLayer::validate(&forget_outstage_info, &forget_gate_info, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); // Modulation gate. ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_intermediate_scale() == 0); - const TensorInfo cell_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.cell_intermediate_scale(), 0)); - const float input_to_cell_scale = input_to_cell_weights->quantization_info().uniform().scale * qinput.scale / lstm_params.cell_intermediate_scale(); - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, input_to_cell_scale, &mm_out_info, &cell_outstage_info)); - - const float recurrent_to_cell_scale = recurrent_to_cell_weights->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.cell_intermediate_scale(); - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &input_weights_transposed, &eff_bias_info, recurrent_to_cell_scale, &mm_out_info, &cell_outstage_info)); - - ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::ADD, &cell_outstage_info, &cell_outstage_info, &cell_outstage_info, ConvertPolicy::SATURATE)); - - if(has_layer_norm) + const TensorInfo cell_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.cell_intermediate_scale(), 0)); + const float input_to_cell_scale = input_to_cell_weights->quantization_info().uniform().scale * qinput.scale / + lstm_params.cell_intermediate_scale(); + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, + input_to_cell_scale, &mm_out_info, &cell_outstage_info)); + + const float recurrent_to_cell_scale = recurrent_to_cell_weights->quantization_info().uniform().scale * + qoutput_state_in.scale / lstm_params.cell_intermediate_scale(); + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, + &eff_bias_info, 
recurrent_to_cell_scale, &mm_out_info, + &cell_outstage_info)); + + ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&cell_outstage_info, &cell_outstage_info, + &cell_outstage_info, ConvertPolicy::SATURATE)); + + if (has_layer_norm) { const ITensorInfo *w_info = lstm_params.cell_layer_norm_weights(); const ITensorInfo *b_info = cell_bias; @@ -726,85 +965,123 @@ Status CLQLSTMLayer::validate(const ITensorInfo *input, } const TensorInfo cell_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo); - ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&cell_outstage_info, &cell_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f))); + ARM_COMPUTE_RETURN_ON_ERROR( + CLActivationLayer::validate(&cell_outstage_info, &cell_gate_info, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f))); // Input gate. const TensorInfo input_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo); - if(lstm_params.has_cifg_opt()) + if (lstm_params.has_cifg_opt()) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(lstm_params.input_gate_bias() != nullptr, "Input gate bias must not be present when CIFG is used"); - ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::SUB, &input_gate_info, &forget_gate_info, &forget_gate_info, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(lstm_params.input_gate_bias() != nullptr, + "Input gate bias must not be present when CIFG is used"); + ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticSubtraction::validate(&input_gate_info, &forget_gate_info, + &forget_gate_info, ConvertPolicy::SATURATE)); } else { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.input_to_input_weights(), lstm_params.recurrent_to_input_weights(), lstm_params.input_gate_bias()); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_to_forget_weights, lstm_params.input_to_input_weights(), lstm_params.recurrent_to_input_weights()); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.input_to_input_weights(), + lstm_params.recurrent_to_input_weights(), lstm_params.input_gate_bias()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES( + input_to_forget_weights, lstm_params.input_to_input_weights(), lstm_params.recurrent_to_input_weights()); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input_to_forget_weights, lstm_params.input_to_input_weights()); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(recurrent_to_forget_weights, lstm_params.recurrent_to_input_weights()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(recurrent_to_forget_weights, + lstm_params.recurrent_to_input_weights()); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(forget_gate_bias, lstm_params.input_gate_bias()); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(forget_gate_bias, lstm_params.input_gate_bias()); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_intermediate_scale() == 0); - const TensorInfo input_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.input_intermediate_scale(), 0)); - const float input_to_input_scale = lstm_params.input_to_input_weights()->quantization_info().uniform().scale * qinput.scale / lstm_params.input_intermediate_scale(); - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, input_to_input_scale, &mm_out_info, &input_outstage_info)); - - const float recurrent_to_input_scale = 
lstm_params.recurrent_to_input_weights()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.input_intermediate_scale(); - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, &eff_bias_info, recurrent_to_input_scale, &mm_out_info, &input_outstage_info)); - - ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::ADD, &input_outstage_info, &input_outstage_info, &input_outstage_info, ConvertPolicy::SATURATE)); - - if(lstm_params.has_peephole_opt()) + const TensorInfo input_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.input_intermediate_scale(), 0)); + const float input_to_input_scale = lstm_params.input_to_input_weights()->quantization_info().uniform().scale * + qinput.scale / lstm_params.input_intermediate_scale(); + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, + input_to_input_scale, &mm_out_info, &input_outstage_info)); + + const float recurrent_to_input_scale = + lstm_params.recurrent_to_input_weights()->quantization_info().uniform().scale * qoutput_state_in.scale / + lstm_params.input_intermediate_scale(); + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, + &eff_bias_info, recurrent_to_input_scale, &mm_out_info, + &input_outstage_info)); + + ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&input_outstage_info, &input_outstage_info, + &input_outstage_info, ConvertPolicy::SATURATE)); + + if (lstm_params.has_peephole_opt()) { - ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplicationKernel::validate(cell_state_in, lstm_params.cell_to_input_weights(), &mm_out_info, 1.f, ConvertPolicy::SATURATE, - RoundingPolicy::TO_ZERO)); - const float cell_to_input_scale = std::pow(2, cell_shift) * lstm_params.cell_to_input_weights()->quantization_info().uniform().scale / lstm_params.input_intermediate_scale(); - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(cell_to_input_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift)); - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOutputStage::validate(&mm_out_info, &eff_bias_info, &input_outstage_info, gemmlowp_info)); - ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::ADD, &input_outstage_info, &input_outstage_info, &input_outstage_info, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_input_weights(), &mm_out_info, + 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + const float cell_to_input_scale = std::pow(2, cell_shift) * + lstm_params.cell_to_input_weights()->quantization_info().uniform().scale / + lstm_params.input_intermediate_scale(); + ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier( + cell_to_input_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLGEMMLowpOutputStage::validate(&mm_out_info, &eff_bias_info, &input_outstage_info, gemmlowp_info)); + ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&input_outstage_info, &input_outstage_info, + &input_outstage_info, ConvertPolicy::SATURATE)); } - if(has_layer_norm) + if (has_layer_norm) { const ITensorInfo *w_info = lstm_params.input_layer_norm_weights(); const ITensorInfo *b_info = lstm_params.input_gate_bias(); 
ARM_COMPUTE_RETURN_ON_ERROR(validate_layer_norm(cell_outstage_info, *w_info, *b_info)); } - ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&input_outstage_info, &input_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC, 1.f, 1.f))); + ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate( + &input_outstage_info, &input_gate_info, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC, 1.f, 1.f))); } // Cell. - ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplicationKernel::validate(&forget_gate_info, cell_state_in, &forget_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); - ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplicationKernel::validate(&input_gate_info, cell_state_in, &cell_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); - ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::ADD, &forget_gate_info, &cell_gate_info, cell_state_out, ConvertPolicy::SATURATE)); - if(quantized_cell_clip > 0) + ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate( + &forget_gate_info, cell_state_in, &forget_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate( + &input_gate_info, cell_state_in, &cell_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLArithmeticAddition::validate(&forget_gate_info, &cell_gate_info, cell_state_out, ConvertPolicy::SATURATE)); + if (quantized_cell_clip > 0) { - ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(cell_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_cell_clip, - quantized_cell_clip))); + ARM_COMPUTE_RETURN_ON_ERROR( + CLActivationLayer::validate(cell_state_out, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, + -quantized_cell_clip, quantized_cell_clip))); } // Output gate. 
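Each gate's pre-activation can optionally be layer-normalised, and validate() checks that path per gate through the layer's internal validate_layer_norm() helper, pairing the gate's 1-D normalisation weights with its bias. The forget-gate instance of the pattern, reconstructed from the checks earlier in this hunk (the exact helper call is partly elided by the hunk boundary):

if (has_layer_norm)
{
    const ITensorInfo *w_info = lstm_params.forget_layer_norm_weights();
    const ITensorInfo *b_info = forget_gate_bias;
    ARM_COMPUTE_RETURN_ON_ERROR(validate_layer_norm(forget_outstage_info, *w_info, *b_info));
}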
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.output_intermediate_scale() == 0); - const TensorInfo output_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.output_intermediate_scale(), 0)); - const float input_to_output_scale = input_to_output_weights->quantization_info().uniform().scale * qinput.scale / lstm_params.output_intermediate_scale(); - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, input_to_output_scale, &mm_out_info, &output_outstage_info)); - - const float recurrent_to_output_scale = recurrent_to_output_weights->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.output_intermediate_scale(); - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, &eff_bias_info, recurrent_to_output_scale, &mm_out_info, &output_outstage_info)); - - ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::ADD, &output_outstage_info, &output_outstage_info, &output_outstage_info, ConvertPolicy::SATURATE)); - if(lstm_params.has_peephole_opt()) + const TensorInfo output_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.output_intermediate_scale(), 0)); + const float input_to_output_scale = input_to_output_weights->quantization_info().uniform().scale * qinput.scale / + lstm_params.output_intermediate_scale(); + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, + input_to_output_scale, &mm_out_info, &output_outstage_info)); + + const float recurrent_to_output_scale = recurrent_to_output_weights->quantization_info().uniform().scale * + qoutput_state_in.scale / lstm_params.output_intermediate_scale(); + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, + &eff_bias_info, recurrent_to_output_scale, &mm_out_info, + &output_outstage_info)); + + ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&output_outstage_info, &output_outstage_info, + &output_outstage_info, ConvertPolicy::SATURATE)); + if (lstm_params.has_peephole_opt()) { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_output_weights(), 1, DataType::QSYMM16); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_output_weights(), 1, + DataType::QSYMM16); // TODO(COMPMID-3395): Perform multiplication in the quantized domain in NEPixelWiseMultiplicationKernel // Here we are not using the output stage because all operations are done in float // const float cell_to_output_scale = std::pow(2, cell_shift) * lstm_params.cell_to_output_weights()->quantization_info().uniform().scale / lstm_params.output_intermediate_scale(); // ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(cell_to_output_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift)); - ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplicationKernel::validate(cell_state_out, lstm_params.cell_to_output_weights(), &output_outstage_info, 1.f, ConvertPolicy::SATURATE, - RoundingPolicy::TO_ZERO)); - ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::ADD, &output_outstage_info, &output_outstage_info, &output_outstage_info, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate( + cell_state_out, lstm_params.cell_to_output_weights(), &output_outstage_info, 
1.f, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&output_outstage_info, &output_outstage_info, + &output_outstage_info, ConvertPolicy::SATURATE)); } - if(has_layer_norm) + if (has_layer_norm) { const ITensorInfo *w_info = lstm_params.output_layer_norm_weights(); const ITensorInfo *b_info = output_gate_bias; @@ -812,91 +1089,109 @@ Status CLQLSTMLayer::validate(const ITensorInfo *input, } const TensorInfo output_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo); - ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&output_outstage_info, &output_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); + ARM_COMPUTE_RETURN_ON_ERROR( + CLActivationLayer::validate(&output_outstage_info, &output_gate_info, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); // Hidden. - ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(cell_state_out, &input_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f))); + ARM_COMPUTE_RETURN_ON_ERROR( + CLActivationLayer::validate(cell_state_out, &input_gate_info, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f))); const TensorInfo hidden_mul_res(TensorShape(num_units, batch_size), 1, DataType::S32); const TensorInfo hidden_out_info(TensorShape(num_units, batch_size), 1, DataType::QASYMM8_SIGNED); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.hidden_state_scale() == 0); - ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplicationKernel::validate(&output_gate_info, &input_gate_info, &hidden_mul_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate( + &output_gate_info, &input_gate_info, &hidden_mul_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); const float hidden_state_scale = std::pow(2, -15) / lstm_params.hidden_state_scale() * std::pow(2, -15); - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(hidden_state_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift, /* ignore_epsilon */ true)); - gemmlowp_info.gemmlowp_offset = lstm_params.hidden_state_zero(); - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOutputStage::validate(&hidden_mul_res, nullptr, &hidden_out_info, gemmlowp_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + quantization::calculate_quantized_multiplier(hidden_state_scale, &gemmlowp_info.gemmlowp_multiplier, + &gemmlowp_info.gemmlowp_shift, /* ignore_epsilon */ true)); + gemmlowp_info.gemmlowp_offset = lstm_params.hidden_state_zero(); + gemmlowp_info.output_data_type = hidden_out_info.data_type(); + ARM_COMPUTE_RETURN_ON_ERROR( + CLGEMMLowpOutputStage::validate(&hidden_mul_res, nullptr, &hidden_out_info, gemmlowp_info)); const bool projection_tensor_copy_required = num_units != output_size; // Projection. 
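
Throughout these checks the effective requantization scale follows the standard identity: the S32 GEMMLowp accumulator carries scale w_scale * x_scale, so rescaling it to an intermediate scale requires the multiplier (w_scale * x_scale) / intermediate_scale, which is then split into a fixed-point multiplier and a power-of-two shift. A hedged sketch with made-up numbers:

    // Illustrative values only; the real scales come from the tensors' QuantizationInfo.
    const float w_scale = 0.02f, x_scale = 0.004f, intermediate_scale = 0.0005f;
    const float effective_scale = w_scale * x_scale / intermediate_scale;
    int32_t multiplier = 0;
    int32_t shift      = 0;
    // Decomposes effective_scale into multiplier * 2^-shift for the output stage.
    const Status s = quantization::calculate_quantized_multiplier(effective_scale,
                                                                  &multiplier, &shift);

The projection path below reuses the same decomposition with the projection weights' scale.
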
- if(lstm_params.has_projection()) + if (lstm_params.has_projection()) { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(recurrent_to_forget_weights, lstm_params.projection_weights()); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(forget_gate_bias, lstm_params.projection_bias()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(recurrent_to_forget_weights, + lstm_params.projection_weights()); ARM_COMPUTE_RETURN_ERROR_ON(qoutput_state_in.scale == 0); - const UniformQuantizationInfo qprojection = lstm_params.projection_weights()->quantization_info().uniform(); - const float projection_scale = qprojection.scale * lstm_params.hidden_state_scale() / qoutput_state_in.scale; - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(projection_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift)); + const UniformQuantizationInfo qprojection = lstm_params.projection_weights()->quantization_info().uniform(); + const float projection_scale = qprojection.scale * lstm_params.hidden_state_scale() / qoutput_state_in.scale; + ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier( + projection_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift)); gemmlowp_info.gemmlowp_offset = qoutput_state_in.offset; gemmlowp_info.gemmlowp_min_bound = std::numeric_limits<int8_t>::lowest(); gemmlowp_info.gemmlowp_max_bound = std::numeric_limits<int8_t>::max(); gemmlowp_info.output_data_type = DataType::QASYMM8_SIGNED; const TensorInfo projection_outstage_info(*output_state_out); - const TensorInfo projection_weights_transposed(TensorShape(output_size, num_units), 1, lstm_params.projection_weights()->data_type(), lstm_params.projection_weights()->quantization_info()); + const TensorInfo projection_weights_transposed(TensorShape(output_size, num_units), 1, + lstm_params.projection_weights()->data_type(), + lstm_params.projection_weights()->quantization_info()); - TensorInfo projection_mm_out_info{ mm_out_info }; + TensorInfo projection_mm_out_info{mm_out_info}; projection_mm_out_info.set_tensor_shape(TensorShape(output_size, batch_size)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, &hidden_out_info, &projection_weights_transposed, &projection_eff_bias_info, projection_scale, &projection_mm_out_info, + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, &hidden_out_info, &projection_weights_transposed, + &projection_eff_bias_info, projection_scale, &projection_mm_out_info, &projection_outstage_info)); - if(projection_tensor_copy_required) + if (projection_tensor_copy_required) { - ARM_COMPUTE_RETURN_ON_ERROR(CLQLSTMLayer::TensorCopyKernel::validate(*output_state_out, projection_outstage_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLQLSTMLayer::TensorCopyKernel::validate(*output_state_in, projection_outstage_info)); } - ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::ADD, output_state_out, output_state_out, output_state_out, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(output_state_out, output_state_out, output_state_out, + ConvertPolicy::SATURATE)); - if(projection_tensor_copy_required) + if (projection_tensor_copy_required) { - ARM_COMPUTE_RETURN_ON_ERROR(CLQLSTMLayer::TensorCopyKernel::validate(projection_outstage_info, *output_state_out)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLQLSTMLayer::TensorCopyKernel::validate(projection_outstage_info, *output_state_out)); } - int8_t quantized_projection_clip{ 0 }; - 
if(lstm_params.projection_clip() > 0.0f) + int8_t quantized_projection_clip{0}; + if (lstm_params.projection_clip() > 0.0f) { quantized_projection_clip = quantize_qasymm8_signed(lstm_params.projection_clip(), qprojection); } - if(quantized_projection_clip > 0) + if (quantized_projection_clip > 0) { - ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(output_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_projection_clip, - quantized_projection_clip))); + ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate( + output_state_out, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, + -quantized_projection_clip, quantized_projection_clip))); } } else { - if(projection_tensor_copy_required) + if (projection_tensor_copy_required) { ARM_COMPUTE_RETURN_ON_ERROR(CLQLSTMLayer::TensorCopyKernel::validate(hidden_out_info, *output_state_out)); } } - if(cell_state_out->total_size() > 0) + if (cell_state_out->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(cell_state_in, cell_state_out); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(cell_state_in, cell_state_out); } - if(output_state_out->total_size() > 0) + if (output_state_out->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output_state_out); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output_state_in, output_state_out); } - ARM_COMPUTE_RETURN_ON_ERROR(CLCopyKernel::validate(output_state_out, output)); + ARM_COMPUTE_RETURN_ON_ERROR(CLCopy::validate(output_state_out, output)); return Status{}; } @@ -913,16 +1208,16 @@ void CLQLSTMLayer::run() _mm_recurrent_to_forget.run(); _recurrent_to_forget_outstage.run(); - CLScheduler::get().enqueue(_accumulate_input_recurrent_forget); + _accumulate_input_recurrent_forget.run(); - if(_has_peephole) + if (_has_peephole) { - CLScheduler::get().enqueue(_pixelwise_mul_cell_to_forget); + _pixelwise_mul_cell_to_forget.run(); _cell_to_forget_outstage.run(); - CLScheduler::get().enqueue(_accumulate_cell_forget); + _accumulate_cell_forget.run(); } - if(_has_layer_norm) + if (_has_layer_norm) { CLScheduler::get().enqueue(get_layer_norm(LayerNormGate::Forget)); } @@ -935,9 +1230,9 @@ void CLQLSTMLayer::run() _mm_recurrent_to_cell.run(); _recurrent_to_cell_outstage.run(); - CLScheduler::get().enqueue(_accumulate_input_recurrent_modulation); + _accumulate_input_recurrent_modulation.run(); - if(_has_layer_norm) + if (_has_layer_norm) { CLScheduler::get().enqueue(get_layer_norm(LayerNormGate::Cell)); } @@ -945,9 +1240,9 @@ void CLQLSTMLayer::run() _cell_gate_tanh.run(); // Input gate - if(_has_cifg) + if (_has_cifg) { - CLScheduler::get().enqueue(_input_gate_sub); + _input_gate_sub.run(); } else { @@ -955,16 +1250,16 @@ void CLQLSTMLayer::run() _input_to_input_outstage.run(); _mm_recurrent_to_input.run(); _recurrent_to_input_outstage.run(); - CLScheduler::get().enqueue(_accumulate_input_recurrent_input); + _accumulate_input_recurrent_input.run(); - if(_has_peephole) + if (_has_peephole) { - CLScheduler::get().enqueue(_pixelwise_mul_cell_to_input); + _pixelwise_mul_cell_to_input.run(); _cell_to_input_outstage.run(); - CLScheduler::get().enqueue(_accumulate_cell_input); + _accumulate_cell_input.run(); } - if(_has_layer_norm) + if (_has_layer_norm) { CLScheduler::get().enqueue(get_layer_norm(LayerNormGate::Input)); } @@ -973,10 +1268,10 @@ void CLQLSTMLayer::run() } // Cell. 
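
The run() hunks here and below all make the same mechanical change: members that used to be kernels, pushed onto the command queue by the layer itself, are now runtime functions that own their operator and do their own scheduling. Schematically (placeholder names, not actual members):

    // Before: the layer enqueues a kernel directly.
    //     CLScheduler::get().enqueue(_some_kernel);
    // After: the layer calls a function; scheduling and tensor binding happen inside.
    //     _some_function.run();

Only the layer-norm gates still go through CLScheduler::get().enqueue(), because they remain raw kernels. The cell-update statements that follow are the first instance of this rewrite.
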
- CLScheduler::get().enqueue(_pixelwise_mul_forget_cell); - CLScheduler::get().enqueue(_pixelwise_mul_input_cell); - CLScheduler::get().enqueue(_add_forget_cell); - if(_has_cell_clipping) + _pixelwise_mul_forget_cell.run(); + _pixelwise_mul_input_cell.run(); + _add_forget_cell.run(); + if (_has_cell_clipping) { _cell_clip.run(); } @@ -986,15 +1281,15 @@ void CLQLSTMLayer::run() _input_to_output_outstage.run(); _mm_recurrent_to_output.run(); _recurrent_to_output_outstage.run(); - CLScheduler::get().enqueue(_accumulate_input_recurrent_output); - if(_has_peephole) + _accumulate_input_recurrent_output.run(); + if (_has_peephole) { - CLScheduler::get().enqueue(_pixelwise_mul_cell_to_output); + _pixelwise_mul_cell_to_output.run(); _cell_to_output_outstage.run(); - CLScheduler::get().enqueue(_accumulate_cell_to_output); + _accumulate_cell_to_output.run(); } - if(_has_layer_norm) + if (_has_layer_norm) { CLScheduler::get().enqueue(get_layer_norm(LayerNormGate::Output)); } @@ -1003,47 +1298,47 @@ void CLQLSTMLayer::run() // Hidden. _hidden_tanh.run(); - CLScheduler::get().enqueue(_pixelwise_mul_hidden); + _pixelwise_mul_hidden.run(); _hidden_outstage.run(); // Projection. - if(_has_projection) + if (_has_projection) { _mm_projection.run(); _projection_outstage.run(); - if(_projection_tensor_copy_required) + if (_projection_tensor_copy_required) { _projection_output_to_accumulate_copy.run(); } - CLScheduler::get().enqueue(_accumulate_projection); + _accumulate_projection.run(); - if(_projection_tensor_copy_required) + if (_projection_tensor_copy_required) { _projection_accumulate_to_output_copy.run(); } - if(_has_projection_clipping) + if (_has_projection_clipping) { _projection_clip.run(); } } else { - if(_projection_tensor_copy_required) + if (_projection_tensor_copy_required) { _hidden_to_output_copy.run(); } } // Copy output_state_out to output - CLScheduler::get().enqueue(_copy_output); + _copy_output.run(); } void CLQLSTMLayer::prepare() { - if(!_is_prepared) + if (!_is_prepared) { // Pre-transpose weights to be used in GEMM. 
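
prepare() is the one-shot side of the function interface: work that depends only on the weights runs once, guarded by the _is_prepared flag, after which the original weight tensors can be released. A condensed sketch of the idiom (illustrative member names), which the transposition statements below apply to each weight matrix:

    void ExampleFunction::prepare()
    {
        if (!_is_prepared)
        {
            _weights_transposed.allocator()->allocate(); // backing memory for the transposed copy
            _transpose_weights.run();                    // one-off transpose into GEMM layout
            _weights->mark_as_unused();                  // original weights may now be freed
            _is_prepared = true;
        }
    }
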
_input_to_forget_weights_transposed.allocator()->allocate(); @@ -1060,18 +1355,25 @@ void CLQLSTMLayer::prepare() _transpose_recurrent_to_output_weights.run(); // Precompute effective biases - if(_has_cifg) + if (_has_cifg) { _ones.map(true); - std::fill_n(reinterpret_cast<int16_t *>(_ones.buffer()), _ones.info()->total_size() / _ones.info()->element_size(), 32767); + std::fill_n(reinterpret_cast<int16_t *>(_ones.buffer()), + _ones.info()->total_size() / _ones.info()->element_size(), 32767); _ones.unmap(); } else { _input_to_input_eff_bias.allocator()->allocate(); _recurrent_to_input_eff_bias.allocator()->allocate(); - CLScheduler::get().enqueue(_input_to_input_reduction); - CLScheduler::get().enqueue(_recurrent_to_input_reduction); + + ITensorPack input_to_input_red_pack = {{ACL_SRC, _input_to_input_weights}, + {ACL_DST, &_input_to_input_eff_bias}}; + CLScheduler::get().enqueue_op(*_input_to_input_reduction, input_to_input_red_pack, false); + + ITensorPack rec_to_input_red_pack = {{ACL_SRC, _recurrent_to_input_weights}, + {ACL_DST, &_recurrent_to_input_eff_bias}}; + CLScheduler::get().enqueue_op(*_recurrent_to_input_reduction, rec_to_input_red_pack, false); _input_to_input_weights_transposed.allocator()->allocate(); _recurrent_to_input_weights_transposed.allocator()->allocate(); @@ -1086,19 +1388,38 @@ void CLQLSTMLayer::prepare() _recurrent_to_cell_eff_bias.allocator()->allocate(); _input_to_output_eff_bias.allocator()->allocate(); _recurrent_to_output_eff_bias.allocator()->allocate(); - CLScheduler::get().enqueue(_input_to_forget_reduction); - CLScheduler::get().enqueue(_recurrent_to_forget_reduction); - CLScheduler::get().enqueue(_input_to_cell_reduction); - CLScheduler::get().enqueue(_recurrent_to_cell_reduction); - CLScheduler::get().enqueue(_input_to_output_reduction); - CLScheduler::get().enqueue(_recurrent_to_output_reduction); - - if(_has_projection) + + ITensorPack input_to_forget_red_pack = {{ACL_SRC, _input_to_forget_weights}, + {ACL_DST, &_input_to_forget_eff_bias}}; + CLScheduler::get().enqueue_op(*_input_to_forget_reduction, input_to_forget_red_pack, false); + + ITensorPack rec_to_forget_red_pack = {{ACL_SRC, _recurrent_to_forget_weights}, + {ACL_DST, &_recurrent_to_forget_eff_bias}}; + CLScheduler::get().enqueue_op(*_recurrent_to_forget_reduction, rec_to_forget_red_pack, false); + + ITensorPack input_to_cell_red_pack = {{ACL_SRC, _input_to_cell_weights}, {ACL_DST, &_input_to_cell_eff_bias}}; + CLScheduler::get().enqueue_op(*_input_to_cell_reduction, input_to_cell_red_pack, false); + + ITensorPack rec_to_cell_red_pack = {{ACL_SRC, _recurrent_to_cell_weights}, + {ACL_DST, &_recurrent_to_cell_eff_bias}}; + CLScheduler::get().enqueue_op(*_recurrent_to_cell_reduction, rec_to_cell_red_pack, false); + + ITensorPack input_to_output_red_pack = {{ACL_SRC, _input_to_output_weights}, + {ACL_DST, &_input_to_output_eff_bias}}; + CLScheduler::get().enqueue_op(*_input_to_output_reduction, input_to_output_red_pack, false); + + ITensorPack rec_to_output_red_pack = {{ACL_SRC, _recurrent_to_output_weights}, + {ACL_DST, &_recurrent_to_output_eff_bias}}; + CLScheduler::get().enqueue_op(*_recurrent_to_output_reduction, rec_to_output_red_pack, false); + + if (_has_projection) { - if(_projection_bias != nullptr) + _projection_eff_bias.allocator()->allocate(); + ITensorPack proj_red_pack{{ACL_SRC, _projection_weights}, {ACL_DST, &_projection_eff_bias}}; + CLScheduler::get().enqueue_op(*_projection_reduction, proj_red_pack, false); + if (_projection_bias != nullptr) { - 
_projection_eff_bias.allocator()->allocate(); - CLScheduler::get().enqueue(_projection_reduction); + _projection_bias_add.run(); _projection_bias->mark_as_unused(); } @@ -1106,7 +1427,7 @@ void CLQLSTMLayer::prepare() _transpose_projection_weights.run(); _projection_weights->mark_as_unused(); - if(!_projection_tensor_copy_required) + if (!_projection_tensor_copy_required) { _hidden_gate.mark_as_unused(); _projection_accumulate_res.mark_as_unused(); diff --git a/src/runtime/CL/functions/CLQuantizationLayer.cpp b/src/runtime/CL/functions/CLQuantizationLayer.cpp index 6239f279ea..6edef29992 100644 --- a/src/runtime/CL/functions/CLQuantizationLayer.cpp +++ b/src/runtime/CL/functions/CLQuantizationLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 ARM Limited. + * Copyright (c) 2017-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,11 +23,26 @@ */ #include "arm_compute/runtime/CL/functions/CLQuantizationLayer.h" -#include "arm_compute/core/CL/kernels/CLQuantizationLayerKernel.h" -#include "support/MemorySupport.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" + +#include "src/core/CL/ICLKernel.h" +#include "src/gpu/cl/operators/ClQuantize.h" namespace arm_compute { +struct CLQuantizationLayer::Impl +{ + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<opencl::ClQuantize> op{nullptr}; +}; + +CLQuantizationLayer::CLQuantizationLayer() : _impl(std::make_unique<Impl>()) +{ +} +CLQuantizationLayer::~CLQuantizationLayer() = default; + void CLQuantizationLayer::configure(const ICLTensor *input, ICLTensor *output) { configure(CLKernelLibrary::get().get_compile_context(), input, output); @@ -35,13 +50,23 @@ void CLQuantizationLayer::configure(const ICLTensor *input, ICLTensor *output) void CLQuantizationLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output) { - auto k = arm_compute::support::cpp14::make_unique<CLQuantizationLayerKernel>(); - k->configure(compile_context, input, output); - _kernel = std::move(k); + _impl->src = input; + _impl->dst = output; + + _impl->op = std::make_unique<opencl::ClQuantize>(); + _impl->op->configure(compile_context, input->info(), output->info()); } Status CLQuantizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output) { - return CLQuantizationLayerKernel::validate(input, output); + return opencl::ClQuantize::validate(input, output); +} + +void CLQuantizationLayer::run() +{ + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC, _impl->src); + pack.add_tensor(TensorType::ACL_DST, _impl->dst); + _impl->op->run(pack); } } // namespace arm_compute diff --git a/src/runtime/CL/functions/CLRNNLayer.cpp b/src/runtime/CL/functions/CLRNNLayer.cpp index 57b8d70089..34b78eefa7 100644 --- a/src/runtime/CL/functions/CLRNNLayer.cpp +++ b/src/runtime/CL/functions/CLRNNLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 ARM Limited. + * Copyright (c) 2018-2021 Arm Limited. 
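
The rewritten CLQuantizationLayer just above is the template for this migration: the public function keeps only a pimpl (source, destination, backend operator) and re-binds the tensors through an ITensorPack on every run(), so the operator itself stays stateless. A hedged usage sketch (shapes and quantization parameters are illustrative):

    // Quantize an F32 tensor to QASYMM8 with scale 0.05 and offset 10.
    CLScheduler::get().default_init(); // create the CL context/queue once per process
    CLTensor src, dst;
    src.allocator()->init(TensorInfo(TensorShape(16U, 16U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(16U, 16U), 1, DataType::QASYMM8,
                                     QuantizationInfo(0.05f, 10)));
    CLQuantizationLayer quantize;
    quantize.configure(&src, &dst); // builds the ClQuantize operator from the tensor infos
    src.allocator()->allocate();
    dst.allocator()->allocate();
    quantize.run(); // packs src/dst into an ITensorPack and runs the operator
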
* * SPDX-License-Identifier: MIT * @@ -29,23 +29,44 @@ #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include <utility> +#include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" -using namespace arm_compute; +namespace arm_compute +{ using namespace arm_compute::misc::shape_calculator; CLRNNLayer::CLRNNLayer(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _gemm_state_f(), _add_kernel(), _activation_kernel(), _fully_connected_kernel(), _copy_kernel(), _fully_connected_out(), _gemm_output(), _add_output(), + : _memory_group(std::move(memory_manager)), + _gemm_state_f(), + _add_kernel(), + _activation(), + _fully_connected_kernel(), + _copy(), + _fully_connected_out(), + _gemm_output(), + _add_output(), _is_prepared(false) { } -Status CLRNNLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *recurrent_weights, const ITensorInfo *bias, const ITensorInfo *hidden_state, - const ITensorInfo *output, const ActivationLayerInfo &info) +CLRNNLayer::~CLRNNLayer() = default; + +Status CLRNNLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *recurrent_weights, + const ITensorInfo *bias, + const ITensorInfo *hidden_state, + const ITensorInfo *output, + const ActivationLayerInfo &info) { + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state, output); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, recurrent_weights, bias, hidden_state, output); + const int idx_width = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH); const int idx_height = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT); - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state, output); + ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(idx_width) != weights->dimension(idx_width)); ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_height) != recurrent_weights->dimension(idx_width)); ARM_COMPUTE_RETURN_ERROR_ON(recurrent_weights->dimension(idx_width) != recurrent_weights->dimension(1)); @@ -55,28 +76,43 @@ Status CLRNNLayer::validate(const ITensorInfo *input, const ITensorInfo *weights ARM_COMPUTE_RETURN_ERROR_ON(hidden_state->dimension(idx_height) != input->dimension(idx_height)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), hidden_state->tensor_shape()); - auto shape_info = TensorInfo(compute_rnn_shape(recurrent_weights, hidden_state->dimension(idx_height)), 1, input->data_type()); + auto shape_info = + TensorInfo(compute_rnn_shape(recurrent_weights, hidden_state->dimension(idx_height)), 1, input->data_type()); ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, weights, bias, &shape_info)); ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(hidden_state, recurrent_weights, nullptr, &shape_info, 1.f, 0.f)); - ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::ADD, &shape_info, &shape_info, &shape_info, ConvertPolicy::SATURATE)); - ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayerKernel::validate(&shape_info, &shape_info, info)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLArithmeticAddition::validate(&shape_info, &shape_info, &shape_info, ConvertPolicy::SATURATE)); + 
ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&shape_info, &shape_info, info)); return Status{}; } -void CLRNNLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *recurrent_weights, const ICLTensor *bias, ICLTensor *hidden_state, ICLTensor *output, +void CLRNNLayer::configure(const ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *recurrent_weights, + const ICLTensor *bias, + ICLTensor *hidden_state, + ICLTensor *output, ActivationLayerInfo &info) { - configure(CLKernelLibrary::get().get_compile_context(), input, weights, recurrent_weights, bias, hidden_state, output, info); + configure(CLKernelLibrary::get().get_compile_context(), input, weights, recurrent_weights, bias, hidden_state, + output, info); } -void CLRNNLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *recurrent_weights, const ICLTensor *bias, - ICLTensor *hidden_state, - ICLTensor *output, ActivationLayerInfo &info) +void CLRNNLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *recurrent_weights, + const ICLTensor *bias, + ICLTensor *hidden_state, + ICLTensor *output, + ActivationLayerInfo &info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state, output); - ARM_COMPUTE_ERROR_THROW_ON(CLRNNLayer::validate(input->info(), weights->info(), recurrent_weights->info(), bias->info(), hidden_state->info(), output->info(), info)); + ARM_COMPUTE_ERROR_THROW_ON(CLRNNLayer::validate(input->info(), weights->info(), recurrent_weights->info(), + bias->info(), hidden_state->info(), output->info(), info)); + ARM_COMPUTE_LOG_PARAMS(input, weights, recurrent_weights, bias, hidden_state, output, info); const int idx_height = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT); TensorShape shape = compute_rnn_shape(recurrent_weights->info(), hidden_state->info()->dimension(idx_height)); @@ -96,15 +132,15 @@ void CLRNNLayer::configure(const CLCompileContext &compile_context, const ICLTen _add_output.allocator()->init(TensorInfo(shape, 1, input->info()->data_type())); _memory_group.manage(&_add_output); - _add_kernel.configure(compile_context, ArithmeticOperation::ADD, &_fully_connected_out, &_gemm_output, &_add_output, ConvertPolicy::SATURATE); + _add_kernel.configure(compile_context, &_fully_connected_out, &_gemm_output, &_add_output, ConvertPolicy::SATURATE); _fully_connected_out.allocator()->allocate(); _gemm_output.allocator()->allocate(); - _activation_kernel.configure(compile_context, &_add_output, hidden_state, info); + _activation.configure(compile_context, &_add_output, hidden_state, info); _add_output.allocator()->allocate(); - _copy_kernel.configure(compile_context, hidden_state, output); + _copy.configure(compile_context, hidden_state, output); } void CLRNNLayer::run() @@ -115,16 +151,16 @@ void CLRNNLayer::run() _fully_connected_kernel.run(); _gemm_state_f.run(); - CLScheduler::get().enqueue(_add_kernel); - CLScheduler::get().enqueue(_activation_kernel); + _add_kernel.run(); + _activation.run(); // copy hidden out to output - CLScheduler::get().enqueue(_copy_kernel); + _copy.run(); } void CLRNNLayer::prepare() { - if(!_is_prepared) + if (!_is_prepared) { _fully_connected_kernel.prepare(); _gemm_state_f.prepare(); @@ -132,3 +168,4 @@ void CLRNNLayer::prepare() _is_prepared = true; } } +} // namespace arm_compute diff --git 
a/src/runtime/CL/functions/CLROIAlignLayer.cpp b/src/runtime/CL/functions/CLROIAlignLayer.cpp index 43b58ddb9b..1939d1d0ba 100644 --- a/src/runtime/CL/functions/CLROIAlignLayer.cpp +++ b/src/runtime/CL/functions/CLROIAlignLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 ARM Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,27 +24,41 @@ #include "arm_compute/runtime/CL/functions/CLROIAlignLayer.h" #include "arm_compute/core/CL/ICLArray.h" -#include "arm_compute/core/CL/kernels/CLROIAlignLayerKernel.h" -#include "support/MemorySupport.h" + +#include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLROIAlignLayerKernel.h" +#include "src/core/CL/kernels/CLROIPoolingLayerKernel.h" namespace arm_compute { -Status CLROIAlignLayer::validate(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info) +Status CLROIAlignLayer::validate(const ITensorInfo *input, + const ITensorInfo *rois, + ITensorInfo *output, + const ROIPoolingLayerInfo &pool_info) { ARM_COMPUTE_RETURN_ON_ERROR(CLROIAlignLayerKernel::validate(input, rois, output, pool_info)); return Status{}; } -void CLROIAlignLayer::configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info) +void CLROIAlignLayer::configure(const ICLTensor *input, + const ICLTensor *rois, + ICLTensor *output, + const ROIPoolingLayerInfo &pool_info) { configure(CLKernelLibrary::get().get_compile_context(), input, rois, output, pool_info); } -void CLROIAlignLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info) +void CLROIAlignLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *rois, + ICLTensor *output, + const ROIPoolingLayerInfo &pool_info) { + ARM_COMPUTE_LOG_PARAMS(input, rois, output, pool_info); + // Configure ROI pooling kernel - auto k = arm_compute::support::cpp14::make_unique<CLROIAlignLayerKernel>(); + auto k = std::make_unique<CLROIAlignLayerKernel>(); k->configure(compile_context, input, rois, output, pool_info); _kernel = std::move(k); } diff --git a/src/runtime/CL/functions/CLROIPoolingLayer.cpp b/src/runtime/CL/functions/CLROIPoolingLayer.cpp index bb54cfa2ca..0d2eab0c76 100644 --- a/src/runtime/CL/functions/CLROIPoolingLayer.cpp +++ b/src/runtime/CL/functions/CLROIPoolingLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 ARM Limited. + * Copyright (c) 2017-2021 Arm Limited. 
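
CLROIAlignLayer above, and CLROIPoolingLayer, CLRange, and CLReorgLayer below, all follow the simple-function pattern: configure() builds a single kernel, stores it in the _kernel member inherited from ICLSimpleFunction, and the base-class run() enqueues it. The only changes in this migration are the std::make_unique switch and the added parameter logging. Sketch of the pattern (the kernel type is a stand-in):

    // Inside SomeSimpleFunction::configure(...):
    auto k = std::make_unique<CLSomeKernel>();        // one concrete kernel per function
    k->configure(compile_context, input, output, params);
    _kernel = std::move(k);                           // ICLSimpleFunction::run() enqueues *_kernel
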
* * SPDX-License-Identifier: MIT * @@ -25,20 +25,37 @@ #include "arm_compute/core/CL/ICLArray.h" -#include "arm_compute/core/CL/kernels/CLROIPoolingLayerKernel.h" -#include "support/MemorySupport.h" +#include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLROIPoolingLayerKernel.h" using namespace arm_compute; -void CLROIPoolingLayer::configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info) +Status CLROIPoolingLayer::validate(const ITensorInfo *input, + const ITensorInfo *rois, + ITensorInfo *output, + const ROIPoolingLayerInfo &pool_info) +{ + return CLROIPoolingLayerKernel::validate(input, rois, output, pool_info); +} + +void CLROIPoolingLayer::configure(const ICLTensor *input, + const ICLTensor *rois, + ICLTensor *output, + const ROIPoolingLayerInfo &pool_info) { configure(CLKernelLibrary::get().get_compile_context(), input, rois, output, pool_info); } -void CLROIPoolingLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info) +void CLROIPoolingLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *rois, + const ICLTensor *output, + const ROIPoolingLayerInfo &pool_info) { + ARM_COMPUTE_LOG_PARAMS(input, rois, output, pool_info); + // Configure ROI pooling kernel - auto k = arm_compute::support::cpp14::make_unique<CLROIPoolingLayerKernel>(); + auto k = std::make_unique<CLROIPoolingLayerKernel>(); k->configure(compile_context, input, rois, output, pool_info); _kernel = std::move(k); } diff --git a/src/runtime/CL/functions/CLRange.cpp b/src/runtime/CL/functions/CLRange.cpp index b29b03d5b5..5c3f7f9c8c 100644 --- a/src/runtime/CL/functions/CLRange.cpp +++ b/src/runtime/CL/functions/CLRange.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 ARM Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,11 +24,12 @@ #include "arm_compute/runtime/CL/functions/CLRange.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLRangeKernel.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "support/MemorySupport.h" + +#include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLRangeKernel.h" using namespace arm_compute; @@ -37,9 +38,11 @@ void CLRange::configure(ICLTensor *output, const float start, const float end, c configure(CLKernelLibrary::get().get_compile_context(), output, start, end, step); } -void CLRange::configure(const CLCompileContext &compile_context, ICLTensor *output, const float start, const float end, const float step) +void CLRange::configure( + const CLCompileContext &compile_context, ICLTensor *output, const float start, const float end, const float step) { - auto k = arm_compute::support::cpp14::make_unique<CLRangeKernel>(); + ARM_COMPUTE_LOG_PARAMS(output, start, end, step); + auto k = std::make_unique<CLRangeKernel>(); k->set_target(CLScheduler::get().target()); k->configure(compile_context, output, start, end, step); _kernel = std::move(k); diff --git a/src/runtime/CL/functions/CLReduceMean.cpp b/src/runtime/CL/functions/CLReduceMean.cpp index ce447636ec..bef8d887fd 100644 --- a/src/runtime/CL/functions/CLReduceMean.cpp +++ b/src/runtime/CL/functions/CLReduceMean.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 ARM Limited. + * Copyright (c) 2018-2021, 2023-2024 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -23,23 +23,29 @@ */ #include "arm_compute/runtime/CL/functions/CLReduceMean.h" -#include "arm_compute/core/CL/CLValidate.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLReductionOperationKernel.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/common/utils/Log.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLReductionOperationKernel.h" +#include "src/core/helpers/AutoConfiguration.h" + namespace arm_compute { namespace { -Status validate_config(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output) +Status +validate_config(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output) { ARM_COMPUTE_UNUSED(keep_dims); ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() < 1); ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() > input->num_dimensions()); @@ -47,29 +53,36 @@ Status validate_config(const ITensorInfo *input, const Coordinates &reduction_ax const int input_dims = input->num_dimensions(); Coordinates axis_local = reduction_axis; - for(unsigned int i = 0; i < axis_local.num_dimensions(); ++i) + for (unsigned int i = 0; i < axis_local.num_dimensions(); ++i) { //axis: The dimensions to reduce. Must be in the range [-rank(input_tensor), rank(input_tensor)). 
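
As that comment says, reduction axes use the TensorFlow convention: a negative axis a refers to dimension rank + a, and convert_negative_axis() folds each entry back into [0, rank). For instance (illustrative values):

    // For a 4-D input, axis -1 addresses dimension 3.
    int       axis = -1;
    const int rank = 4;
    if (axis < 0)
    {
        axis += rank; // -1 -> 3, the same per-element mapping convert_negative_axis() applies
    }

The range checks that follow enforce exactly [-rank, rank) before that conversion happens.
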
ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] < (-static_cast<int>(input->num_dimensions()))); ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] >= static_cast<int>(input->num_dimensions())); } - if(output->tensor_shape().total_size() != 0) + if (output->tensor_shape().total_size() != 0) { // Only validate if not using auto_init for the output tensor TensorShape out_shape = input->tensor_shape(); // Validate output_shape only if not using auto_init convert_negative_axis(axis_local, input_dims); + +// Suppress warning produced by a compiler bug in GCC +// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=104165 +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Warray-bounds" std::sort(axis_local.begin(), axis_local.begin() + reduction_ops); - for(unsigned int i = 0; i < reduction_ops; ++i) +#pragma GCC diagnostic pop + + for (unsigned int i = 0; i < reduction_ops; ++i) { ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] > 3); ARM_COMPUTE_RETURN_ERROR_ON(static_cast<unsigned int>(axis_local[i]) > input->num_dimensions() - 1); - if(output->total_size() > 0 && keep_dims) + if (output->total_size() > 0 && keep_dims) { ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(axis_local[i]) != 1); } - if(keep_dims) + if (keep_dims) { out_shape.set(axis_local[i], 1); } @@ -78,86 +91,150 @@ Status validate_config(const ITensorInfo *input, const Coordinates &reduction_ax ARM_COMPUTE_RETURN_ERROR_ON(i > static_cast<unsigned int>(axis_local[i])); const unsigned int remove_index = axis_local[i] - i; ARM_COMPUTE_RETURN_ERROR_ON(remove_index >= out_shape.num_dimensions()); - out_shape.remove_dimension(remove_index); + out_shape.remove_dimension(remove_index, false); } } const TensorInfo out_info = input->clone()->set_tensor_shape(out_shape); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &out_info); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output); + const bool requant = + is_data_type_quantized(input->data_type()) && input->quantization_info() != output->quantization_info(); + if (requant) + { + TensorInfo input_no_quant(input->clone()->set_data_type(DataType::F32)); + CLDequantizationLayer::validate(input, &input_no_quant); + TensorInfo output_no_quant(output->clone()->set_data_type(DataType::F32)); + CLQuantizationLayer::validate(&output_no_quant, output); + } } return Status{}; } -} +} // namespace + CLReduceMean::CLReduceMean(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _reduction_kernels(), _reduced_outs(), _reshape(), _reduction_ops(), _keep_dims() + : _memory_group(std::move(memory_manager)), + _reduction_kernels(), + _reduced_outs(), + _reshape(), + _dequant(), + _requant(), + _reduction_ops(), + _keep_dims(), + _do_requant(), + _input_no_quant(), + _output_no_quant() { } + void CLReduceMean::configure(ICLTensor *input, const Coordinates &reduction_axis, bool keep_dims, ICLTensor *output) { configure(CLKernelLibrary::get().get_compile_context(), input, reduction_axis, keep_dims, output); } -void CLReduceMean::configure(const CLCompileContext &compile_context, ICLTensor *input, const Coordinates &reduction_axis, bool keep_dims, ICLTensor *output) +void CLReduceMean::configure(const CLCompileContext &compile_context, + ICLTensor *input, + const Coordinates &reduction_axis, + bool keep_dims, + ICLTensor *output) { // Perform validate step ARM_COMPUTE_ERROR_THROW_ON(CLReduceMean::validate(input->info(), reduction_axis, keep_dims, output->info())); + ARM_COMPUTE_LOG_PARAMS(input, reduction_axis, keep_dims, output); + // Output auto 
inizialitation if not yet initialized - const TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_reduce_mean_shape(input, reduction_axis, keep_dims); + const TensorShape output_shape = + arm_compute::misc::shape_calculator::calculate_reduce_mean_shape(input->info(), reduction_axis, keep_dims); auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape)); + _do_requant = is_data_type_quantized(input->info()->data_type()) && + input->info()->quantization_info() != output->info()->quantization_info(); _reduction_ops = reduction_axis.num_dimensions(); _reduction_kernels.resize(_reduction_ops); _reduced_outs.resize(_reduction_ops - (keep_dims ? 1 : 0)); _keep_dims = keep_dims; + ICLTensor *tmp_input = input; + ICLTensor *tmp_output = output; + if (_do_requant) + { + _memory_group.manage(&_input_no_quant); + _memory_group.manage(&_output_no_quant); + TensorInfo output_no_quant_info = input->info()->clone()->set_tensor_shape(output_shape); + output_no_quant_info.set_data_type(DataType::F32); + auto_init_if_empty(*_output_no_quant.info(), output_no_quant_info); + auto_init_if_empty(*_input_no_quant.info(), input->info()->clone()->set_data_type(DataType::F32)); + _dequant.configure(compile_context, input, &_input_no_quant); + tmp_input = &_input_no_quant; + tmp_output = &_output_no_quant; + } + Coordinates axis_local = reduction_axis; - const int input_dims = input->info()->num_dimensions(); + const int input_dims = tmp_input->info()->num_dimensions(); convert_negative_axis(axis_local, input_dims); // Perform reduction for every axis - for(int i = 0; i < _reduction_ops; ++i) + for (int i = 0; i < _reduction_ops; ++i) { - TensorShape out_shape = i == 0 ? input->info()->tensor_shape() : (&_reduced_outs[i - 1])->info()->tensor_shape(); + TensorShape out_shape = + i == 0 ? tmp_input->info()->tensor_shape() : (&_reduced_outs[i - 1])->info()->tensor_shape(); out_shape.set(axis_local[i], 1); - auto in = (i == 0) ? input : (&_reduced_outs[i - 1]); + auto in = (i == 0) ? tmp_input : (&_reduced_outs[i - 1]); - if(i == _reduction_ops - 1 && keep_dims) + if (i == _reduction_ops - 1 && keep_dims) { - _reduction_kernels[i].configure(compile_context, in, output, axis_local[i], ReductionOperation::MEAN_SUM); + _reduction_kernels[i].configure(compile_context, in, tmp_output, axis_local[i], + ReductionOperation::MEAN_SUM); } else { - _reduced_outs[i].allocator()->init(TensorInfo(out_shape, input->info()->num_channels(), input->info()->data_type(), input->info()->quantization_info())); + _reduced_outs[i].allocator()->init(TensorInfo(out_shape, tmp_input->info()->num_channels(), + tmp_input->info()->data_type(), + tmp_input->info()->quantization_info())); _memory_group.manage(&_reduced_outs[i]); - _reduction_kernels[i].configure(compile_context, in, &_reduced_outs[i], axis_local[i], ReductionOperation::MEAN_SUM); + _reduction_kernels[i].configure(compile_context, in, &_reduced_outs[i], axis_local[i], + ReductionOperation::MEAN_SUM); } } // Allocate intermediate tensors - for(int i = 0; i < _reduction_ops - (keep_dims ? 1 : 0); ++i) + for (int i = 0; i < _reduction_ops - (keep_dims ? 
1 : 0); ++i) { _reduced_outs[i].allocator()->allocate(); } // Configure reshape layer if we want to drop the dimensions - if(!keep_dims) + if (!_keep_dims) { - TensorShape out_shape = input->info()->tensor_shape(); + TensorShape out_shape = tmp_input->info()->tensor_shape(); // We have to sort the reduction axis vectors in order for remove_dimension // to work properly + +// Suppress warning produced by a compiler bug in GCC +// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=104165 +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Warray-bounds" std::sort(axis_local.begin(), axis_local.begin() + _reduction_ops); - for(int i = 0; i < _reduction_ops; ++i) +#pragma GCC diagnostic pop + for (int i = 0; i < _reduction_ops; ++i) { - out_shape.remove_dimension(axis_local[i] - i); + out_shape.remove_dimension(axis_local[i] - i, false); } - auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(out_shape)); - _reshape.configure(compile_context, &_reduced_outs[_reduction_ops - 1], output); + auto_init_if_empty(*tmp_output->info(), tmp_input->info()->clone()->set_tensor_shape(out_shape)); + _reshape.configure(compile_context, &_reduced_outs[_reduction_ops - 1], tmp_output); + } + if (_do_requant) + { + _requant.configure(compile_context, &_output_no_quant, output); + _input_no_quant.allocator()->allocate(); + _output_no_quant.allocator()->allocate(); } } -Status CLReduceMean::validate(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output) +Status CLReduceMean::validate(const ITensorInfo *input, + const Coordinates &reduction_axis, + bool keep_dims, + const ITensorInfo *output) { return validate_config(input, reduction_axis, keep_dims, output); } @@ -166,14 +243,21 @@ void CLReduceMean::run() { MemoryGroupResourceScope scope_mg(_memory_group); - for(auto &kernel : _reduction_kernels) + if (_do_requant) + { + _dequant.run(); + } + for (auto &kernel : _reduction_kernels) { kernel.run(); } - - if(!_keep_dims) + if (!_keep_dims) { _reshape.run(); } + if (_do_requant) + { + _requant.run(); + } } } // namespace arm_compute diff --git a/src/runtime/CL/functions/CLReductionOperation.cpp b/src/runtime/CL/functions/CLReductionOperation.cpp index b659ecfaf6..ba5489018e 100644 --- a/src/runtime/CL/functions/CLReductionOperation.cpp +++ b/src/runtime/CL/functions/CLReductionOperation.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 ARM Limited. + * Copyright (c) 2017-2021 Arm Limited. 
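
The requantization path added to CLReduceMean above is taken only when the input is quantized and its QuantizationInfo differs from the output's: a mean computed directly in the integer domain cannot absorb a scale/offset change, so the function dequantizes to F32, reduces in float, and quantizes back with the output's parameters. As a pipeline (stage types as used above):

    // Engaged when: is_data_type_quantized(in) && in_qinfo != out_qinfo
    //   q8(in_qinfo)  --CLDequantizationLayer-->        F32
    //                 --CLReductionOperation(MEAN_SUM,   per axis)--> F32
    //                 --CLQuantizationLayer-->           q8(out_qinfo)

Otherwise the reduction runs directly on the input type, exactly as before.
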
* * SPDX-License-Identifier: MIT * @@ -24,39 +24,46 @@ #include "arm_compute/runtime/CL/functions/CLReductionOperation.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLReductionOperationKernel.h" -#include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/PixelValue.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "arm_compute/runtime/Tensor.h" -#include "arm_compute/runtime/Utils.h" -#include "support/MemorySupport.h" + +#include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLReductionOperationKernel.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/runtime/Utils.h" namespace arm_compute { CLReductionOperation::CLReductionOperation(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _results_vector(), _reduction_kernels_vector(), _border_handlers_vector(), _reshape_kernel(), _op(), _num_of_stages(), _reduction_axis(), _is_serial(), + : _memory_group(std::move(memory_manager)), + _unreshaped_output(), + _reduction_kernel(), + _reshape(), + _reduction_axis(), _is_reshape_required(false) { } -Status CLReductionOperation::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op, bool keep_dims) +CLReductionOperation::~CLReductionOperation() = default; + +Status CLReductionOperation::validate( + const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op, bool keep_dims) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, + "Reduction axis greater than max number of dimensions"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis"); - const unsigned int num_of_stages = calculate_number_of_stages_only_x_axis(input->dimension(0), axis); - const bool is_serial = needs_serialized_reduction(op, input->data_type(), axis); - const bool is_reshape_required = !keep_dims; + const bool is_reshape_required = !keep_dims; - if(is_reshape_required && output->total_size() != 0) + if (is_reshape_required && output->total_size() != 0) { - const TensorInfo expected_output_shape = output->clone()->set_tensor_shape(arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis, keep_dims)); + const TensorInfo expected_output_shape = output->clone()->set_tensor_shape( + arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis, keep_dims)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&expected_output_shape, output); } @@ -64,95 +71,29 @@ Status CLReductionOperation::validate(const ITensorInfo *input, const ITensorInf TensorInfo output_before_reshape; const auto input_shape = input->tensor_shape(); - const auto input_data_type = input->data_type(); const auto input_num_channles = input->num_channels(); const auto input_qinfo = input->quantization_info(); const auto output_data_type = output->data_type(); - auto initialize_tensorinfo = [](TensorInfo & ti, TensorShape shape, DataType data_type, int num_channels, QuantizationInfo qinfo) - { + auto initialize_tensorinfo = [](TensorInfo &ti, TensorShape shape, DataType data_type, int 
num_channels, + QuantizationInfo qinfo) { ti.set_data_type(data_type).set_tensor_shape(shape).set_num_channels(num_channels).set_quantization_info(qinfo); }; - if(is_reshape_required) + if (is_reshape_required) { auto shape_before_reshape = input_shape; shape_before_reshape.set(axis, 1); - initialize_tensorinfo(output_before_reshape, shape_before_reshape, output_data_type, input_num_channles, input_qinfo); + initialize_tensorinfo(output_before_reshape, shape_before_reshape, output_data_type, input_num_channles, + input_qinfo); output_internal = &output_before_reshape; } - if(is_serial) - { - ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(input, output_internal, axis, op)); - } - else - { - // Create temporary tensor infos - std::vector<TensorInfo> sums_vector(num_of_stages - 1); - - // Create intermediate tensor info - TensorShape shape{ input_shape }; - - shape.set(0, ceil(shape.x() / 128.f)); + ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(input, output_internal, axis, op)); - for(unsigned int i = 0; i < num_of_stages - 1; i++) - { - initialize_tensorinfo(sums_vector[i], shape, input_data_type, input_num_channles, input_qinfo); - } - - ReductionOperation first_kernel_op; - ReductionOperation intermediate_kernel_op; - ReductionOperation last_kernel_op; - switch(op) - { - case ReductionOperation::SUM: - case ReductionOperation::MEAN_SUM: - first_kernel_op = ReductionOperation::SUM; - intermediate_kernel_op = ReductionOperation::SUM; - last_kernel_op = op; - break; - case ReductionOperation::SUM_SQUARE: - first_kernel_op = ReductionOperation::SUM_SQUARE; - intermediate_kernel_op = ReductionOperation::SUM; - last_kernel_op = ReductionOperation::SUM; - break; - case ReductionOperation::PROD: - first_kernel_op = ReductionOperation::PROD; - intermediate_kernel_op = ReductionOperation::PROD; - last_kernel_op = ReductionOperation::PROD; - break; - case ReductionOperation::MIN: - first_kernel_op = ReductionOperation::MIN; - intermediate_kernel_op = ReductionOperation::MIN; - last_kernel_op = ReductionOperation::MIN; - break; - case ReductionOperation::MAX: - first_kernel_op = ReductionOperation::MAX; - intermediate_kernel_op = ReductionOperation::MAX; - last_kernel_op = ReductionOperation::MAX; - break; - default: - ARM_COMPUTE_ERROR("Not supported"); - } - - // Validate ReductionOperation only on first kernel - ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(input, &sums_vector[0], axis, first_kernel_op)); - - // Validate ReductionOperation on intermediate stages - for(unsigned int i = 1; i < num_of_stages - 1; ++i) - { - ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(&sums_vector[i - 1], &sums_vector[i], axis, intermediate_kernel_op)); - } - - // Validate ReductionOperation on the last stage - const unsigned int last_stage = num_of_stages - 1; - ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(&sums_vector[last_stage - 1], output_internal, axis, last_kernel_op, input->dimension(0))); - } - - if(is_reshape_required) + if (is_reshape_required) { - ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayerKernel::validate(output_internal, output)); + ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayer::validate(output_internal, output)); } return Status{}; @@ -160,199 +101,59 @@ Status CLReductionOperation::validate(const ITensorInfo *input, const ITensorInf ICLTensor *CLReductionOperation::configure_intermediate_result_vector(ICLTensor *input, ICLTensor *output) { - if(!_is_reshape_required && _is_serial) + if (!_is_reshape_required) { 
return output; } - auto intermediate_result_vector_size = _is_serial ? 1 : _num_of_stages; - - if(!_is_reshape_required) - { - --intermediate_result_vector_size; - } - - _results_vector.resize(intermediate_result_vector_size); auto shape = input->info()->tensor_shape(); - - shape.set(_reduction_axis, _is_serial ? 1 : ceil(shape.x() / 128.f)); - - for(auto &v : _results_vector) - { - if(&v == &_results_vector.back() && _is_reshape_required) - { - shape.set(_reduction_axis, 1); - } - v.allocator()->init(input->info()->clone()->set_tensor_shape(shape)); - } - - return _is_reshape_required ? &_results_vector.back() : output; + shape.set(_reduction_axis, 1); + _unreshaped_output.allocator()->init(input->info()->clone()->set_tensor_shape(shape)); + return &_unreshaped_output; } -void CLReductionOperation::configure(ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op, bool keep_dims) +void CLReductionOperation::configure( + ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op, bool keep_dims) { configure(CLKernelLibrary::get().get_compile_context(), input, output, axis, op, keep_dims); } -void CLReductionOperation::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op, bool keep_dims) +void CLReductionOperation::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output, + unsigned int axis, + ReductionOperation op, + bool keep_dims) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - _op = op; - _num_of_stages = calculate_number_of_stages_only_x_axis(input->info()->dimension(0), axis); + ARM_COMPUTE_LOG_PARAMS(input, output, axis, op, keep_dims); _reduction_axis = axis; - _is_serial = needs_serialized_reduction(op, input->info()->data_type(), axis); _is_reshape_required = !keep_dims; auto *output_internal = configure_intermediate_result_vector(input, output); - if(_is_reshape_required) + if (_is_reshape_required) { - const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis, false); - const auto output_data_type = input->info()->data_type(); - auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape).set_data_type(output_data_type).reset_padding().set_is_resizable(true)); - } - - // Configure reduction operation kernels - _reduction_kernels_vector.resize(_num_of_stages); + const TensorShape output_shape = + arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis, false); + const auto output_data_type = input->info()->data_type(); + auto_init_if_empty(*output->info(), input->info() + ->clone() + ->set_tensor_shape(output_shape) + .set_data_type(output_data_type) + .reset_padding() + .set_is_resizable(true)); - // Create temporary tensors - if(_is_serial) - { - if(_is_reshape_required) - { - _memory_group.manage(&_results_vector.back()); - } - - _reduction_kernels_vector[0].configure(compile_context, input, output_internal, axis, op, 0); + _memory_group.manage(&_unreshaped_output); } - else - { - _border_handlers_vector.resize(_num_of_stages); - _memory_group.manage(&_results_vector[0]); - ReductionOperation first_kernel_op; - ReductionOperation intermediate_kernel_op; - ReductionOperation last_kernel_op; - PixelValue pixelValue; - switch(op) - { - case ReductionOperation::SUM: - case ReductionOperation::MEAN_SUM: - first_kernel_op = ReductionOperation::SUM; - intermediate_kernel_op = 
ReductionOperation::SUM; - last_kernel_op = op; - pixelValue = PixelValue(); - break; - case ReductionOperation::SUM_SQUARE: - first_kernel_op = ReductionOperation::SUM_SQUARE; - intermediate_kernel_op = ReductionOperation::SUM; - last_kernel_op = ReductionOperation::SUM; - pixelValue = PixelValue(); - break; - case ReductionOperation::PROD: - first_kernel_op = ReductionOperation::PROD; - intermediate_kernel_op = ReductionOperation::PROD; - last_kernel_op = ReductionOperation::PROD; - pixelValue = PixelValue(1, input->info()->data_type()); - break; - case ReductionOperation::MIN: - first_kernel_op = ReductionOperation::MIN; - intermediate_kernel_op = ReductionOperation::MIN; - last_kernel_op = ReductionOperation::MIN; - switch(input->info()->data_type()) - { - case DataType::F32: - { - pixelValue = PixelValue(std::numeric_limits<float>::max()); - break; - } - case DataType::F16: - { - pixelValue = PixelValue(static_cast<half>(65504.0f)); - break; - } - case DataType::QASYMM8: - { - pixelValue = std::get<1>(get_min_max(input->info()->data_type())); - break; - } - case DataType::QASYMM8_SIGNED: - { - pixelValue = PixelValue(127, input->info()->data_type(), input->info()->quantization_info()); - break; - } - default: - { - ARM_COMPUTE_ERROR("Unsupported DataType"); - } - } - break; - case ReductionOperation::MAX: - first_kernel_op = ReductionOperation::MAX; - intermediate_kernel_op = ReductionOperation::MAX; - last_kernel_op = ReductionOperation::MAX; - switch(input->info()->data_type()) - { - case DataType::F32: - { - pixelValue = PixelValue(-std::numeric_limits<float>::max()); - break; - } - case DataType::F16: - { - pixelValue = PixelValue(static_cast<half>(-65504.0f)); - break; - } - case DataType::QASYMM8: - { - pixelValue = std::get<0>(get_min_max(input->info()->data_type())); - break; - } - case DataType::QASYMM8_SIGNED: - { - pixelValue = PixelValue(-128, input->info()->data_type(), input->info()->quantization_info()); - break; - } - default: - { - ARM_COMPUTE_ERROR("Unsupported DataType"); - } - } - break; - default: - ARM_COMPUTE_ERROR("Not supported"); - } + _reduction_kernel = std::make_unique<CLReductionOperationKernel>(); + _reduction_kernel->configure(compile_context, input, output_internal, axis, op); - _reduction_kernels_vector[0].configure(compile_context, input, &_results_vector[0], axis, first_kernel_op); - _border_handlers_vector[0].configure(compile_context, input, _reduction_kernels_vector[0].border_size(), BorderMode::CONSTANT, pixelValue); - - // Apply ReductionOperation on intermediate stages - for(unsigned int i = 1; i < _num_of_stages - 1; ++i) - { - _memory_group.manage(&_results_vector[i]); - _reduction_kernels_vector[i].configure(compile_context, &_results_vector[i - 1], &_results_vector[i], axis, intermediate_kernel_op); - _border_handlers_vector[i].configure(compile_context, &_results_vector[i - 1], _reduction_kernels_vector[i].border_size(), BorderMode::CONSTANT, pixelValue); - _results_vector[i - 1].allocator()->allocate(); - } - - // Apply ReductionOperation on the last stage - const unsigned int last_stage = _num_of_stages - 1; - const unsigned int input_width = input->info()->dimension(0); - - if(_is_reshape_required) - { - _memory_group.manage(&_results_vector.back()); - } - - _reduction_kernels_vector[last_stage].configure(compile_context, &_results_vector[last_stage - 1], output_internal, axis, last_kernel_op, input_width); - _border_handlers_vector[last_stage].configure(compile_context, &_results_vector[last_stage - 1], 
_reduction_kernels_vector[last_stage].border_size(), BorderMode::CONSTANT, pixelValue); - _results_vector[last_stage - 1].allocator()->allocate(); - } - - if(_is_reshape_required) + if (_is_reshape_required) { - _reshape_kernel.configure(compile_context, &_results_vector.back(), output); - _results_vector.back().allocator()->allocate(); + _reshape.configure(compile_context, &_unreshaped_output, output); + _unreshaped_output.allocator()->allocate(); } } @@ -360,22 +161,11 @@ void CLReductionOperation::run() { MemoryGroupResourceScope scope_mg(_memory_group); - if(_is_serial) - { - CLScheduler::get().enqueue(_reduction_kernels_vector[0], false); - } - else - { - for(unsigned int i = 0; i < _num_of_stages; ++i) - { - CLScheduler::get().enqueue(_border_handlers_vector[i], false); - CLScheduler::get().enqueue(_reduction_kernels_vector[i], false); - } - } + CLScheduler::get().enqueue(*_reduction_kernel, false); - if(_is_reshape_required) + if (_is_reshape_required) { - CLScheduler::get().enqueue(_reshape_kernel, false); + _reshape.run(); } } } // namespace arm_compute diff --git a/src/runtime/CL/functions/CLRemap.cpp b/src/runtime/CL/functions/CLRemap.cpp deleted file mode 100644 index af241ec299..0000000000 --- a/src/runtime/CL/functions/CLRemap.cpp +++ /dev/null @@ -1,57 +0,0 @@ -/* - * Copyright (c) 2017-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
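The CLReductionOperation rewrite that completes above replaces the old multi-stage x-axis pipeline (one reduction kernel plus one border handler per stage, chained through _results_vector) with a single CLReductionOperationKernel plus, only when keep_dims is false, one CLReshape step fed from a single _unreshaped_output temporary. A minimal usage sketch of the refactored function follows; the scheduler boilerplate, F32 data type and tensor shapes are illustrative assumptions, not taken from this commit:

#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLReductionOperation.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init(); // create the CL context/queue once per process

    CLTensor src, dst;
    src.allocator()->init(TensorInfo(TensorShape(256U, 16U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(1U, 16U), 1, DataType::F32));

    // One kernel now covers the whole x-axis reduction; keep_dims=true skips the reshape.
    CLReductionOperation reduce;
    reduce.configure(&src, &dst, 0U, ReductionOperation::SUM, true);

    src.allocator()->allocate();
    dst.allocator()->allocate();
    // ... map src, fill it with data, unmap ...

    reduce.run();
    CLScheduler::get().sync();
    return 0;
}

The later sketches in this section assume the same headers, namespace and scheduler setup (plus each function's own public header) and show only the function-specific calls.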
- */ -#include "arm_compute/runtime/CL/functions/CLRemap.h" - -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLRemapKernel.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/PixelValue.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" -#include "support/MemorySupport.h" - -#include <utility> - -using namespace arm_compute; - -void CLRemap::configure(ICLTensor *input, const ICLTensor *map_x, const ICLTensor *map_y, ICLTensor *output, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, map_x, map_y, output, policy, border_mode, constant_border_value); -} - -void CLRemap::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *map_x, const ICLTensor *map_y, ICLTensor *output, InterpolationPolicy policy, - BorderMode border_mode, - uint8_t constant_border_value) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(map_x, 1, DataType::F32); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(map_y, 1, DataType::F32); - ARM_COMPUTE_ERROR_ON_MSG(policy == InterpolationPolicy::AREA, "Area interpolation is not supported"); - - auto k = arm_compute::support::cpp14::make_unique<CLRemapKernel>(); - k->configure(compile_context, input, map_x, map_y, output, policy, border_mode == BorderMode::UNDEFINED); - _kernel = std::move(k); - _border_handler.configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); -} diff --git a/src/runtime/CL/functions/CLReorgLayer.cpp b/src/runtime/CL/functions/CLReorgLayer.cpp index ea9331414c..156e9b90c1 100644 --- a/src/runtime/CL/functions/CLReorgLayer.cpp +++ b/src/runtime/CL/functions/CLReorgLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 ARM Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,11 +24,12 @@ #include "arm_compute/runtime/CL/functions/CLReorgLayer.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLReorgLayerKernel.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" -#include "support/MemorySupport.h" + +#include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLReorgLayerKernel.h" #include <utility> @@ -39,9 +40,13 @@ void CLReorgLayer::configure(ICLTensor *input, ICLTensor *output, int32_t stride configure(CLKernelLibrary::get().get_compile_context(), input, output, stride); } -void CLReorgLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, int32_t stride) +void CLReorgLayer::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output, + int32_t stride) { - auto k = arm_compute::support::cpp14::make_unique<CLReorgLayerKernel>(); + ARM_COMPUTE_LOG_PARAMS(input, output, stride); + auto k = std::make_unique<CLReorgLayerKernel>(); k->configure(compile_context, input, output, stride); _kernel = std::move(k); } diff --git a/src/runtime/CL/functions/CLReshapeLayer.cpp b/src/runtime/CL/functions/CLReshapeLayer.cpp index 13baedb3f9..3d6349fb25 100644 --- a/src/runtime/CL/functions/CLReshapeLayer.cpp +++ b/src/runtime/CL/functions/CLReshapeLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 ARM Limited. 
+ * Copyright (c) 2017-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,12 +23,31 @@ */ #include "arm_compute/runtime/CL/functions/CLReshapeLayer.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLReshapeLayerKernel.h" -#include "support/MemorySupport.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" + +#include "src/core/CL/ICLKernel.h" +#include "src/gpu/cl/operators/ClReshape.h" /** [CLReshapeLayer snippet] **/ -using namespace arm_compute; +namespace arm_compute +{ +struct CLReshapeLayer::Impl +{ + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<opencl::ClReshape> op{nullptr}; +}; + +CLReshapeLayer::CLReshapeLayer() : _impl(std::make_unique<Impl>()) +{ +} + +CLReshapeLayer::CLReshapeLayer(CLReshapeLayer &&) = default; +CLReshapeLayer &CLReshapeLayer::operator=(CLReshapeLayer &&) = default; +CLReshapeLayer::~CLReshapeLayer() = default; void CLReshapeLayer::configure(const ICLTensor *input, ICLTensor *output) { @@ -37,13 +56,26 @@ void CLReshapeLayer::configure(const ICLTensor *input, ICLTensor *output) void CLReshapeLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output) { - auto k = arm_compute::support::cpp14::make_unique<CLReshapeLayerKernel>(); - k->configure(compile_context, input, output); - _kernel = std::move(k); + _impl->src = input; + _impl->dst = output; + _impl->op = std::make_unique<opencl::ClReshape>(); + _impl->op->configure(compile_context, input->info(), output->info()); } Status CLReshapeLayer::validate(const ITensorInfo *input, const ITensorInfo *output) { - return CLReshapeLayerKernel::validate(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_RETURN_ON_ERROR(opencl::ClReshape::validate(input, output)); + + return Status{}; } -/** [CLReshapeLayer snippet] **/ + +void CLReshapeLayer::run() +{ + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC, _impl->src); + pack.add_tensor(TensorType::ACL_DST, _impl->dst); + _impl->op->run(pack); +} +} // namespace arm_compute + /** [CLReshapeLayer snippet] **/ diff --git a/src/runtime/CL/functions/CLReverse.cpp b/src/runtime/CL/functions/CLReverse.cpp index 3c8bc15a54..a20be2335a 100644 --- a/src/runtime/CL/functions/CLReverse.cpp +++ b/src/runtime/CL/functions/CLReverse.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 ARM Limited. + * Copyright (c) 2018-2021, 2023 Arm Limited. 
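The CLReshapeLayer change above is the template for most migrations in this commit: the public function keeps a pimpl (Impl) holding the source/destination ICLTensors together with an opencl::ClReshape operator that is configured purely on ITensorInfo metadata, and run() binds the real tensors late through an ITensorPack. Nothing changes on the caller's side; a short sketch (shapes illustrative, boilerplate as in the reduction sketch above):

CLTensor src, dst;
src.allocator()->init(TensorInfo(TensorShape(4U, 4U), 1, DataType::F32));
dst.allocator()->init(TensorInfo(TensorShape(16U), 1, DataType::F32)); // same total element count

CLReshapeLayer reshape;
reshape.configure(&src, &dst); // configures ClReshape on the tensors' metadata only

src.allocator()->allocate();
dst.allocator()->allocate();
reshape.run(); // packs src/dst as ACL_SRC/ACL_DST and runs the operator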
* * SPDX-License-Identifier: MIT * @@ -23,26 +23,35 @@ */ #include "arm_compute/runtime/CL/functions/CLReverse.h" -#include "arm_compute/core/CL/kernels/CLReverseKernel.h" #include "arm_compute/core/Types.h" -#include "support/MemorySupport.h" + +#include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLReverseKernel.h" namespace arm_compute { -void CLReverse::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *axis) +void CLReverse::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *axis, bool use_inverted_axis) { - configure(CLKernelLibrary::get().get_compile_context(), input, output, axis); + configure(CLKernelLibrary::get().get_compile_context(), input, output, axis, use_inverted_axis); } -void CLReverse::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *axis) +void CLReverse::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const ICLTensor *axis, + bool use_inverted_axis) { - auto k = arm_compute::support::cpp14::make_unique<CLReverseKernel>(); - k->configure(compile_context, input, output, axis); + ARM_COMPUTE_LOG_PARAMS(input, output, axis); + auto k = std::make_unique<CLReverseKernel>(); + k->configure(compile_context, input, output, axis, use_inverted_axis); _kernel = std::move(k); } -Status CLReverse::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis) +Status CLReverse::validate(const ITensorInfo *input, + const ITensorInfo *output, + const ITensorInfo *axis, + bool use_inverted_axis) { - return CLReverseKernel::validate(input, output, axis); + return CLReverseKernel::validate(input, output, axis, use_inverted_axis); } } // namespace arm_compute diff --git a/src/runtime/CL/functions/CLScale.cpp b/src/runtime/CL/functions/CLScale.cpp index a9395bdc3d..abff0724e4 100644 --- a/src/runtime/CL/functions/CLScale.cpp +++ b/src/runtime/CL/functions/CLScale.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020 ARM Limited. + * Copyright (c) 2016-2021 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -23,45 +23,54 @@ */ #include "arm_compute/runtime/CL/functions/CLScale.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLScaleKernel.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/runtime/CL/CLScheduler.h" -#include "support/MemorySupport.h" +#include "arm_compute/core/KernelDescriptors.h" -using namespace arm_compute; +#include "src/core/CL/ICLKernel.h" +#include "src/gpu/cl/operators/ClScale.h" -void CLScale::configure(ICLTensor *input, ICLTensor *output, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, SamplingPolicy sampling_policy, bool use_padding, - bool align_corners) +namespace arm_compute { - configure(CLKernelLibrary::get().get_compile_context(), input, output, policy, border_mode, constant_border_value, sampling_policy, use_padding, align_corners); +struct CLScale::Impl +{ + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<opencl::ClScale> op{nullptr}; +}; + +CLScale::CLScale() : _impl(std::make_unique<Impl>()) +{ +} +CLScale::~CLScale() = default; + +void CLScale::configure(ICLTensor *input, ICLTensor *output, const ScaleKernelInfo &info) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, output, info); } -void CLScale::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, - SamplingPolicy sampling_policy, bool use_padding, bool align_corners) +void CLScale::configure(const CLCompileContext &compile_context, + ICLTensor *input, + ICLTensor *output, + const ScaleKernelInfo &info) { - ARM_COMPUTE_UNUSED(use_padding); - auto k = arm_compute::support::cpp14::make_unique<CLScaleKernel>(); - k->set_target(CLScheduler::get().target()); - k->configure(compile_context, input, output, policy, border_mode, sampling_policy, align_corners); - _kernel = std::move(k); + _impl->src = input; + _impl->dst = output; - // Tune kernels - CLScheduler::get().tune_kernel_static(*_kernel); + _impl->op = std::make_unique<opencl::ClScale>(); + _impl->op->configure(compile_context, input->info(), output->info(), info); +} - // In the case of NHWC we can't have undefined border mode as this would require to access elements outside z dimension, - // so we treat it like border constant. 
- if(border_mode == BorderMode::UNDEFINED && input->info()->data_layout() == DataLayout::NHWC) - { - border_mode = BorderMode::CONSTANT; - } - _border_handler.configure(compile_context, input, _kernel->border_size(), border_mode, constant_border_value); +Status CLScale::validate(const ITensorInfo *input, const ITensorInfo *output, const ScaleKernelInfo &info) +{ + return opencl::ClScale::validate(input, output, info); } -Status CLScale::validate(const ITensorInfo *input, const ITensorInfo *output, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, SamplingPolicy sampling_policy, - bool use_padding, bool align_corners) +void CLScale::run() { - ARM_COMPUTE_UNUSED(constant_border_value, use_padding); - return CLScaleKernel::validate(input, output, policy, border_mode, sampling_policy, align_corners); + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC, _impl->src); + pack.add_tensor(TensorType::ACL_DST, _impl->dst); + _impl->op->run(pack); } +} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLScatter.cpp b/src/runtime/CL/functions/CLScatter.cpp new file mode 100644 index 0000000000..e16fcc4ccc --- /dev/null +++ b/src/runtime/CL/functions/CLScatter.cpp @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/runtime/CL/functions/CLScatter.h" + +#include "arm_compute/function_info/ScatterInfo.h" +#include "arm_compute/runtime/CL/CLTensor.h" + +#include "src/gpu/cl/operators/ClScatter.h" + +namespace arm_compute +{ +using OperatorType = opencl::ClScatter; + +struct CLScatter::Impl +{ + std::unique_ptr<OperatorType> op{nullptr}; + ITensorPack run_pack{}; +}; + +CLScatter::CLScatter() : _impl(std::make_unique<Impl>()) +{ +} + +CLScatter::~CLScatter() = default; + +void CLScatter::configure(const ICLTensor *src, + const ICLTensor *updates, + const ICLTensor *indices, + ICLTensor *output, + const ScatterInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + configure(CLKernelLibrary::get().get_compile_context(), src, updates, indices, output, info); +} + +void CLScatter::configure(const CLCompileContext &compile_context, + const ICLTensor *src, + const ICLTensor *updates, + const ICLTensor *indices, + ICLTensor *output, + const ScatterInfo &info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(updates, indices, output); + + _impl->op = std::make_unique<OperatorType>(); + if (src) + { // Src not nullptr. 
+ _impl->op->configure(compile_context, src->info(), updates->info(), indices->info(), output->info(), info); + } + else + { + _impl->op->configure(compile_context, nullptr, updates->info(), indices->info(), output->info(), info); + } + _impl->run_pack = {{ACL_SRC_0, src}, {ACL_SRC_1, updates}, {ACL_SRC_2, indices}, {ACL_DST, output}}; +} + +Status CLScatter::validate(const ITensorInfo *src, + const ITensorInfo *updates, + const ITensorInfo *indices, + const ITensorInfo *output, + const ScatterInfo &info) +{ + return OperatorType::validate(src, updates, indices, output, info); +} + +void CLScatter::run() +{ + _impl->op->run(_impl->run_pack); +} + +} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLScharr3x3.cpp b/src/runtime/CL/functions/CLScharr3x3.cpp deleted file mode 100644 index faad5424a2..0000000000 --- a/src/runtime/CL/functions/CLScharr3x3.cpp +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Copyright (c) 2016-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/CL/functions/CLScharr3x3.h" - -#include "arm_compute/core/CL/kernels/CLScharr3x3Kernel.h" -#include "arm_compute/core/PixelValue.h" -#include "support/MemorySupport.h" - -#include <utility> - -using namespace arm_compute; - -void CLScharr3x3::configure(ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output_x, output_y, border_mode, constant_border_value); -} - -void CLScharr3x3::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value) -{ - auto k = arm_compute::support::cpp14::make_unique<CLScharr3x3Kernel>(); - k->configure(compile_context, input, output_x, output_y, border_mode == BorderMode::UNDEFINED); - _kernel = std::move(k); - _border_handler.configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); -} diff --git a/src/runtime/CL/functions/CLSelect.cpp b/src/runtime/CL/functions/CLSelect.cpp index 7187010448..b4897d9e62 100644 --- a/src/runtime/CL/functions/CLSelect.cpp +++ b/src/runtime/CL/functions/CLSelect.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 ARM Limited. + * Copyright (c) 2018-2021 Arm Limited. 
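Two changes finish just above. CLScale drops the border-mode and use_padding plumbing; interpolation, border and sampling options now travel in one ScaleKernelInfo descriptor (from arm_compute/core/KernelDescriptors.h) handed to opencl::ClScale. And the newly added CLScatter accepts a null src, configuring the operator without source metadata; whether the output is zero-initialised instead is governed by ScatterInfo. Hedged sketches: the BILINEAR/REPLICATE and ScatterFunction::Update choices and all shapes are illustrative assumptions:

// Upscale 64x64 -> 128x128, options carried by the ScaleKernelInfo descriptor.
CLTensor in, out;
in.allocator()->init(TensorInfo(TensorShape(64U, 64U), 1, DataType::F32));
out.allocator()->init(TensorInfo(TensorShape(128U, 128U), 1, DataType::F32));

CLScale scale;
scale.configure(&in, &out, ScaleKernelInfo(InterpolationPolicy::BILINEAR, BorderMode::REPLICATE));

// Scatter two scalar updates into a 6-element tensor (shapes are an assumption here).
CLTensor src, updates, indices, dst;
src.allocator()->init(TensorInfo(TensorShape(6U), 1, DataType::F32));
updates.allocator()->init(TensorInfo(TensorShape(2U), 1, DataType::F32));
indices.allocator()->init(TensorInfo(TensorShape(1U, 2U), 1, DataType::S32));
dst.allocator()->init(TensorInfo(TensorShape(6U), 1, DataType::F32));

CLScatter scatter;
scatter.configure(&src, &updates, &indices, &dst, ScatterInfo(ScatterFunction::Update, false));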
* * SPDX-License-Identifier: MIT * @@ -23,10 +23,12 @@ */ #include "arm_compute/runtime/CL/functions/CLSelect.h" -#include "arm_compute/core/CL/kernels/CLSelectKernel.h" #include "arm_compute/core/Types.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLSelectKernel.h" + using namespace arm_compute; namespace arm_compute @@ -36,9 +38,14 @@ void CLSelect::configure(const ICLTensor *c, const ICLTensor *x, const ICLTensor configure(CLKernelLibrary::get().get_compile_context(), c, x, y, output); } -void CLSelect::configure(const CLCompileContext &compile_context, const ICLTensor *c, const ICLTensor *x, const ICLTensor *y, ICLTensor *output) +void CLSelect::configure(const CLCompileContext &compile_context, + const ICLTensor *c, + const ICLTensor *x, + const ICLTensor *y, + ICLTensor *output) { - auto k = arm_compute::support::cpp14::make_unique<CLSelectKernel>(); + ARM_COMPUTE_LOG_PARAMS(c, x, y, output); + auto k = std::make_unique<CLSelectKernel>(); k->configure(compile_context, c, x, y, output); _kernel = std::move(k); } diff --git a/src/runtime/CL/functions/CLSlice.cpp b/src/runtime/CL/functions/CLSlice.cpp index e8cc0f5499..f79c6a1235 100644 --- a/src/runtime/CL/functions/CLSlice.cpp +++ b/src/runtime/CL/functions/CLSlice.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 ARM Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,43 +24,95 @@ #include "arm_compute/runtime/CL/functions/CLSlice.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLStridedSliceKernel.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/helpers/tensor_transform.h" -#include "support/MemorySupport.h" + +#include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLStridedSliceKernel.h" namespace arm_compute { -void CLSlice::configure(const ICLTensor *input, ICLTensor *output, const Coordinates &starts, const Coordinates &ends) +namespace experimental { - configure(CLKernelLibrary::get().get_compile_context(), input, output, starts, ends); -} - -void CLSlice::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const Coordinates &starts, const Coordinates &ends) +void CLSlice::configure(const CLCompileContext &compile_context, + const ITensorInfo *input, + ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends) { ARM_COMPUTE_ERROR_ON_NULLPTR(input); + ARM_COMPUTE_LOG_PARAMS(input, output, starts, ends); // Get absolute end coordinates const int32_t slice_end_mask = arm_compute::helpers::tensor_transform::construct_slice_end_mask(ends); - auto k = arm_compute::support::cpp14::make_unique<CLStridedSliceKernel>(); + auto k = std::make_unique<CLStridedSliceKernel>(); k->configure(compile_context, input, output, starts, ends, BiStrides(), 0, slice_end_mask, 0); _kernel = std::move(k); } -Status CLSlice::validate(const ITensorInfo *input, const ITensorInfo *output, const Coordinates &starts, const Coordinates &ends) +Status CLSlice::validate(const ITensorInfo *input, + const ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input); // Check start dimensions for being non-negative - ARM_COMPUTE_RETURN_ERROR_ON(std::any_of(starts.cbegin(), starts.cbegin() + starts.num_dimensions(), [](int i) - { - return i < 0; - })); + ARM_COMPUTE_RETURN_ERROR_ON( + std::any_of(starts.cbegin(), starts.cbegin() + starts.num_dimensions(), [](int i) { 
return i < 0; })); // Get absolute end coordinates const int32_t slice_end_mask = arm_compute::helpers::tensor_transform::construct_slice_end_mask(ends); return CLStridedSliceKernel::validate(input, output, starts, ends, BiStrides(), 0, slice_end_mask, 0); } +} // namespace experimental + +struct CLSlice::Impl +{ + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<experimental::CLSlice> op{nullptr}; +}; + +CLSlice::CLSlice() : _impl(std::make_unique<Impl>()) +{ +} +CLSlice::CLSlice(CLSlice &&) = default; +CLSlice &CLSlice::operator=(CLSlice &&) = default; +CLSlice::~CLSlice() = default; + +Status CLSlice::validate(const ITensorInfo *input, + const ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends) +{ + return experimental::CLSlice::validate(input, output, starts, ends); +} + +void CLSlice::configure(const ICLTensor *input, ICLTensor *output, const Coordinates &starts, const Coordinates &ends) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, output, starts, ends); +} + +void CLSlice::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const Coordinates &starts, + const Coordinates &ends) +{ + _impl->src = input; + _impl->dst = output; + _impl->op = std::make_unique<experimental::CLSlice>(); + _impl->op->configure(compile_context, input->info(), output->info(), starts, ends); +} + +void CLSlice::run() +{ + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC, _impl->src); + pack.add_tensor(TensorType::ACL_DST, _impl->dst); + _impl->op->run(pack); +} } // namespace arm_compute diff --git a/src/runtime/CL/functions/CLSobel3x3.cpp b/src/runtime/CL/functions/CLSobel3x3.cpp deleted file mode 100644 index c3604f970f..0000000000 --- a/src/runtime/CL/functions/CLSobel3x3.cpp +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Copyright (c) 2016-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
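CLSlice above now exists in two layers: a stateless experimental::CLSlice that validates and configures on ITensorInfo, expressing the slice as a strided slice with default (unit) strides plus a computed end mask, and the public CLSlice that stores the tensors in an Impl and feeds them to the operator in run(). The public call is unchanged; sketch (coordinates illustrative, end coordinates treated as exclusive):

CLTensor src, dst;
src.allocator()->init(TensorInfo(TensorShape(8U, 8U), 1, DataType::F32));
dst.allocator()->init(TensorInfo(TensorShape(4U, 4U), 1, DataType::F32));

CLSlice slice;
slice.configure(&src, &dst, Coordinates(2, 2), Coordinates(6, 6)); // rows/cols [2, 6)

src.allocator()->allocate();
dst.allocator()->allocate();
slice.run();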
- */ -#include "arm_compute/runtime/CL/functions/CLSobel3x3.h" - -#include "arm_compute/core/CL/kernels/CLSobel3x3Kernel.h" -#include "arm_compute/core/PixelValue.h" -#include "support/MemorySupport.h" - -#include <utility> - -using namespace arm_compute; - -void CLSobel3x3::configure(ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output_x, output_y, border_mode, constant_border_value); -} - -void CLSobel3x3::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value) -{ - auto k = arm_compute::support::cpp14::make_unique<CLSobel3x3Kernel>(); - k->configure(compile_context, input, output_x, output_y, border_mode == BorderMode::UNDEFINED); - _kernel = std::move(k); - _border_handler.configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); -} diff --git a/src/runtime/CL/functions/CLSobel5x5.cpp b/src/runtime/CL/functions/CLSobel5x5.cpp deleted file mode 100644 index f8a33f3fb6..0000000000 --- a/src/runtime/CL/functions/CLSobel5x5.cpp +++ /dev/null @@ -1,93 +0,0 @@ -/* - * Copyright (c) 2016-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "arm_compute/runtime/CL/functions/CLSobel5x5.h" - -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLSobel5x5Kernel.h" -#include "arm_compute/core/PixelValue.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/runtime/CL/CLScheduler.h" -#include "arm_compute/runtime/ITensorAllocator.h" - -using namespace arm_compute; - -CLSobel5x5::CLSobel5x5(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _sobel_hor(), _sobel_vert(), _border_handler(), _tmp_x(), _tmp_y() -{ -} - -void CLSobel5x5::configure(ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output_x, output_y, border_mode, constant_border_value); -} - -void CLSobel5x5::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); - - const bool run_sobel_x = output_x != nullptr; - const bool run_sobel_y = output_y != nullptr; - - TensorInfo tensor_info(input->info()->tensor_shape(), 1, DataType::S16); - - if(run_sobel_x && run_sobel_y) - { - _tmp_x.allocator()->init(tensor_info); - _tmp_y.allocator()->init(tensor_info); - _memory_group.manage(&_tmp_x); - _memory_group.manage(&_tmp_y); - _sobel_hor.configure(compile_context, input, &_tmp_x, &_tmp_y, border_mode == BorderMode::UNDEFINED); - _sobel_vert.configure(compile_context, &_tmp_x, &_tmp_y, output_x, output_y, border_mode == BorderMode::UNDEFINED); - _tmp_x.allocator()->allocate(); - _tmp_y.allocator()->allocate(); - } - else if(run_sobel_x) - { - _tmp_x.allocator()->init(tensor_info); - _memory_group.manage(&_tmp_x); - _sobel_hor.configure(compile_context, input, &_tmp_x, nullptr, border_mode == BorderMode::UNDEFINED); - _sobel_vert.configure(compile_context, &_tmp_x, nullptr, output_x, nullptr, border_mode == BorderMode::UNDEFINED); - _tmp_x.allocator()->allocate(); - } - else if(run_sobel_y) - { - _tmp_y.allocator()->init(tensor_info); - _memory_group.manage(&_tmp_y); - _sobel_hor.configure(compile_context, input, nullptr, &_tmp_y, border_mode == BorderMode::UNDEFINED); - _sobel_vert.configure(compile_context, nullptr, &_tmp_y, nullptr, output_y, border_mode == BorderMode::UNDEFINED); - _tmp_y.allocator()->allocate(); - } - _border_handler.configure(compile_context, input, _sobel_hor.border_size(), border_mode, PixelValue(constant_border_value)); -} - -void CLSobel5x5::run() -{ - CLScheduler::get().enqueue(_border_handler, false); - - MemoryGroupResourceScope scope_mg(_memory_group); - - CLScheduler::get().enqueue(_sobel_hor, false); - CLScheduler::get().enqueue(_sobel_vert); -} diff --git a/src/runtime/CL/functions/CLSobel7x7.cpp b/src/runtime/CL/functions/CLSobel7x7.cpp deleted file mode 100644 index 6d3c7f0d08..0000000000 --- a/src/runtime/CL/functions/CLSobel7x7.cpp +++ /dev/null @@ -1,93 +0,0 @@ -/* - * Copyright (c) 2016-2020 ARM Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/CL/functions/CLSobel7x7.h" - -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLSobel7x7Kernel.h" -#include "arm_compute/core/PixelValue.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/runtime/CL/CLScheduler.h" -#include "arm_compute/runtime/ITensorAllocator.h" - -using namespace arm_compute; - -CLSobel7x7::CLSobel7x7(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _sobel_hor(), _sobel_vert(), _border_handler(), _tmp_x(), _tmp_y() -{ -} - -void CLSobel7x7::configure(ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output_x, output_y, border_mode, constant_border_value); -} - -void CLSobel7x7::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); - - const bool run_sobel_x = output_x != nullptr; - const bool run_sobel_y = output_y != nullptr; - - TensorInfo tensor_info(input->info()->tensor_shape(), 1, DataType::S32); - - if(run_sobel_x && run_sobel_y) - { - _tmp_x.allocator()->init(tensor_info); - _tmp_y.allocator()->init(tensor_info); - _memory_group.manage(&_tmp_x); - _memory_group.manage(&_tmp_y); - _sobel_hor.configure(compile_context, input, &_tmp_x, &_tmp_y, border_mode == BorderMode::UNDEFINED); - _sobel_vert.configure(compile_context, &_tmp_x, &_tmp_y, output_x, output_y, border_mode == BorderMode::UNDEFINED); - _tmp_x.allocator()->allocate(); - _tmp_y.allocator()->allocate(); - } - else if(run_sobel_x) - { - _tmp_x.allocator()->init(tensor_info); - _memory_group.manage(&_tmp_x); - _sobel_hor.configure(compile_context, input, &_tmp_x, nullptr, border_mode == BorderMode::UNDEFINED); - _sobel_vert.configure(compile_context, &_tmp_x, nullptr, output_x, nullptr, border_mode == BorderMode::UNDEFINED); - _tmp_x.allocator()->allocate(); - } - else if(run_sobel_y) - { - _tmp_y.allocator()->init(tensor_info); - _memory_group.manage(&_tmp_y); - _sobel_hor.configure(compile_context, input, nullptr, &_tmp_y, border_mode == BorderMode::UNDEFINED); - _sobel_vert.configure(compile_context, nullptr, 
&_tmp_y, nullptr, output_y, border_mode == BorderMode::UNDEFINED); - _tmp_y.allocator()->allocate(); - } - _border_handler.configure(compile_context, input, _sobel_hor.border_size(), border_mode, PixelValue(constant_border_value)); -} - -void CLSobel7x7::run() -{ - CLScheduler::get().enqueue(_border_handler, false); - - MemoryGroupResourceScope scope_mg(_memory_group); - - CLScheduler::get().enqueue(_sobel_hor, false); - CLScheduler::get().enqueue(_sobel_vert); -} diff --git a/src/runtime/CL/functions/CLSoftmaxLayer.cpp b/src/runtime/CL/functions/CLSoftmaxLayer.cpp index b0b2117cd9..2e70e2aa08 100644 --- a/src/runtime/CL/functions/CLSoftmaxLayer.cpp +++ b/src/runtime/CL/functions/CLSoftmaxLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 ARM Limited. + * Copyright (c) 2017-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,213 +24,78 @@ #include "arm_compute/runtime/CL/functions/CLSoftmaxLayer.h" #include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/ICLKernel.h" -#include "arm_compute/core/CL/kernels/CLSoftmaxLayerKernel.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/Helpers.h" +#include "arm_compute/core/KernelDescriptors.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "arm_compute/runtime/CL/CLScheduler.h" + +#include "src/core/helpers/MemoryHelpers.h" +#include "src/gpu/cl/kernels/ClSoftmaxKernel.h" +#include "src/gpu/cl/operators/ClPermute.h" +#include "src/gpu/cl/operators/ClSoftmax.h" namespace arm_compute { +using OperatorType = opencl::ClSoftmax; + template <bool IS_LOG> -CLSoftmaxLayerGeneric<IS_LOG>::CLSoftmaxLayerGeneric(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _max_shift_exp_sum_kernel(), _norm_kernel(), _flatten_kernel_ptr(), _reshape_kernel(), _max(), _sum(), _tmp(), _input_flattened(), _output_flattened(), - _needs_flattening(false) +struct CLSoftmaxLayerGeneric<IS_LOG>::Impl { -} + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<OperatorType> op{nullptr}; + MemoryGroup memory_group{}; + ITensorPack run_pack{}; + WorkspaceData<CLTensor> workspace_tensors{}; +}; template <bool IS_LOG> -void CLSoftmaxLayerGeneric<IS_LOG>::configure_reshape_input_kernel(const ICLTensor *input, const ICLTensor *output, size_t axis) +CLSoftmaxLayerGeneric<IS_LOG>::CLSoftmaxLayerGeneric(std::shared_ptr<IMemoryManager> memory_manager) + : _impl(std::make_unique<Impl>()) { - configure_reshape_input_kernel(CLKernelLibrary::get().get_compile_context(), input, output, axis); + _impl->memory_group = MemoryGroup(std::move(memory_manager)); } template <bool IS_LOG> -void CLSoftmaxLayerGeneric<IS_LOG>::configure_reshape_input_kernel(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *output, size_t axis) -{ - // Flatten the input - const TensorShape shape_flatten = misc::shape_calculator::compute_softmax_shape(input->info(), axis); - - // Initialize the flat input - _input_flattened.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_flatten)); - - // If we need to flatten the input, we can use CLFlattenKernel or CLReshapeKernel - // If flattening on the third axes, we use CLFlattenKernel. 
- // In all other cases we have to use CLReshapeKernel - if(axis != 3) - { - auto reshape_kernel_ptr = support::cpp14::make_unique<CLReshapeLayerKernel>(); - reshape_kernel_ptr->configure(compile_context, input, &_input_flattened); - _flatten_kernel_ptr = std::move(reshape_kernel_ptr); - } - else - { - auto flatten_kernel_ptr = support::cpp14::make_unique<CLFlattenLayerKernel>(); - flatten_kernel_ptr->configure(compile_context, input, &_input_flattened); - _flatten_kernel_ptr = std::move(flatten_kernel_ptr); - } - - // We need to init the output tensor here. Indeed, the reshape kernel expects - // both tensors to be already initialized - auto_init_if_empty(*output->info(), *input->info()->clone()); -} +CLSoftmaxLayerGeneric<IS_LOG>::~CLSoftmaxLayerGeneric() = default; template <bool IS_LOG> -void CLSoftmaxLayerGeneric<IS_LOG>::configure(const ICLTensor *input, ICLTensor *output, float beta, size_t axis) +void CLSoftmaxLayerGeneric<IS_LOG>::configure(const ICLTensor *input, ICLTensor *output, float beta, int32_t axis) { configure(CLKernelLibrary::get().get_compile_context(), input, output, beta, axis); } template <bool IS_LOG> -void CLSoftmaxLayerGeneric<IS_LOG>::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, float beta, size_t axis) +void CLSoftmaxLayerGeneric<IS_LOG>::configure( + const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, float beta, int32_t axis) { - // Perform validation step - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_ERROR_THROW_ON(CLSoftmaxLayerGeneric<IS_LOG>::validate(input->info(), output->info(), beta, axis)); - - // We don't need flattening only in the case the input is 2D and axis is 1 - _needs_flattening = axis != 1; - - // If we are dealing with a 4D tensor, we will: - // - Flatten the input, so that we end up with a [width*height*depth] * batches 2D tensor - // - Execute all the pipeline (reduction + normalization) on the flattened tensor - // - Reshape the flattened output into the real output - if(_needs_flattening) - { - // Add to the memory manager _input_flattened - _memory_group.manage(&_input_flattened); - - // Cofigure _flatten_kernel and _input_flattened - configure_reshape_input_kernel(input, output, axis); - } - - // We want to deal with a 2D input. Either it is the flattened version of the original input (4D case) - // or it is the original input case (2D case) - const ICLTensor *input_2D = (_needs_flattening ? &_input_flattened : input); - - // Create intermediate tensors shapes - TensorInfo input_info = input_2D->info()->clone()->reset_padding().set_is_resizable(true); - DataType tmp_data_type = is_data_type_quantized_asymmetric(input_2D->info()->data_type()) ? 
DataType::S32 : input_2D->info()->data_type(); - TensorInfo tensor_info_tmp(input_info.clone()->set_data_type(tmp_data_type)); - _tmp.allocator()->init(tensor_info_tmp); - - TensorShape max_sum_shape = input_2D->info()->tensor_shape(); - max_sum_shape.set(0, 1); - _max.allocator()->init(input_info.clone()->set_tensor_shape(max_sum_shape)); - _sum.allocator()->init(input_info.clone()->set_tensor_shape(max_sum_shape).set_data_type(tmp_data_type)); - - // Set GPU target to kernels - _max_shift_exp_sum_kernel.set_target(CLScheduler::get().target()); - - // Manage intermediate buffers - _memory_group.manage(&_tmp); - _memory_group.manage(&_max); - _memory_group.manage(&_sum); + _impl->src = input; + _impl->dst = output; + _impl->op = std::make_unique<OperatorType>(); - SoftmaxKernelInfo softmax_info; - softmax_info.beta = beta; - softmax_info.is_log = IS_LOG; - softmax_info.input_data_type = input_2D->info()->data_type(); + SoftmaxKernelInfo softmax_info{beta, IS_LOG, input->info()->data_type(), axis}; + _impl->op->configure(compile_context, *input->info(), *output->info(), softmax_info); - // Configure kernels - _max_shift_exp_sum_kernel.configure(compile_context, input_2D, &_max, &_tmp, &_sum, softmax_info); - - if(_needs_flattening) - { - // Add to the memory manager _output_flattened - _memory_group.manage(&_output_flattened); - - // The normalization kernel stores the result in a flat output tensor - _norm_kernel.configure(compile_context, &_tmp, &_sum, &_output_flattened, softmax_info); - - // Reshape the flat output into a the requested (4D) output - _reshape_kernel.configure(compile_context, &_output_flattened, output); - - // Allocate the intermediate flat tensors - _input_flattened.allocator()->allocate(); - _output_flattened.allocator()->allocate(); - } - else - { - // Softmax 2D case - _norm_kernel.configure(compile_context, &_tmp, &_sum, output, softmax_info); - } - - // Allocate intermediate buffers - _tmp.allocator()->allocate(); - _max.allocator()->allocate(); - _sum.allocator()->allocate(); + _impl->run_pack = {{TensorType::ACL_SRC, _impl->src}, {TensorType::ACL_DST, _impl->dst}}; + _impl->workspace_tensors = manage_workspace<CLTensor>(_impl->op->workspace(), _impl->memory_group, _impl->run_pack); } template <bool IS_LOG> -Status CLSoftmaxLayerGeneric<IS_LOG>::validate(const ITensorInfo *input, const ITensorInfo *output, float beta, size_t axis) +Status +CLSoftmaxLayerGeneric<IS_LOG>::validate(const ITensorInfo *input, const ITensorInfo *output, float beta, int32_t axis) { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() > 4, "Only up to 4 dimensions are supported"); - ARM_COMPUTE_UNUSED(beta); - - // Create intermediate tensor info - DataType tmp_data_type = is_data_type_quantized_asymmetric(input->data_type()) ? 
DataType::S32 : input->data_type(); - TensorInfo tensor_info_tmp(input->clone()->set_data_type(tmp_data_type).set_is_resizable(true)); - - TensorShape max_sum_shape = input->tensor_shape(); - max_sum_shape.set(0, 1); - TensorInfo tensor_info_max(input->clone()->set_tensor_shape(max_sum_shape).set_is_resizable(true)); - TensorInfo tensor_info_sum(input->clone()->set_tensor_shape(max_sum_shape).set_data_type(tmp_data_type).set_quantization_info(QuantizationInfo()).set_is_resizable(true)); - - const bool needs_flattening = (axis != 1); - - if(needs_flattening) - { - const TensorShape shape_flatten = misc::shape_calculator::compute_softmax_shape(input, axis); - TensorInfo tensor_info_flat(input->clone()->set_tensor_shape(shape_flatten).set_is_resizable(true)); - - if(axis != 3) - { - ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayerKernel::validate(input, &tensor_info_flat)); - } - else - { - ARM_COMPUTE_RETURN_ON_ERROR(CLFlattenLayerKernel::validate(input, &tensor_info_flat)); - } - } - - SoftmaxKernelInfo softmax_info; - softmax_info.beta = beta; - softmax_info.is_log = IS_LOG; - softmax_info.input_data_type = input->data_type(); - - ARM_COMPUTE_RETURN_ON_ERROR(CLLogits1DMaxShiftExpSumKernel::validate(input, &tensor_info_max, &tensor_info_tmp, &tensor_info_sum)); - ARM_COMPUTE_RETURN_ON_ERROR(CLLogits1DNormKernel::validate(&tensor_info_tmp, &tensor_info_sum, output, softmax_info)); - - if(needs_flattening) - { - const TensorShape shape_flatten = misc::shape_calculator::compute_softmax_shape(input); - TensorInfo tensor_info_flat(input->clone()->set_tensor_shape(shape_flatten).set_is_resizable(true)); - } - - return Status{}; + SoftmaxKernelInfo softmax_info{beta, IS_LOG, input->data_type(), axis}; + return OperatorType::validate(*input, *output, softmax_info); } template <bool IS_LOG> -void CLSoftmaxLayerGeneric<IS_LOG>::run() +void CLSoftmaxLayerGeneric<IS_LOG>::run() { - MemoryGroupResourceScope scope_mg(_memory_group); - - if(_needs_flattening) - { - CLScheduler::get().enqueue(*_flatten_kernel_ptr, false); - } - - CLScheduler::get().enqueue(_max_shift_exp_sum_kernel, false); - CLScheduler::get().enqueue(_norm_kernel, !_needs_flattening); - - if(_needs_flattening) - { - CLScheduler::get().enqueue(_reshape_kernel, true); - } + // Acquire all the temporaries + MemoryGroupResourceScope scope_mg(_impl->memory_group); + ARM_COMPUTE_ERROR_ON_NULLPTR(_impl->src, _impl->dst); + _impl->op->run(_impl->run_pack); } template class CLSoftmaxLayerGeneric<false>; diff --git a/src/runtime/CL/functions/CLSpaceToBatchLayer.cpp b/src/runtime/CL/functions/CLSpaceToBatchLayer.cpp index 021d31649d..37f728895f 100644 --- a/src/runtime/CL/functions/CLSpaceToBatchLayer.cpp +++ b/src/runtime/CL/functions/CLSpaceToBatchLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 ARM Limited. + * Copyright (c) 2018-2021 Arm Limited. 
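The softmax migration ending above pushes all staging (the optional flatten/permute, max-shift-exp-sum and normalisation passes) into opencl::ClSoftmax; the function merely keeps an ITensorPack plus a WorkspaceData that parks the operator's requested scratch tensors in its MemoryGroup. Note the axis parameter also changes type from size_t to int32_t, so a signed (wrappable) axis can be expressed. Minimal sketch (beta and axis values illustrative):

CLTensor src, dst;
src.allocator()->init(TensorInfo(TensorShape(10U, 4U), 1, DataType::F32));
dst.allocator()->init(TensorInfo(TensorShape(10U, 4U), 1, DataType::F32));

CLSoftmaxLayer softmax; // CLSoftmaxLayerGeneric<false>; the <true> instantiation is log-softmax
softmax.configure(&src, &dst, /* beta */ 1.0f, /* axis */ 0);

src.allocator()->allocate();
dst.allocator()->allocate();
softmax.run(); // workspace tensors are acquired only for the duration of run()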
* * SPDX-License-Identifier: MIT * @@ -30,61 +30,99 @@ #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLSpaceToBatchLayerKernel.h" + namespace arm_compute { CLSpaceToBatchLayer::CLSpaceToBatchLayer() - : _space_to_batch_kernel(), _memset_kernel(), _has_padding(false) + : _space_to_batch_kernel(std::make_unique<CLSpaceToBatchLayerKernel>()), _fill(), _has_padding(false) { } -void CLSpaceToBatchLayer::configure(const ICLTensor *input, const ICLTensor *block_shape, const ICLTensor *paddings, ICLTensor *output) +CLSpaceToBatchLayer::~CLSpaceToBatchLayer() = default; + +void CLSpaceToBatchLayer::configure(const ICLTensor *input, + const ICLTensor *block_shape, + const ICLTensor *paddings, + ICLTensor *output) { configure(CLKernelLibrary::get().get_compile_context(), input, block_shape, paddings, output); } -void CLSpaceToBatchLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *block_shape, const ICLTensor *paddings, ICLTensor *output) +void CLSpaceToBatchLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const ICLTensor *block_shape, + const ICLTensor *paddings, + ICLTensor *output) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, block_shape, paddings, output); + ARM_COMPUTE_LOG_PARAMS(input, block_shape, paddings, output); - if(input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size()) + if (input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size()) { _has_padding = true; - _memset_kernel.configure(compile_context, output, PixelValue(0, input->info()->data_type(), input->info()->quantization_info())); + _fill.configure(compile_context, output, + PixelValue(0, input->info()->data_type(), input->info()->quantization_info())); } - _space_to_batch_kernel.configure(compile_context, input, block_shape, paddings, output); + _space_to_batch_kernel->configure(compile_context, input, block_shape, paddings, output); } -void CLSpaceToBatchLayer::configure(const ICLTensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, ICLTensor *output) +void CLSpaceToBatchLayer::configure(const ICLTensor *input, + const int block_shape_x, + const int block_shape_y, + const Size2D &padding_left, + const Size2D &padding_right, + ICLTensor *output) { - configure(CLKernelLibrary::get().get_compile_context(), input, block_shape_x, block_shape_y, padding_left, padding_right, output); + configure(CLKernelLibrary::get().get_compile_context(), input, block_shape_x, block_shape_y, padding_left, + padding_right, output); } -void CLSpaceToBatchLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, - const Size2D &padding_right, ICLTensor *output) +void CLSpaceToBatchLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const int block_shape_x, + const int block_shape_y, + const Size2D &padding_left, + const Size2D &padding_right, + ICLTensor *output) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_LOG_PARAMS(input, block_shape_x, block_shape_y, padding_left, padding_right, output); - if(input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size()) + if (input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size()) { 
_has_padding = true;
-        _memset_kernel.configure(compile_context, output, PixelValue(0, input->info()->data_type(), input->info()->quantization_info()));
+        _fill.configure(compile_context, output,
+                        PixelValue(0, input->info()->data_type(), input->info()->quantization_info()));
     }
-    _space_to_batch_kernel.configure(compile_context, input, block_shape_x, block_shape_y, padding_left, padding_right, output);
+    _space_to_batch_kernel->configure(compile_context, input, block_shape_x, block_shape_y, padding_left, padding_right,
+                                      output);
 }
 
-Status CLSpaceToBatchLayer::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *paddings, const ITensorInfo *output)
+Status CLSpaceToBatchLayer::validate(const ITensorInfo *input,
+                                     const ITensorInfo *block_shape,
+                                     const ITensorInfo *paddings,
+                                     const ITensorInfo *output)
 {
-    ARM_COMPUTE_RETURN_ON_ERROR(CLMemsetKernel::validate(output, PixelValue(0, input->data_type(), input->quantization_info())));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        CLFill::validate(output, PixelValue(0, input->data_type(), input->quantization_info())));
     ARM_COMPUTE_RETURN_ON_ERROR(CLSpaceToBatchLayerKernel::validate(input, block_shape, paddings, output));
 
     return Status{};
 }
 
-Status CLSpaceToBatchLayer::validate(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right,
+Status CLSpaceToBatchLayer::validate(const ITensorInfo *input,
+                                     const int block_shape_x,
+                                     const int block_shape_y,
+                                     const Size2D &padding_left,
+                                     const Size2D &padding_right,
                                      const ITensorInfo *output)
 {
-    ARM_COMPUTE_RETURN_ON_ERROR(CLMemsetKernel::validate(output, PixelValue(0, input->data_type(), input->quantization_info())));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        CLFill::validate(output, PixelValue(0, input->data_type(), input->quantization_info())));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        CLSpaceToBatchLayerKernel::validate(input, block_shape_x, block_shape_y, padding_left, padding_right, output));
 
     return Status{};
 }
@@ -92,10 +130,10 @@ Status CLSpaceToBatchLayer::validate(const ITensorInfo *input, const int block_s
 void CLSpaceToBatchLayer::run()
 {
     // Zero out output only if we have paddings
-    if(_has_padding)
+    if (_has_padding)
     {
-        CLScheduler::get().enqueue(_memset_kernel, true);
+        _fill.run();
     }
-    CLScheduler::get().enqueue(_space_to_batch_kernel, true);
+    CLScheduler::get().enqueue(*_space_to_batch_kernel, true);
 }
 } // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLSpaceToDepthLayer.cpp b/src/runtime/CL/functions/CLSpaceToDepthLayer.cpp
index a4ffefc189..22695c9ef3 100644
--- a/src/runtime/CL/functions/CLSpaceToDepthLayer.cpp
+++ b/src/runtime/CL/functions/CLSpaceToDepthLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
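In the SpaceToBatch changes above, CLMemsetKernel gives way to the CLFill function, which run() invokes only when implicit padding leaves output elements the kernel never writes. CLFill also works standalone; a small sketch (the zero value and quantized type are illustrative):

CLTensor out;
out.allocator()->init(TensorInfo(TensorShape(32U, 32U), 1, DataType::QASYMM8));
out.allocator()->allocate();

CLFill fill;
fill.configure(&out, PixelValue(0, out.info()->data_type(), out.info()->quantization_info()));
fill.run(); // writes the constant into every element of out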
* * SPDX-License-Identifier: MIT * @@ -30,21 +30,29 @@ #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLSpaceToDepthLayerKernel.h" + namespace arm_compute { -CLSpaceToDepthLayer::CLSpaceToDepthLayer() - : _space_to_depth_kernel() +CLSpaceToDepthLayer::CLSpaceToDepthLayer() : _space_to_depth_kernel(std::make_unique<CLSpaceToDepthLayerKernel>()) { } +CLSpaceToDepthLayer::~CLSpaceToDepthLayer() = default; + void CLSpaceToDepthLayer::configure(const ICLTensor *input, ICLTensor *output, int32_t block_shape) { configure(CLKernelLibrary::get().get_compile_context(), input, output, block_shape); } -void CLSpaceToDepthLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t block_shape) +void CLSpaceToDepthLayer::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + int32_t block_shape) { - _space_to_depth_kernel.configure(compile_context, input, output, block_shape); + ARM_COMPUTE_LOG_PARAMS(input, output, block_shape); + _space_to_depth_kernel->configure(compile_context, input, output, block_shape); } Status CLSpaceToDepthLayer::validate(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape) @@ -54,6 +62,6 @@ Status CLSpaceToDepthLayer::validate(const ITensorInfo *input, const ITensorInfo void CLSpaceToDepthLayer::run() { - CLScheduler::get().enqueue(_space_to_depth_kernel, true); + CLScheduler::get().enqueue(*_space_to_depth_kernel, true); } } // namespace arm_compute diff --git a/src/runtime/CL/functions/CLSplit.cpp b/src/runtime/CL/functions/CLSplit.cpp index cdc44d8373..6be43cc5cd 100644 --- a/src/runtime/CL/functions/CLSplit.cpp +++ b/src/runtime/CL/functions/CLSplit.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 ARM Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -31,13 +31,15 @@ #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/core/helpers/AutoConfiguration.h" + namespace arm_compute { void CLSplit::run() { cl::CommandQueue q = CLScheduler::get().queue(); - for(unsigned i = 0; i < _num_outputs; ++i) + for (unsigned i = 0; i < _num_outputs; ++i) { _slice_functions[i].run(); } diff --git a/src/runtime/CL/functions/CLStackLayer.cpp b/src/runtime/CL/functions/CLStackLayer.cpp index 79c3fe5371..c15496fc31 100644 --- a/src/runtime/CL/functions/CLStackLayer.cpp +++ b/src/runtime/CL/functions/CLStackLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 ARM Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,6 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
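CLSpaceToDepthLayer receives the same mechanical update just above: the kernel moves behind a std::unique_ptr (it is now an incomplete type from the public header's point of view), the class gains an explicit defaulted destructor, and configure() logs its parameters. Sketch of a block-2 rearrangement, where width and height shrink by the block size and channels grow by its square (shapes illustrative):

CLTensor src, dst;
src.allocator()->init(TensorInfo(TensorShape(8U, 8U, 4U), 1, DataType::F32));
dst.allocator()->init(TensorInfo(TensorShape(4U, 4U, 16U), 1, DataType::F32));

CLSpaceToDepthLayer s2d;
s2d.configure(&src, &dst, /* block_shape */ 2);

src.allocator()->allocate();
dst.allocator()->allocate();
s2d.run();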
*/ -#include <complex> - #include "arm_compute/runtime/CL/functions/CLStackLayer.h" #include "arm_compute/core/CL/ICLTensor.h" @@ -33,31 +31,41 @@ #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLStackLayerKernel.h" + +#include <complex> + namespace arm_compute { CLStackLayer::CLStackLayer() // NOLINT - : _input(), - _stack_kernels(), - _num_inputs(0) + : _input(), _stack_kernels(), _num_inputs(0) { } +CLStackLayer::~CLStackLayer() = default; + void CLStackLayer::configure(const std::vector<ICLTensor *> &input, int axis, ICLTensor *output) { configure(CLKernelLibrary::get().get_compile_context(), input, axis, output); } -void CLStackLayer::configure(const CLCompileContext &compile_context, const std::vector<ICLTensor *> &input, int axis, ICLTensor *output) +void CLStackLayer::configure(const CLCompileContext &compile_context, + const std::vector<ICLTensor *> &input, + int axis, + ICLTensor *output) { + ARM_COMPUTE_LOG_PARAMS(input, axis, output); _num_inputs = input.size(); - _stack_kernels.resize(_num_inputs); + _stack_kernels.reserve(_num_inputs); // Wrap around negative values const unsigned int axis_u = wrap_around(axis, static_cast<int>(input[0]->info()->num_dimensions() + 1)); - for(unsigned int i = 0; i < _num_inputs; i++) + for (unsigned int i = 0; i < _num_inputs; i++) { - _stack_kernels[i].configure(compile_context, input[i], axis_u, i, _num_inputs, output); + _stack_kernels.emplace_back(std::make_unique<CLStackLayerKernel>()); + _stack_kernels.back()->configure(compile_context, input[i], axis_u, i, _num_inputs, output); } } @@ -72,7 +80,7 @@ Status CLStackLayer::validate(const std::vector<ITensorInfo *> &input, int axis, const unsigned int num_inputs = input.size(); - for(unsigned int i = 0; i < num_inputs; i++) + for (unsigned int i = 0; i < num_inputs; i++) { // All the tensors must have the same rank ARM_COMPUTE_RETURN_ERROR_ON(input[i]->num_dimensions() != rank); @@ -85,9 +93,9 @@ Status CLStackLayer::validate(const std::vector<ITensorInfo *> &input, int axis, void CLStackLayer::run() { - for(unsigned i = 0; i < _num_inputs; i++) + for (unsigned i = 0; i < _num_inputs; i++) { - CLScheduler::get().enqueue(_stack_kernels[i], false); + CLScheduler::get().enqueue(*_stack_kernels[i], false); } } } // namespace arm_compute diff --git a/src/runtime/CL/functions/CLStridedSlice.cpp b/src/runtime/CL/functions/CLStridedSlice.cpp index 454759664c..c1953cc415 100644 --- a/src/runtime/CL/functions/CLStridedSlice.cpp +++ b/src/runtime/CL/functions/CLStridedSlice.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 ARM Limited. + * Copyright (c) 2018-2021 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -23,32 +23,113 @@ */ #include "arm_compute/runtime/CL/functions/CLStridedSlice.h" -#include "arm_compute/core/CL/kernels/CLStridedSliceKernel.h" +#include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Types.h" -#include "support/MemorySupport.h" + +#include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLStridedSliceKernel.h" namespace arm_compute { -void CLStridedSlice::configure(const ICLTensor *input, ICLTensor *output, - const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, - int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask) +namespace experimental { - configure(CLKernelLibrary::get().get_compile_context(), input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask); -} - -void CLStridedSlice::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, - const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, - int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask) +void CLStridedSlice::configure(const CLCompileContext &compile_context, + const ITensorInfo *input, + ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + int32_t begin_mask, + int32_t end_mask, + int32_t shrink_axis_mask) { - auto k = arm_compute::support::cpp14::make_unique<CLStridedSliceKernel>(); + ARM_COMPUTE_LOG_PARAMS(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask); + auto k = std::make_unique<CLStridedSliceKernel>(); k->configure(compile_context, input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask); _kernel = std::move(k); } -Status CLStridedSlice::validate(const ITensorInfo *input, const ITensorInfo *output, - const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, - int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask) +Status CLStridedSlice::validate(const ITensorInfo *input, + const ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + int32_t begin_mask, + int32_t end_mask, + int32_t shrink_axis_mask) { return CLStridedSliceKernel::validate(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask); } +} // namespace experimental + +struct CLStridedSlice::Impl +{ + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + CLRuntimeContext *ctx{nullptr}; + std::unique_ptr<experimental::CLStridedSlice> op{nullptr}; +}; + +CLStridedSlice::CLStridedSlice(CLRuntimeContext *ctx) : _impl(std::make_unique<Impl>()) +{ + _impl->ctx = ctx; +} + +CLStridedSlice::CLStridedSlice(CLStridedSlice &&) = default; +CLStridedSlice &CLStridedSlice::operator=(CLStridedSlice &&) = default; +CLStridedSlice::~CLStridedSlice() = default; + +void CLStridedSlice::configure(const ICLTensor *input, + ICLTensor *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + int32_t begin_mask, + int32_t end_mask, + int32_t shrink_axis_mask) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, output, starts, ends, strides, begin_mask, end_mask, + shrink_axis_mask); +} + +void CLStridedSlice::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + int32_t begin_mask, + int32_t end_mask, + int32_t shrink_axis_mask) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input); + + _impl->src = input; + 
_impl->dst = output; + + _impl->op = std::make_unique<experimental::CLStridedSlice>(); + _impl->op->configure(compile_context, _impl->src->info(), _impl->dst->info(), starts, ends, strides, begin_mask, + end_mask, shrink_axis_mask); +} + +Status CLStridedSlice::validate(const ITensorInfo *input, + const ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + int32_t begin_mask, + int32_t end_mask, + int32_t shrink_axis_mask) +{ + return experimental::CLStridedSlice::validate(input, output, starts, ends, strides, begin_mask, end_mask, + shrink_axis_mask); +} + +void CLStridedSlice::run() +{ + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC, _impl->src); + pack.add_tensor(TensorType::ACL_DST, _impl->dst); + _impl->op->run(pack); +} } // namespace arm_compute diff --git a/src/runtime/CL/functions/CLTableLookup.cpp b/src/runtime/CL/functions/CLTableLookup.cpp deleted file mode 100644 index 47e15d3c12..0000000000 --- a/src/runtime/CL/functions/CLTableLookup.cpp +++ /dev/null @@ -1,43 +0,0 @@ -/* - * Copyright (c) 2016-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/CL/functions/CLTableLookup.h" - -#include "arm_compute/core/CL/kernels/CLTableLookupKernel.h" -#include "support/MemorySupport.h" - -#include <utility> - -using namespace arm_compute; - -void CLTableLookup::configure(const ICLTensor *input, const ICLLut *lut, ICLTensor *output) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, lut, output); -} - -void CLTableLookup::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLLut *lut, ICLTensor *output) -{ - auto k = arm_compute::support::cpp14::make_unique<CLTableLookupKernel>(); - k->configure(compile_context, input, lut, output); - _kernel = std::move(k); -} diff --git a/src/runtime/CL/functions/CLThreshold.cpp b/src/runtime/CL/functions/CLThreshold.cpp deleted file mode 100644 index 57c92724fa..0000000000 --- a/src/runtime/CL/functions/CLThreshold.cpp +++ /dev/null @@ -1,44 +0,0 @@ -/* - * Copyright (c) 2016-2020 ARM Limited. 
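After the CLStridedSlice rework above, the function keeps only tensor pointers in its Impl struct and defers all work to an experimental operator that receives the tensors through an ITensorPack at run time. Below is a minimal caller-side sketch; the tensor shape and slice coordinates are illustrative assumptions, not values taken from this patch.

    // Hedged sketch: slice every second column of an invented 8x8 F32 tensor.
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLStridedSlice.h"

    using namespace arm_compute;

    void strided_slice_example()
    {
        CLScheduler::get().default_init(); // create context/queue

        CLTensor src, dst;
        src.allocator()->init(TensorInfo(TensorShape(8U, 8U), 1, DataType::F32));

        // starts=(0,0), ends=(8,8), strides=(2,1); all masks zero.
        CLStridedSlice slice;
        slice.configure(&src, &dst, Coordinates(0, 0), Coordinates(8, 8), BiStrides(2, 1), 0, 0, 0);

        src.allocator()->allocate();
        dst.allocator()->allocate();
        slice.run(); // packs ACL_SRC/ACL_DST and runs the underlying operator
    }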
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/CL/functions/CLThreshold.h" - -#include "arm_compute/core/CL/kernels/CLThresholdKernel.h" -#include "support/MemorySupport.h" - -#include <utility> - -using namespace arm_compute; - -void CLThreshold::configure(const ICLTensor *input, ICLTensor *output, uint8_t threshold, uint8_t false_value, uint8_t true_value, ThresholdType type, uint8_t upper) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output, threshold, false_value, true_value, type, upper); -} - -void CLThreshold::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, uint8_t threshold, uint8_t false_value, uint8_t true_value, ThresholdType type, - uint8_t upper) -{ - auto k = arm_compute::support::cpp14::make_unique<CLThresholdKernel>(); - k->configure(compile_context, input, output, threshold, false_value, true_value, type, upper); - _kernel = std::move(k); -} diff --git a/src/runtime/CL/functions/CLTile.cpp b/src/runtime/CL/functions/CLTile.cpp index 178d7af95e..4f86c4adfa 100644 --- a/src/runtime/CL/functions/CLTile.cpp +++ b/src/runtime/CL/functions/CLTile.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 ARM Limited. + * Copyright (c) 2018-2021 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -23,8 +23,8 @@ */ #include "arm_compute/runtime/CL/functions/CLTile.h" -#include "arm_compute/core/CL/kernels/CLTileKernel.h" -#include "support/MemorySupport.h" +#include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLTileKernel.h" namespace arm_compute { @@ -33,9 +33,13 @@ void CLTile::configure(const ICLTensor *input, ICLTensor *output, const Multiple configure(CLKernelLibrary::get().get_compile_context(), input, output, multiples); } -void CLTile::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const Multiples &multiples) +void CLTile::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + ICLTensor *output, + const Multiples &multiples) { - auto k = arm_compute::support::cpp14::make_unique<CLTileKernel>(); + ARM_COMPUTE_LOG_PARAMS(input, output, multiples); + auto k = std::make_unique<CLTileKernel>(); k->configure(compile_context, input, output, multiples); _kernel = std::move(k); } diff --git a/src/runtime/CL/functions/CLTranspose.cpp b/src/runtime/CL/functions/CLTranspose.cpp index f5121d06a5..5a738f47ce 100644 --- a/src/runtime/CL/functions/CLTranspose.cpp +++ b/src/runtime/CL/functions/CLTranspose.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 ARM Limited. + * Copyright (c) 2017-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,12 +23,26 @@ */ #include "arm_compute/runtime/CL/functions/CLTranspose.h" -#include "arm_compute/core/CL/kernels/CLTransposeKernel.h" -#include "support/MemorySupport.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" -#include <utility> +#include "src/core/CL/ICLKernel.h" +#include "src/gpu/cl/operators/ClTranspose.h" -using namespace arm_compute; +namespace arm_compute +{ +struct CLTranspose::Impl +{ + const ICLTensor *src{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<opencl::ClTranspose> op{nullptr}; +}; +CLTranspose::CLTranspose() : _impl(std::make_unique<Impl>()) +{ +} +CLTranspose::~CLTranspose() = default; void CLTranspose::configure(const ICLTensor *input, ICLTensor *output) { @@ -37,12 +51,23 @@ void CLTranspose::configure(const ICLTensor *input, ICLTensor *output) void CLTranspose::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output) { - auto k = arm_compute::support::cpp14::make_unique<CLTransposeKernel>(); - k->configure(compile_context, input, output); - _kernel = std::move(k); + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + _impl->src = input; + _impl->dst = output; + _impl->op = std::make_unique<opencl::ClTranspose>(); + _impl->op->configure(compile_context, _impl->src->info(), _impl->dst->info()); } Status CLTranspose::validate(const ITensorInfo *input, const ITensorInfo *output) { - return CLTransposeKernel::validate(input, output); + return opencl::ClTranspose::validate(input, output); +} + +void CLTranspose::run() +{ + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC, _impl->src); + pack.add_tensor(TensorType::ACL_DST, _impl->dst); + _impl->op->run(pack); } +} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLUnstack.cpp b/src/runtime/CL/functions/CLUnstack.cpp index 032fb993d0..ddd83e7824 100644 --- a/src/runtime/CL/functions/CLUnstack.cpp +++ b/src/runtime/CL/functions/CLUnstack.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 ARM Limited. + * Copyright (c) 2018-2021 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -29,6 +29,8 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/common/utils/Log.h" + namespace arm_compute { namespace @@ -38,13 +40,15 @@ inline unsigned int wrap_axis(int axis, const ITensorInfo *const tensor) return wrap_around(axis, static_cast<int>(tensor->num_dimensions())); } -inline void setup_slice_coordinates_and_mask(Coordinates &slice_start, int32_t &slice_end_mask, const unsigned int input_num_dimensions) +inline void setup_slice_coordinates_and_mask(Coordinates &slice_start, + int32_t &slice_end_mask, + const unsigned int input_num_dimensions) { // Setups up coordinates to slice the input tensor: start coordinates to all 0s and the unstacking axis of both Start/End to slice just one 2d tensor at a time. Coordinates slice_end; slice_start.set_num_dimensions(input_num_dimensions); slice_end.set_num_dimensions(input_num_dimensions); - for(size_t k = 0; k < input_num_dimensions; ++k) + for (size_t k = 0; k < input_num_dimensions; ++k) { slice_start.set(k, 0); slice_end.set(k, -1); @@ -54,8 +58,7 @@ inline void setup_slice_coordinates_and_mask(Coordinates &slice_start, int32_t & } // namespace CLUnstack::CLUnstack() // NOLINT - : _num_slices(0), - _strided_slice_vector() + : _num_slices(0), _strided_slice_vector() { } @@ -64,14 +67,19 @@ void CLUnstack::configure(const ICLTensor *input, const std::vector<ICLTensor *> configure(CLKernelLibrary::get().get_compile_context(), input, output_vector, axis); } -void CLUnstack::configure(const CLCompileContext &compile_context, const ICLTensor *input, const std::vector<ICLTensor *> &output_vector, int axis) +void CLUnstack::configure(const CLCompileContext &compile_context, + const ICLTensor *input, + const std::vector<ICLTensor *> &output_vector, + int axis) { + ARM_COMPUTE_LOG_PARAMS(input, output_vector, axis); std::vector<ITensorInfo *> outputs_vector_info(output_vector.size()); - std::transform(output_vector.begin(), output_vector.end(), outputs_vector_info.begin(), [](ICLTensor * t) - { - ARM_COMPUTE_ERROR_ON_NULLPTR(t); - return t->info(); - }); + std::transform(output_vector.begin(), output_vector.end(), outputs_vector_info.begin(), + [](ICLTensor *t) + { + ARM_COMPUTE_ERROR_ON_NULLPTR(t); + return t->info(); + }); ARM_COMPUTE_ERROR_ON_NULLPTR(input); ARM_COMPUTE_ERROR_THROW_ON(CLUnstack::validate(input->info(), outputs_vector_info, axis)); @@ -84,11 +92,12 @@ void CLUnstack::configure(const CLCompileContext &compile_context, const ICLTens Coordinates slice_start; int32_t slice_end_mask; setup_slice_coordinates_and_mask(slice_start, slice_end_mask, input->info()->tensor_shape().num_dimensions()); - for(unsigned int slice = 0; slice < _num_slices; ++slice) + for (unsigned int slice = 0; slice < _num_slices; ++slice) { // Adjusts start and end coordinates to take a 2D slice at a time slice_start.set(axis_u, slice); - _strided_slice_vector[slice].configure(compile_context, input, output_vector[slice], slice_start, Coordinates(), BiStrides(), 0, slice_end_mask, (1 << axis_u)); + _strided_slice_vector[slice].configure(compile_context, input, output_vector[slice], slice_start, Coordinates(), + BiStrides(), 0, slice_end_mask, (1 << axis_u)); } } @@ -103,18 +112,20 @@ Status CLUnstack::validate(const ITensorInfo *input, const std::vector<ITensorIn ARM_COMPUTE_RETURN_ERROR_ON(num_slices > output_vector.size()); Coordinates slice_start; int32_t slice_end_mask; - for(size_t k = 0; k < num_slices; ++k) + for (size_t k = 0; k < num_slices; ++k) { 
slice_start.set(wrap_axis(axis, input), k); setup_slice_coordinates_and_mask(slice_start, slice_end_mask, input->tensor_shape().num_dimensions()); - ARM_COMPUTE_RETURN_ON_ERROR(CLStridedSlice::validate(input, output_vector[k], slice_start, Coordinates(), BiStrides(), 0, slice_end_mask, (1 << wrap_axis(axis, input)))); + ARM_COMPUTE_RETURN_ON_ERROR(CLStridedSlice::validate(input, output_vector[k], slice_start, Coordinates(), + BiStrides(), 0, slice_end_mask, + (1 << wrap_axis(axis, input)))); } return Status{}; } void CLUnstack::run() { - for(unsigned i = 0; i < _num_slices; ++i) + for (unsigned i = 0; i < _num_slices; ++i) { _strided_slice_vector[i].run(); } diff --git a/src/runtime/CL/functions/CLUpsampleLayer.cpp b/src/runtime/CL/functions/CLUpsampleLayer.cpp deleted file mode 100644 index dd04686d60..0000000000 --- a/src/runtime/CL/functions/CLUpsampleLayer.cpp +++ /dev/null @@ -1,63 +0,0 @@ -/* - * Copyright (c) 2018-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "arm_compute/runtime/CL/functions/CLUpsampleLayer.h" - -#include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/runtime/CL/CLScheduler.h" - -namespace arm_compute -{ -CLUpsampleLayer::CLUpsampleLayer() // NOLINT - : _upsample(), - _output(nullptr) -{ -} - -Status CLUpsampleLayer::validate(const ITensorInfo *input, const ITensorInfo *output, - const Size2D &info, const InterpolationPolicy upsampling_policy) -{ - return CLUpsampleLayerKernel::validate(input, output, info, upsampling_policy); -} - -void CLUpsampleLayer::configure(ICLTensor *input, ICLTensor *output, - const Size2D &info, const InterpolationPolicy upsampling_policy) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output, info, upsampling_policy); -} - -void CLUpsampleLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, - const Size2D &info, const InterpolationPolicy upsampling_policy) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - - _output = output; - _upsample.configure(compile_context, input, _output, info, upsampling_policy); -} - -void CLUpsampleLayer::run() -{ - CLScheduler::get().enqueue(_upsample, false); -} -} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLWarpAffine.cpp b/src/runtime/CL/functions/CLWarpAffine.cpp deleted file mode 100644 index ce2171b3d4..0000000000 --- a/src/runtime/CL/functions/CLWarpAffine.cpp +++ /dev/null @@ -1,46 +0,0 @@ -/* - * Copyright (c) 2016-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "arm_compute/runtime/CL/functions/CLWarpAffine.h" - -#include "arm_compute/core/CL/kernels/CLWarpAffineKernel.h" -#include "arm_compute/core/PixelValue.h" -#include "support/MemorySupport.h" - -#include <utility> - -using namespace arm_compute; - -void CLWarpAffine::configure(ICLTensor *input, ICLTensor *output, const std::array<float, 9> &matrix, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output, matrix, policy, border_mode, constant_border_value); -} - -void CLWarpAffine::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const std::array<float, 9> &matrix, InterpolationPolicy policy, BorderMode border_mode, - uint8_t constant_border_value) -{ - auto k = arm_compute::support::cpp14::make_unique<CLWarpAffineKernel>(); - k->configure(compile_context, input, output, matrix, policy); - _kernel = std::move(k); - _border_handler.configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); -} diff --git a/src/runtime/CL/functions/CLWarpPerspective.cpp b/src/runtime/CL/functions/CLWarpPerspective.cpp deleted file mode 100644 index 06c06616d0..0000000000 --- a/src/runtime/CL/functions/CLWarpPerspective.cpp +++ /dev/null @@ -1,46 +0,0 @@ -/* - * Copyright (c) 2016-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "arm_compute/runtime/CL/functions/CLWarpPerspective.h" - -#include "arm_compute/core/CL/kernels/CLWarpPerspectiveKernel.h" -#include "arm_compute/core/PixelValue.h" -#include "support/MemorySupport.h" - -#include <utility> - -using namespace arm_compute; - -void CLWarpPerspective::configure(ICLTensor *input, ICLTensor *output, const std::array<float, 9> &matrix, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output, matrix, policy, border_mode, constant_border_value); -} - -void CLWarpPerspective::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const std::array<float, 9> &matrix, InterpolationPolicy policy, BorderMode border_mode, - uint8_t constant_border_value) -{ - auto k = arm_compute::support::cpp14::make_unique<CLWarpPerspectiveKernel>(); - k->configure(compile_context, input, output, matrix, policy); - _kernel = std::move(k); - _border_handler.configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); -} diff --git a/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp index 132c3ee926..645f817030 100644 --- a/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 ARM Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,221 +23,105 @@ */ #include "arm_compute/runtime/CL/functions/CLWinogradConvolutionLayer.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "arm_compute/runtime/CL/CLScheduler.h" +#include "arm_compute/core/KernelDescriptors.h" -using namespace arm_compute; +#include "src/core/CL/ICLKernel.h" +#include "src/core/helpers/MemoryHelpers.h" +#include "src/gpu/cl/operators/ClWinogradConv2d.h" +#include "support/Cast.h" -namespace +namespace arm_compute { -Size2D winograd_output_tile(const Size2D &input_dims, const Size2D &kernel_dims, DataLayout data_layout) +struct CLWinogradConvolutionLayer::Impl { - Size2D output_tile = Size2D{}; - - const unsigned int kernel_max_dim = std::max(kernel_dims.width, kernel_dims.height); - - // Check if the input spatial dimensions are smaller than 4 - const bool is_input_lt4_nchw = (input_dims.width <= 4 && input_dims.height <= 4) && (data_layout == DataLayout::NCHW); - - if(kernel_max_dim == 3U) - { - if(kernel_dims == Size2D(3U, 3U)) - { - output_tile = is_input_lt4_nchw ? Size2D(2U, 2U) : Size2D(4U, 4U); - } - else if(kernel_dims == Size2D(3U, 1U)) - { - output_tile = is_input_lt4_nchw ? Size2D(2U, 1U) : Size2D(4U, 1U); - } - else - { - output_tile = is_input_lt4_nchw ? Size2D(1U, 2U) : Size2D(1U, 4U); - } - } - else if(kernel_max_dim == 5U) - { - output_tile = Size2D(kernel_dims.width == 1 ? 1U : 4U, - kernel_dims.height == 1 ? 1U : 4U); - } - else if(kernel_max_dim == 7U) - { - output_tile = Size2D(kernel_dims.width == 1 ? 1U : 2U, - kernel_dims.height == 1 ? 
1U : 2U); - } - - return output_tile; -} - -bool check_support_fast_math(const Size2D &output_tile, const Size2D &kernel_size) -{ - // Check if we want to configure a Winograd configuration which requires fast math - using WinogradConfiguration = std::pair<std::pair<int, int>, std::pair<int, int>>; - - std::vector<WinogradConfiguration> fast_math_winograd = - { - WinogradConfiguration(std::pair<int, int>(4, 4), std::pair<int, int>(5, 5)), - WinogradConfiguration(std::pair<int, int>(2, 2), std::pair<int, int>(7, 7)) - }; - - auto p = std::make_pair(std::pair<int, int>(output_tile.width, output_tile.height), - std::pair<int, int>(kernel_size.width, kernel_size.height)); - - return std::find(fast_math_winograd.begin(), fast_math_winograd.end(), p) != fast_math_winograd.end(); -} -} // namespace + const ICLTensor *src{nullptr}; + const ICLTensor *weights{nullptr}; + const ICLTensor *biases{nullptr}; + ICLTensor *dst{nullptr}; + std::unique_ptr<opencl::ClWinogradConv2d> op{nullptr}; + ITensorPack run_pack{}; + MemoryGroup memory_group{}; + WorkspaceData<CLTensor> workspace_tensors{}; + bool is_prepared{false}; +}; CLWinogradConvolutionLayer::CLWinogradConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(memory_manager), _batched_mm(memory_manager), _input_transform(), _filter_transform(), _output_transform(), _input0(), _input1(), _batched_mm_output(), _original_weights(nullptr), - _is_prepared(false) + : _impl(std::make_unique<Impl>()) { + _impl->memory_group = MemoryGroup(memory_manager); } -void CLWinogradConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, - bool enable_fast_math) +CLWinogradConvolutionLayer::~CLWinogradConvolutionLayer() = default; + +void CLWinogradConvolutionLayer::configure(ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *output, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info, + bool enable_fast_math) { - configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, act_info, enable_fast_math); + configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, act_info, + enable_fast_math); } -void CLWinogradConvolutionLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, - const PadStrideInfo &conv_info, - const ActivationLayerInfo &act_info, bool enable_fast_math) +void CLWinogradConvolutionLayer::configure(const CLCompileContext &compile_context, + ICLTensor *input, + const ICLTensor *weights, + const ICLTensor *biases, + ICLTensor *output, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info, + bool enable_fast_math) { - // Get indices for the width and height - const size_t idx_width = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH); - const size_t idx_height = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT); - - // Input shape, kernel size and output tile - const Size2D input_dims = Size2D(input->info()->tensor_shape()[idx_width], input->info()->tensor_shape()[idx_height]); - const Size2D kernel_size = Size2D(weights->info()->tensor_shape()[idx_width], weights->info()->tensor_shape()[idx_height]); - const Size2D output_tile = winograd_output_tile(input_dims, kernel_size, 
input->info()->data_layout()); - - // Check if the Winograd configuration requires fast math - if(!enable_fast_math) - { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32); //disable winograd for fp16 if fast math is false. - ARM_COMPUTE_ERROR_ON_MSG(check_support_fast_math(output_tile, kernel_size), "This Winograd configuration requires enable_fast_math=true"); - } - const WinogradInfo winograd_info = WinogradInfo(output_tile, - kernel_size, - input_dims, - conv_info, - input->info()->data_layout()); - - _is_prepared = false; - _original_weights = weights; - - // Manage intermediate tensors - _memory_group.manage(&_input0); - _memory_group.manage(&_batched_mm_output); - - // Do not manage _input1 as it contains the weights - - // Configure input transform - _input_transform.configure(compile_context, input, &_input0, winograd_info); - - // Configure filter transform - _filter_transform.configure(compile_context, weights, &_input1, winograd_info); - - // Configure batched matrix multiply - _batched_mm.configure(compile_context, &_input0, &_input1, nullptr, &_batched_mm_output, 1.0f, 0.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run*/, 0, false, false, - GEMMLowpOutputStageInfo(), - (input->info()->data_type() == DataType::F16))); - - // Configure output transform - _output_transform.configure(compile_context, &_batched_mm_output, biases, output, winograd_info, act_info); - - // Allocate temporary tensors - _input0.allocator()->allocate(); - _batched_mm_output.allocator()->allocate(); + _impl->src = input; + _impl->weights = weights; + _impl->biases = biases; + _impl->dst = output; + + _impl->op = std::make_unique<opencl::ClWinogradConv2d>(); + _impl->op->configure(compile_context, input->info(), weights->info(), + (biases != nullptr ? 
biases->info() : nullptr), output->info(), conv_info, act_info, + enable_fast_math); + + _impl->run_pack = {{TensorType::ACL_SRC_0, _impl->src}, + {TensorType::ACL_SRC_1, _impl->weights}, + {TensorType::ACL_SRC_2, _impl->biases}, + {TensorType::ACL_DST, _impl->dst}}; + _impl->workspace_tensors = + manage_workspace<CLTensor>(_impl->op->workspace(), _impl->memory_group, _impl->run_pack, _impl->run_pack); } -Status CLWinogradConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - const ActivationLayerInfo &act_info, bool enable_fast_math) +Status CLWinogradConvolutionLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info, + bool enable_fast_math) { - // Get indeces for the width and height - const size_t idx_width = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH); - const size_t idx_height = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT); - - // Input shape, kernel size and output tile - const Size2D input_dims = Size2D(input->tensor_shape()[idx_width], input->tensor_shape()[idx_height]); - const Size2D kernel_size = Size2D(weights->tensor_shape()[idx_width], weights->tensor_shape()[idx_height]); - const Size2D output_tile = winograd_output_tile(input_dims, kernel_size, input->data_layout()); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((conv_info.pad_left() > (kernel_size.x() / 2u)) || (conv_info.pad_right() > (kernel_size.x() / 2u))), "Winograd only supports padding up to half kernel size"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((conv_info.pad_top() > (kernel_size.y() / 2u)) || (conv_info.pad_bottom() > (kernel_size.y() / 2u))), "Winograd only supports padding up to half kernel size"); - - // Check if the Winograd configuration requires fast math - if(!enable_fast_math) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32); //disable winograd for fp16 if fast math is false. 
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(check_support_fast_math(output_tile, kernel_size), "This Winograd configuration requires enable_fast_math=true"); - } - - const WinogradInfo winograd_info = WinogradInfo(output_tile, - kernel_size, - input_dims, - conv_info, - input->data_layout()); - - // Validate input transform - const TensorShape input0_shape = misc::shape_calculator::compute_winograd_input_transform_shape(*input, winograd_info); - const TensorInfo input0 = input->clone()->set_tensor_shape(input0_shape); - ARM_COMPUTE_RETURN_ON_ERROR(CLWinogradInputTransform::validate(input, &input0, winograd_info)); - - // Validate filter transform - const TensorShape input1_shape = misc::shape_calculator::compute_winograd_filter_transform_shape(*weights, winograd_info); - const TensorInfo input1 = weights->clone()->set_tensor_shape(input1_shape); - ARM_COMPUTE_RETURN_ON_ERROR(CLWinogradFilterTransformKernel::validate(weights, &input1, winograd_info)); - - // Validate batched matrix multiply - TensorShape batched_mm_output_shape = input0.tensor_shape(); - batched_mm_output_shape[0] = input1.tensor_shape()[0]; - const TensorInfo batched_mm_output = input0.clone()->set_tensor_shape(batched_mm_output_shape); - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(&input0, &input1, nullptr, &batched_mm_output, 1.0f, 0.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run*/, 0, false, false, - GEMMLowpOutputStageInfo(), (input->data_type() == DataType::F16)))); - - // Configure output transform - ARM_COMPUTE_RETURN_ON_ERROR(CLWinogradOutputTransformKernel::validate(&batched_mm_output, biases, output, winograd_info, act_info)); - - return Status{}; + return opencl::ClWinogradConv2d::validate(input, weights, biases, output, conv_info, act_info, enable_fast_math); } void CLWinogradConvolutionLayer::run() { + MemoryGroupResourceScope scope_mg(_impl->memory_group); prepare(); - - MemoryGroupResourceScope scope_mg(_memory_group); - - // Run input transform - _input_transform.run(); - - // Run batched matrix multiplication - _batched_mm.run(); - - // Run output transform - CLScheduler::get().enqueue(_output_transform); + _impl->op->run(_impl->run_pack); } void CLWinogradConvolutionLayer::prepare() { - if(!_is_prepared) + if (!_impl->is_prepared) { - // Run filter transform and mark original weights as unused - _input1.allocator()->allocate(); - CLScheduler::get().enqueue(_filter_transform, false); - _original_weights->mark_as_unused(); - - // Prepare GEMM and release reshaped weights if marked unused by CLGEMM - _batched_mm.prepare(); - if(!_input1.is_used()) - { - _input1.allocator()->free(); - } + _impl->op->prepare(_impl->run_pack); - CLScheduler::get().queue().finish(); - _is_prepared = true; + // Release Preparation tensors + release_prepare_tensors(_impl->workspace_tensors, _impl->run_pack); + _impl->run_pack.remove_tensor(TensorType::ACL_SRC_1); + _impl->is_prepared = true; } } +} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLWinogradInputTransform.cpp b/src/runtime/CL/functions/CLWinogradInputTransform.cpp deleted file mode 100644 index ae400768fe..0000000000 --- a/src/runtime/CL/functions/CLWinogradInputTransform.cpp +++ /dev/null @@ -1,50 +0,0 @@ -/* - * Copyright (c) 2018-2020 ARM Limited. 
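The CLWinogradConvolutionLayer rework above collapses the separate input/filter/output transform kernels and the batched GEMM into a single opencl::ClWinogradConv2d operator fed through a run_pack, with workspace tensors managed by the function. prepare() performs the one-off weight transform and then releases the preparation-only tensors; run() (which also calls prepare() on first use) reuses the transformed weights. A rough caller-side sketch, with the convolution geometry invented for illustration:

    // Hedged sketch of the function lifecycle (tensor allocation omitted;
    // a 3x3 / stride 1 / pad 1 convolution is assumed, fast math off).
    #include "arm_compute/core/CL/ICLTensor.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/CL/functions/CLWinogradConvolutionLayer.h"

    using namespace arm_compute;

    void winograd_example(ICLTensor *src, ICLTensor *weights, ICLTensor *biases, ICLTensor *dst)
    {
        CLWinogradConvolutionLayer conv;
        conv.configure(src, weights, biases, dst, PadStrideInfo(1, 1, 1, 1), ActivationLayerInfo(),
                       /*enable_fast_math=*/false);

        conv.prepare(); // one-off weight transform; prep-only workspace released here
        for (int i = 0; i < 4; ++i)
        {
            conv.run(); // input transform -> batched GEMM -> output transform
        }
    }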
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/CL/functions/CLWinogradInputTransform.h" - -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLWinogradInputTransformKernel.h" -#include "arm_compute/core/Error.h" -#include "support/MemorySupport.h" - -using namespace arm_compute; - -void CLWinogradInputTransform::configure(ICLTensor *input, ICLTensor *output, const WinogradInfo &winograd_info) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output, winograd_info); -} - -void CLWinogradInputTransform::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const WinogradInfo &winograd_info) -{ - auto k = arm_compute::support::cpp14::make_unique<CLWinogradInputTransformKernel>(); - k->configure(compile_context, input, output, winograd_info); - _kernel = std::move(k); - _border_handler.configure(compile_context, input, _kernel->border_size(), BorderMode::CONSTANT, PixelValue()); -} - -Status CLWinogradInputTransform::validate(const ITensorInfo *input, const ITensorInfo *output, const WinogradInfo &winograd_info) -{ - ARM_COMPUTE_RETURN_ON_ERROR(CLWinogradInputTransformKernel::validate(input, output, winograd_info)); - return Status{}; -} diff --git a/src/runtime/CL/functions/CLYOLOLayer.cpp b/src/runtime/CL/functions/CLYOLOLayer.cpp deleted file mode 100644 index 0c0c1065bc..0000000000 --- a/src/runtime/CL/functions/CLYOLOLayer.cpp +++ /dev/null @@ -1,47 +0,0 @@ -/* - * Copyright (c) 2018-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/CL/functions/CLYOLOLayer.h" - -#include "arm_compute/core/CL/kernels/CLYOLOLayerKernel.h" -#include "arm_compute/core/Types.h" -#include "support/MemorySupport.h" - -using namespace arm_compute; - -void CLYOLOLayer::configure(ICLTensor *input, ICLTensor *output, const ActivationLayerInfo &act_info, int32_t num_classes) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output, act_info, num_classes); -} - -void CLYOLOLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const ActivationLayerInfo &act_info, int32_t num_classes) -{ - auto k = arm_compute::support::cpp14::make_unique<CLYOLOLayerKernel>(); - k->configure(compile_context, input, output, act_info, num_classes); - _kernel = std::move(k); -} - -Status CLYOLOLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info, int32_t num_classes) -{ - return CLYOLOLayerKernel::validate(input, output, act_info, num_classes); -} diff --git a/src/runtime/CL/gemm/CLGEMMDefaultTypeBifrost.cpp b/src/runtime/CL/gemm/CLGEMMDefaultTypeBifrost.cpp new file mode 100644 index 0000000000..4270165ab4 --- /dev/null +++ b/src/runtime/CL/gemm/CLGEMMDefaultTypeBifrost.cpp @@ -0,0 +1,590 @@ +/* + * Copyright (c) 2020-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+#include "src/runtime/CL/gemm/CLGEMMDefaultTypeBifrost.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+
+#include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h"
+
+#include <map>
+#include <utility>
+
+namespace arm_compute
+{
+namespace cl_gemm
+{
+CLGEMMDefaultTypeBifrost::CLGEMMDefaultTypeBifrost(GPUTarget gpu) : ICLGEMMKernelSelection(gpu)
+{
+}
+
+CLGEMMKernelType CLGEMMDefaultTypeBifrost::select_kernel(const CLGEMMKernelSelectionParams &params)
+{
+    // _target could be used in the future to have a dedicated heuristic for each GPU IP
+    ARM_COMPUTE_UNUSED(_target);
+
+    using FunctionExecutorPtr = CLGEMMKernelType (CLGEMMDefaultTypeBifrost::*)(
+        unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant);
+
+    // Default configurations for Bifrost architectures
+    static std::map<DataType, FunctionExecutorPtr> gemm_default_configs = {
+        {DataType::F32, &CLGEMMDefaultTypeBifrost::default_f32},
+        {DataType::F16, &CLGEMMDefaultTypeBifrost::default_f16},
+        {DataType::QASYMM8, &CLGEMMDefaultTypeBifrost::default_q8},
+        {DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeBifrost::default_q8},
+        {DataType::QSYMM8, &CLGEMMDefaultTypeBifrost::default_q8},
+        {DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeBifrost::default_q8}};
+
+    // Mali-G71 configurations
+    static std::map<DataType, FunctionExecutorPtr> gemm_g71_configs = {
+        {DataType::F32, &CLGEMMDefaultTypeBifrost::default_f32},
+        {DataType::F16, &CLGEMMDefaultTypeBifrost::g71_f16},
+        {DataType::QASYMM8, &CLGEMMDefaultTypeBifrost::default_q8},
+        {DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeBifrost::default_q8},
+        {DataType::QSYMM8, &CLGEMMDefaultTypeBifrost::default_q8},
+        {DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeBifrost::default_q8}};
+
+    // Mali-G52 configurations
+    static std::map<DataType, FunctionExecutorPtr> gemm_g52_configs = {
+        {DataType::F32, &CLGEMMDefaultTypeBifrost::g52_f32},
+        {DataType::F16, &CLGEMMDefaultTypeBifrost::g52_f16},
+        {DataType::QASYMM8, &CLGEMMDefaultTypeBifrost::default_q8},
+        {DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeBifrost::default_q8},
+        {DataType::QSYMM8, &CLGEMMDefaultTypeBifrost::default_q8},
+        {DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeBifrost::default_q8}};
+
+    // Mali-G76 configurations
+    static std::map<DataType, FunctionExecutorPtr> gemm_g76_configs = {
+        {DataType::F32, &CLGEMMDefaultTypeBifrost::g76_f32},
+        {DataType::F16, &CLGEMMDefaultTypeBifrost::g76_f16},
+        {DataType::QASYMM8, &CLGEMMDefaultTypeBifrost::default_q8},
+        {DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeBifrost::default_q8},
+        {DataType::QSYMM8, &CLGEMMDefaultTypeBifrost::default_q8},
+        {DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeBifrost::default_q8}};
+
+    const DataType data_type = params.data_type;
+
+    switch (_target)
+    {
+        case GPUTarget::G71:
+            if (gemm_g71_configs.find(data_type) != gemm_g71_configs.end())
+            {
+                return (this->*gemm_g71_configs[data_type])(params.m, params.n, params.k, params.b,
+                                                            params.is_rhs_constant);
+            }
+            ARM_COMPUTE_ERROR("Not supported data type");
+        case GPUTarget::G76:
+            if (gemm_g76_configs.find(data_type) != gemm_g76_configs.end())
+            {
+                return (this->*gemm_g76_configs[data_type])(params.m, params.n, params.k, params.b,
+                                                            params.is_rhs_constant);
+            }
+            ARM_COMPUTE_ERROR("Not supported data type");
+        case GPUTarget::G52:
+            if (gemm_g52_configs.find(data_type) != gemm_g52_configs.end())
+            {
+                return (this->*gemm_g52_configs[data_type])(params.m, params.n, params.k, params.b,
+                                                            params.is_rhs_constant);
+            }
+            
ARM_COMPUTE_ERROR("Not supported data type"); + default: + if (gemm_default_configs.find(data_type) != gemm_default_configs.end()) + { + return (this->*gemm_default_configs[data_type])(params.m, params.n, params.k, params.b, + params.is_rhs_constant); + } + ARM_COMPUTE_ERROR("Not supported data type"); + } +} + +CLGEMMKernelType CLGEMMDefaultTypeBifrost::default_f32( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +{ + ARM_COMPUTE_UNUSED(b); + + CLGEMMKernelType gemm_type = CLGEMMKernelType::NATIVE; + + if (is_rhs_constant) + { + if ((m > 1) && (n < 16)) + { + gemm_type = CLGEMMKernelType::RESHAPED; + } + else if (m == 1) + { + gemm_type = CLGEMMKernelType::RESHAPED_ONLY_RHS; + } + else + { + if ((k > 256) && (m > 4)) + { + constexpr float alpha = 3.2f; + constexpr float fact0 = 1.51f; + constexpr float fact1 = 1.66f; + constexpr float ops = 12.0f; + const float scale = k > 1024 ? 1.07f : 1.0f; + gemm_type = (alpha + ((n * fact0) / ops) < ((fact1 * n * scale) / ops)) + ? CLGEMMKernelType::RESHAPED + : CLGEMMKernelType::RESHAPED_ONLY_RHS; + } + else + { + gemm_type = CLGEMMKernelType::RESHAPED_ONLY_RHS; + } + } + + const auto workload = static_cast<float>((m * n) / 20.0f); + + gemm_type = ((workload > 1600.0f) && (gemm_type == CLGEMMKernelType::RESHAPED)) ? CLGEMMKernelType::RESHAPED + : gemm_type; + } + + return gemm_type; +} + +CLGEMMKernelType CLGEMMDefaultTypeBifrost::default_f16( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +{ + ARM_COMPUTE_UNUSED(n, k, b); + + if (is_rhs_constant) + { + if (m == 1) + { + return CLGEMMKernelType::RESHAPED_ONLY_RHS; + } + else + { + return CLGEMMKernelType::RESHAPED; + } + } + else + { + return CLGEMMKernelType::NATIVE; + } +} + +CLGEMMKernelType CLGEMMDefaultTypeBifrost::default_q8( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +{ + ARM_COMPUTE_UNUSED(m, n, k, b); + + if (is_rhs_constant) + { + return CLGEMMKernelType::RESHAPED_ONLY_RHS; + } + else + { + return CLGEMMKernelType::NATIVE; + } +} + +CLGEMMKernelType +CLGEMMDefaultTypeBifrost::g76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +{ + ARM_COMPUTE_UNUSED(b); + + if (!is_rhs_constant) + { + return CLGEMMKernelType::NATIVE; + } + if (m == 1) + { + return CLGEMMKernelType::RESHAPED_ONLY_RHS; + } + if (k <= 496) + { + if (n <= 544) + { + return CLGEMMKernelType::RESHAPED_ONLY_RHS; + } + else + { + return CLGEMMKernelType::RESHAPED; + } + } + else + { + if (k <= 588) + { + if (k <= 552) + { + if (m <= 148) + { + return CLGEMMKernelType::RESHAPED_ONLY_RHS; + } + else + { + if (m <= 278) + { + return CLGEMMKernelType::RESHAPED; + } + else + { + return CLGEMMKernelType::RESHAPED_ONLY_RHS; + } + } + } + else + { + return CLGEMMKernelType::RESHAPED_ONLY_RHS; + } + } + else + { + return CLGEMMKernelType::RESHAPED; + } + } +} + +CLGEMMKernelType +CLGEMMDefaultTypeBifrost::g52_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +{ + ARM_COMPUTE_UNUSED(b); + + if (!is_rhs_constant) + { + return CLGEMMKernelType::NATIVE; + } + + if (m == 1) + { + return CLGEMMKernelType::RESHAPED_ONLY_RHS; + } + + const float r_mn = static_cast<float>(m) / static_cast<float>(n); + const float r_mk = static_cast<float>(m) / static_cast<float>(k); + const float r_nk = static_cast<float>(n) / static_cast<float>(k); + const float r_mnk = static_cast<float>(m) / (static_cast<float>(n) * static_cast<float>(k)); + + if (r_mn <= 
1.5469f) + { + if (r_mk <= 0.8766f) + { + if (r_mk <= 0.0211f) + { + if (r_mnk <= 77.5833f) + { + return CLGEMMKernelType::RESHAPED_ONLY_RHS; + } + else + { + return CLGEMMKernelType::RESHAPED; + } + } + else + { + if (r_nk <= 0.0832f) + { + return CLGEMMKernelType::RESHAPED_ONLY_RHS; + } + else + { + return CLGEMMKernelType::RESHAPED; + } + } + } + else + { + if (r_mnk <= 193.0000f) + { + if (r_mn <= 0.9948f) + { + if (r_mk <= 2.5453f) + { + return CLGEMMKernelType::RESHAPED; + } + else + { + return CLGEMMKernelType::RESHAPED_ONLY_RHS; + } + } + else + { + return CLGEMMKernelType::RESHAPED_ONLY_RHS; + } + } + else + { + return CLGEMMKernelType::RESHAPED; + } + } + } + else + { + if (r_mn <= 17.7370f) + { + if (r_mnk <= 1391.2875f) + { + if (r_mk <= 2.9724f) + { + return CLGEMMKernelType::RESHAPED_ONLY_RHS; + } + else + { + if (r_mnk <= 470.0000f) + { + return CLGEMMKernelType::RESHAPED_ONLY_RHS; + } + else + { + return CLGEMMKernelType::RESHAPED; + } + } + } + else + { + if (r_nk <= 0.1381f) + { + if (r_mnk <= 9040.5000f) + { + return CLGEMMKernelType::RESHAPED_ONLY_RHS; + } + else + { + return CLGEMMKernelType::RESHAPED; + } + } + else + { + if (r_mn <= 5.6790f) + { + return CLGEMMKernelType::RESHAPED; + } + else + { + return CLGEMMKernelType::RESHAPED_ONLY_RHS; + } + } + } + } + else + { + return CLGEMMKernelType::RESHAPED_ONLY_RHS; + } + } +} + +CLGEMMKernelType +CLGEMMDefaultTypeBifrost::g76_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +{ + ARM_COMPUTE_UNUSED(b); + + if (!is_rhs_constant) + { + return CLGEMMKernelType::NATIVE; + } + + if (m == 1) + { + return CLGEMMKernelType::RESHAPED_ONLY_RHS; + } + + const float r_mn = static_cast<float>(m) / static_cast<float>(n); + const float r_nk = static_cast<float>(n) / static_cast<float>(k); + + if (k <= 212) + { + return CLGEMMKernelType::RESHAPED_ONLY_RHS; + } + else + { + if (r_nk <= 0.4990234375f) + { + if (k <= 1392) + { + return CLGEMMKernelType::RESHAPED_ONLY_RHS; + } + else + { + if (m <= 325) + { + return CLGEMMKernelType::RESHAPED_ONLY_RHS; + } + else + { + return CLGEMMKernelType::RESHAPED; + } + } + } + else + { + if (k <= 471) + { + return CLGEMMKernelType::RESHAPED_ONLY_RHS; + } + else + { + if (r_mn <= 0.04475911520421505f) + { + return CLGEMMKernelType::RESHAPED; + } + else + { + return CLGEMMKernelType::RESHAPED_ONLY_RHS; + } + } + } + } +} + +CLGEMMKernelType +CLGEMMDefaultTypeBifrost::g52_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +{ + if (!is_rhs_constant) + { + return CLGEMMKernelType::NATIVE; + } + + if (m == 1) + { + return CLGEMMKernelType::RESHAPED_ONLY_RHS; + } + + if (n <= 127.0000f) + { + if (n <= 63.5000f) + { + return CLGEMMKernelType::RESHAPED_ONLY_RHS; + } + else + { + if (m <= 3616.0000f) + { + if (b <= 18.5000f) + { + if (m <= 2970.5000f) + { + return CLGEMMKernelType::RESHAPED_ONLY_RHS; + } + else + { + if (k <= 104.0000f) + { + return CLGEMMKernelType::RESHAPED_ONLY_RHS; + } + else + { + return CLGEMMKernelType::RESHAPED; + } + } + } + else + { + return CLGEMMKernelType::RESHAPED; + } + } + else + { + return CLGEMMKernelType::RESHAPED; + } + } + } + else + { + if (m <= 12.5000f) + { + return CLGEMMKernelType::RESHAPED_ONLY_RHS; + } + else + { + if (k <= 104.0000f) + { + if (b <= 18.5000f) + { + if (m <= 490.0000f) + { + if (n <= 272.0000f) + { + return CLGEMMKernelType::RESHAPED_ONLY_RHS; + } + else + { + return CLGEMMKernelType::RESHAPED; + } + } + else + { + return CLGEMMKernelType::RESHAPED; + } + } + else + { + 
return CLGEMMKernelType::RESHAPED; + } + } + else + { + if (m <= 226.0000f) + { + if (n <= 140.0000f) + { + if (m <= 179.5000f) + { + return CLGEMMKernelType::RESHAPED; + } + else + { + return CLGEMMKernelType::RESHAPED_ONLY_RHS; + } + } + else + { + return CLGEMMKernelType::RESHAPED; + } + } + else + { + return CLGEMMKernelType::RESHAPED; + } + } + } + } +} + +CLGEMMKernelType +CLGEMMDefaultTypeBifrost::g71_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +{ + ARM_COMPUTE_UNUSED(b); + ARM_COMPUTE_UNUSED(n); + ARM_COMPUTE_UNUSED(k); + + if (is_rhs_constant) + { + if (m == 1) + { + return CLGEMMKernelType::RESHAPED_ONLY_RHS; + } + else + { + return CLGEMMKernelType::RESHAPED; + } + } + else + { + return CLGEMMKernelType::NATIVE; + } +} +} // namespace cl_gemm +} // namespace arm_compute diff --git a/src/runtime/CL/gemm/CLGEMMDefaultTypeBifrost.h b/src/runtime/CL/gemm/CLGEMMDefaultTypeBifrost.h new file mode 100644 index 0000000000..0cbab35c2e --- /dev/null +++ b/src/runtime/CL/gemm/CLGEMMDefaultTypeBifrost.h @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
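A minimal worked trace of the g76_f16() decision tree above, using invented shape values; this is an illustration only, not part of the patch:

    // Illustrative trace of g76_f16() for an invented shape
    // (m = 320, n = 1024, k = 4096, constant RHS). Not ACL code.
    #include <iostream>

    int main()
    {
        const unsigned int m = 320, n = 1024, k = 4096;
        const float r_nk = static_cast<float>(n) / static_cast<float>(k); // = 0.25f
        // Branches taken in g76_f16 above: m != 1, k > 212, r_nk <= 0.4990234375f,
        // k > 1392, m <= 325  =>  CLGEMMKernelType::RESHAPED_ONLY_RHS.
        std::cout << "r_nk = " << r_nk << " -> RESHAPED_ONLY_RHS\n";
        return 0;
    }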
+ */ +#ifndef SRC_CLGEMMDEFAULTTYPEBIFROST_H +#define SRC_CLGEMMDEFAULTTYPEBIFROST_H + +#include "arm_compute/runtime/CL/ICLGEMMKernelSelection.h" + +namespace arm_compute +{ +namespace cl_gemm +{ +/** Bifrost based OpenCL GEMMKernel selection */ +class CLGEMMDefaultTypeBifrost final : public ICLGEMMKernelSelection +{ +public: + /** Constructor + * + * @param[in] gpu GPU target + */ + CLGEMMDefaultTypeBifrost(GPUTarget gpu); + + // Inherited overridden method + CLGEMMKernelType select_kernel(const CLGEMMKernelSelectionParams &params) override; + +private: + CLGEMMKernelType g52_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant); + CLGEMMKernelType g76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant); + CLGEMMKernelType g76_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant); + CLGEMMKernelType g52_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant); + CLGEMMKernelType g71_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant); + CLGEMMKernelType default_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant); + CLGEMMKernelType default_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant); + CLGEMMKernelType default_q8(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant); +}; +} // namespace cl_gemm +} // namespace arm_compute +#endif /* SRC_CLGEMMDEFAULTTYPEBIFROST_H */ diff --git a/src/runtime/CL/gemm/CLGEMMKernelSelectionMidgard.cpp b/src/runtime/CL/gemm/CLGEMMDefaultTypeMidgard.cpp index a94a392553..673038a8db 100644 --- a/src/runtime/CL/gemm/CLGEMMKernelSelectionMidgard.cpp +++ b/src/runtime/CL/gemm/CLGEMMDefaultTypeMidgard.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020 ARM Limited. + * Copyright (c) 2020-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,13 +21,14 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
 */ -#include "arm_compute/runtime/CL/gemm/CLGEMMKernelSelectionMidgard.h" +#include "src/runtime/CL/gemm/CLGEMMDefaultTypeMidgard.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/gemm/CLGEMMHelpers.h" #include "arm_compute/core/GPUTarget.h" +#include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h" + #include <map> #include <utility> @@ -35,58 +36,59 @@ namespace arm_compute { namespace cl_gemm { -CLGEMMKernelSelectionMidgard::CLGEMMKernelSelectionMidgard(GPUTarget gpu) - : ICLGEMMKernelSelection(gpu) +CLGEMMDefaultTypeMidgard::CLGEMMDefaultTypeMidgard(GPUTarget gpu) : ICLGEMMKernelSelection(gpu) { } -CLGEMMKernelType CLGEMMKernelSelectionMidgard::select_kernel(const CLGEMMKernelSelectionParams &params) +CLGEMMKernelType CLGEMMDefaultTypeMidgard::select_kernel(const CLGEMMKernelSelectionParams &params) { // _target could be used in the future to have a dedicated heuristic for each GPU IP ARM_COMPUTE_UNUSED(_target); - using FunctionExecutorPtr = CLGEMMKernelType (CLGEMMKernelSelectionMidgard::*)(unsigned int m, unsigned int n, unsigned int k, bool is_rhs_constant); + using FunctionExecutorPtr = CLGEMMKernelType (CLGEMMDefaultTypeMidgard::*)( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant); // Configurations for Midgard architectures - static std::map<DataType, FunctionExecutorPtr> gemm_configs = - { - { DataType::F32, &CLGEMMKernelSelectionMidgard::default_f32 }, - { DataType::F16, &CLGEMMKernelSelectionMidgard::default_f16 }, - { DataType::QASYMM8, &CLGEMMKernelSelectionMidgard::default_q8 }, - { DataType::QASYMM8_SIGNED, &CLGEMMKernelSelectionMidgard::default_q8 }, - { DataType::QSYMM8, &CLGEMMKernelSelectionMidgard::default_q8 }, - { DataType::QSYMM8_PER_CHANNEL, &CLGEMMKernelSelectionMidgard::default_q8 } - }; + static std::map<DataType, FunctionExecutorPtr> gemm_configs = { + {DataType::F32, &CLGEMMDefaultTypeMidgard::default_f32}, + {DataType::F16, &CLGEMMDefaultTypeMidgard::default_f16}, + {DataType::QASYMM8, &CLGEMMDefaultTypeMidgard::default_q8}, + {DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeMidgard::default_q8}, + {DataType::QSYMM8, &CLGEMMDefaultTypeMidgard::default_q8}, + {DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeMidgard::default_q8}}; const DataType data_type = params.data_type; - if(gemm_configs.find(data_type) != gemm_configs.end()) + if (gemm_configs.find(data_type) != gemm_configs.end()) { - return (this->*gemm_configs[data_type])(params.m, params.n, params.k, params.is_rhs_constant); + return (this->*gemm_configs[data_type])(params.m, params.n, params.k, params.b, params.is_rhs_constant); } ARM_COMPUTE_ERROR("Not supported data type"); } -CLGEMMKernelType CLGEMMKernelSelectionMidgard::default_f32(unsigned int m, unsigned int n, unsigned int k, bool is_rhs_constant) +CLGEMMKernelType CLGEMMDefaultTypeMidgard::default_f32( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) { - ARM_COMPUTE_UNUSED(n, k); + ARM_COMPUTE_UNUSED(n, k, b); // We reshape the matrices only if we do not have the vector-by-matrix case and we reshape the matrix B only once - return ((m != 1) && is_rhs_constant) ? CLGEMMKernelType::RESHAPED_V1 : CLGEMMKernelType::NATIVE_V1; + return ((m != 1) && is_rhs_constant) ? 
 CLGEMMKernelType::RESHAPED : CLGEMMKernelType::NATIVE; } -CLGEMMKernelType CLGEMMKernelSelectionMidgard::default_f16(unsigned int m, unsigned int n, unsigned int k, bool is_rhs_constant) +CLGEMMKernelType CLGEMMDefaultTypeMidgard::default_f16( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) { - ARM_COMPUTE_UNUSED(n, k); + ARM_COMPUTE_UNUSED(n, k, b); // We reshape the matrices only if we do not have the vector-by-matrix case and we reshape the matrix B only once - return ((m != 1) && is_rhs_constant) ? CLGEMMKernelType::RESHAPED_V1 : CLGEMMKernelType::NATIVE_V1; + return ((m != 1) && is_rhs_constant) ? CLGEMMKernelType::RESHAPED : CLGEMMKernelType::NATIVE; } -CLGEMMKernelType CLGEMMKernelSelectionMidgard::default_q8(unsigned int m, unsigned int n, unsigned int k, bool is_rhs_constant) +CLGEMMKernelType CLGEMMDefaultTypeMidgard::default_q8( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) { - ARM_COMPUTE_UNUSED(m, n, k, is_rhs_constant); + ARM_COMPUTE_UNUSED(m, n, k, b, is_rhs_constant); return CLGEMMKernelType::NATIVE; } diff --git a/src/runtime/CL/functions/CLComputeAllAnchors.cpp b/src/runtime/CL/gemm/CLGEMMDefaultTypeMidgard.h index 62714fed5c..241072fd58 100644 --- a/src/runtime/CL/functions/CLComputeAllAnchors.cpp +++ b/src/runtime/CL/gemm/CLGEMMDefaultTypeMidgard.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 ARM Limited. + * Copyright (c) 2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,27 +21,33 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/runtime/CL/functions/CLComputeAllAnchors.h" +#ifndef SRC_CLGEMMDefaultTypeMidgard_H +#define SRC_CLGEMMDefaultTypeMidgard_H -#include "support/MemorySupport.h" +#include "arm_compute/runtime/CL/ICLGEMMKernelSelection.h" namespace arm_compute { -void CLComputeAllAnchors::configure(const ICLTensor *anchors, ICLTensor *all_anchors, const ComputeAnchorsInfo &info) +namespace cl_gemm { - configure(CLKernelLibrary::get().get_compile_context(), anchors, all_anchors, info); -} - -void CLComputeAllAnchors::configure(const CLCompileContext &compile_context, const ICLTensor *anchors, ICLTensor *all_anchors, const ComputeAnchorsInfo &info) +/** Midgard based OpenCL GEMMKernel selection */ +class CLGEMMDefaultTypeMidgard final : public ICLGEMMKernelSelection { - // Configure ComputeAllAnchors kernel - auto k = arm_compute::support::cpp14::make_unique<CLComputeAllAnchorsKernel>(); - k->configure(compile_context, anchors, all_anchors, info); - _kernel = std::move(k); -} +public: + /** Constructor + * + * @param[in] gpu GPU target + */ + CLGEMMDefaultTypeMidgard(GPUTarget gpu); -Status CLComputeAllAnchors::validate(const ITensorInfo *anchors, const ITensorInfo *all_anchors, const ComputeAnchorsInfo &info) -{ - return CLComputeAllAnchorsKernel::validate(anchors, all_anchors, info); -} + // Inherited overridden method + CLGEMMKernelType select_kernel(const CLGEMMKernelSelectionParams &params) override; + +private: + CLGEMMKernelType default_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant); + CLGEMMKernelType default_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant); + CLGEMMKernelType default_q8(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant); +}; +} // namespace cl_gemm } // namespace arm_compute +#endif /* SRC_CLGEMMDefaultTypeMidgard_H */ diff --git 
 a/src/runtime/CL/gemm/CLGEMMDefaultTypeValhall.cpp b/src/runtime/CL/gemm/CLGEMMDefaultTypeValhall.cpp new file mode 100644 index 0000000000..851e23bc84 --- /dev/null +++ b/src/runtime/CL/gemm/CLGEMMDefaultTypeValhall.cpp @@ -0,0 +1,302 @@ +/* + * Copyright (c) 2020-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/runtime/CL/gemm/CLGEMMDefaultTypeValhall.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" + +#include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h" + +#include <map> +#include <utility> + +namespace arm_compute +{ +namespace cl_gemm +{ +CLGEMMDefaultTypeValhall::CLGEMMDefaultTypeValhall(GPUTarget gpu) : ICLGEMMKernelSelection(gpu) +{ +} + +CLGEMMKernelType CLGEMMDefaultTypeValhall::select_kernel(const CLGEMMKernelSelectionParams &params) +{ + // _target could be used in the future to have a dedicated heuristic for each GPU IP + ARM_COMPUTE_UNUSED(_target); + + using FunctionExecutorPtr = CLGEMMKernelType (CLGEMMDefaultTypeValhall::*)( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant); + + // Default configurations for Valhall architectures + static std::map<DataType, FunctionExecutorPtr> gemm_default_configs = { + {DataType::F32, &CLGEMMDefaultTypeValhall::default_f32}, + {DataType::F16, &CLGEMMDefaultTypeValhall::default_f16}, + {DataType::QASYMM8, &CLGEMMDefaultTypeValhall::default_q8}, + {DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeValhall::default_q8}, + {DataType::QSYMM8, &CLGEMMDefaultTypeValhall::default_q8}, + {DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeValhall::default_q8}}; + + // Mali-G77 configurations + static std::map<DataType, FunctionExecutorPtr> gemm_g77_configs = { + {DataType::F32, &CLGEMMDefaultTypeValhall::default_f32}, + {DataType::F16, &CLGEMMDefaultTypeValhall::g77_f16}, + {DataType::QASYMM8, &CLGEMMDefaultTypeValhall::default_q8}, + {DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeValhall::default_q8}, + {DataType::QSYMM8, &CLGEMMDefaultTypeValhall::default_q8}, + {DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeValhall::default_q8}}; + + // Mali-G78 configurations + static std::map<DataType, FunctionExecutorPtr> gemm_g78_configs = { + {DataType::F32, &CLGEMMDefaultTypeValhall::g78_f32}, + {DataType::F16, &CLGEMMDefaultTypeValhall::g78_f16}, + {DataType::QASYMM8, &CLGEMMDefaultTypeValhall::default_q8}, + {DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeValhall::default_q8}, + 
{DataType::QSYMM8, &CLGEMMDefaultTypeValhall::default_q8}, + {DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeValhall::default_q8}}; + + // Mali-G710 and Mali-G610 configurations + static std::map<DataType, FunctionExecutorPtr> gemm_g710_configs = { + {DataType::F32, &CLGEMMDefaultTypeValhall::default_f32}, + {DataType::F16, &CLGEMMDefaultTypeValhall::g710_f16}, + {DataType::QASYMM8, &CLGEMMDefaultTypeValhall::default_q8}, + {DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeValhall::default_q8}, + {DataType::QSYMM8, &CLGEMMDefaultTypeValhall::default_q8}, + {DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeValhall::default_q8}}; + + // Mali-G715 and Mali-G615 configurations + static std::map<DataType, FunctionExecutorPtr> gemm_g715_configs = { + {DataType::F32, &CLGEMMDefaultTypeValhall::g715_f32}, + {DataType::F16, &CLGEMMDefaultTypeValhall::g715_f16}, + {DataType::QASYMM8, &CLGEMMDefaultTypeValhall::default_q8}, + {DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeValhall::default_q8}, + {DataType::QSYMM8, &CLGEMMDefaultTypeValhall::default_q8}, + {DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeValhall::default_q8}}; + + const DataType data_type = params.data_type; + + switch (_target) + { + case GPUTarget::G710: + case GPUTarget::G610: + if (gemm_g710_configs.find(data_type) != gemm_g710_configs.end()) + { + return (this->*gemm_g710_configs[data_type])(params.m, params.n, params.k, params.b, + params.is_rhs_constant); + } + ARM_COMPUTE_ERROR("Not supported data type"); + case GPUTarget::G715: + case GPUTarget::G615: + if (gemm_g715_configs.find(data_type) != gemm_g715_configs.end()) + { + return (this->*gemm_g715_configs[data_type])(params.m, params.n, params.k, params.b, + params.is_rhs_constant); + } + ARM_COMPUTE_ERROR("Not supported data type"); + case GPUTarget::G78: + if (gemm_g78_configs.find(data_type) != gemm_g78_configs.end()) + { + return (this->*gemm_g78_configs[data_type])(params.m, params.n, params.k, params.b, + params.is_rhs_constant); + } + ARM_COMPUTE_ERROR("Not supported data type"); + case GPUTarget::G77: + if (gemm_g77_configs.find(data_type) != gemm_g77_configs.end()) + { + return (this->*gemm_g77_configs[data_type])(params.m, params.n, params.k, params.b, + params.is_rhs_constant); + } + ARM_COMPUTE_ERROR("Not supported data type"); + default: + if (gemm_default_configs.find(data_type) != gemm_default_configs.end()) + { + return (this->*gemm_default_configs[data_type])(params.m, params.n, params.k, params.b, + params.is_rhs_constant); + } + ARM_COMPUTE_ERROR("Not supported data type"); + } +} + +CLGEMMKernelType CLGEMMDefaultTypeValhall::default_f32( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +{ + ARM_COMPUTE_UNUSED(m, n, k, b); + + return is_rhs_constant ? CLGEMMKernelType::RESHAPED_ONLY_RHS : CLGEMMKernelType::NATIVE; +} + +CLGEMMKernelType CLGEMMDefaultTypeValhall::default_f16( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +{ + ARM_COMPUTE_UNUSED(m, n, k, b); + + return is_rhs_constant ? CLGEMMKernelType::RESHAPED_ONLY_RHS : CLGEMMKernelType::NATIVE; +} + +CLGEMMKernelType +CLGEMMDefaultTypeValhall::g77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +{ + ARM_COMPUTE_UNUSED(m, n, k, b); + + return is_rhs_constant ? 
CLGEMMKernelType::RESHAPED_ONLY_RHS : CLGEMMKernelType::NATIVE; +} + +CLGEMMKernelType +CLGEMMDefaultTypeValhall::g710_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +{ + ARM_COMPUTE_UNUSED(m, n, k, b); + + return is_rhs_constant ? CLGEMMKernelType::RESHAPED_ONLY_RHS : CLGEMMKernelType::NATIVE; +} + +CLGEMMKernelType CLGEMMDefaultTypeValhall::default_q8( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +{ + ARM_COMPUTE_UNUSED(m, n, k, b); + + if (is_rhs_constant) + { + return CLGEMMKernelType::RESHAPED_ONLY_RHS; + } + else + { + return CLGEMMKernelType::NATIVE; + } +} + +CLGEMMKernelType +CLGEMMDefaultTypeValhall::g78_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +{ + ARM_COMPUTE_UNUSED(b); + + if (!is_rhs_constant) + { + return CLGEMMKernelType::NATIVE; + } + + if (m == 1) + { + return CLGEMMKernelType::RESHAPED_ONLY_RHS; + } + + if (n <= 272.0000f) + { + return CLGEMMKernelType::RESHAPED_ONLY_RHS; + } + else + { + if (k <= 471.0000f) + { + return CLGEMMKernelType::RESHAPED_ONLY_RHS; + } + else + { + if (m <= 72.5000f) + { + return CLGEMMKernelType::RESHAPED_ONLY_RHS; + } + else + { + if (m <= 90.5000f) + { + return CLGEMMKernelType::RESHAPED; + } + else + { + if (k <= 2448.0000f) + { + if (n <= 756.0000f) + { + return CLGEMMKernelType::RESHAPED_ONLY_RHS; + } + else + { + return CLGEMMKernelType::RESHAPED; + } + } + else + { + return CLGEMMKernelType::RESHAPED; + } + } + } + } + } +} + +CLGEMMKernelType +CLGEMMDefaultTypeValhall::g78_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +{ + ARM_COMPUTE_UNUSED(m, n, k, b); + + if (!is_rhs_constant) + { + return CLGEMMKernelType::NATIVE; + } + + return CLGEMMKernelType::RESHAPED_ONLY_RHS; +} + +CLGEMMKernelType +CLGEMMDefaultTypeValhall::g715_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +{ + if (!is_rhs_constant) + { + return default_f32(m, n, k, b, is_rhs_constant); + } + + unsigned int best_m0; + unsigned int best_n0; + + if (opencl::kernels::gemm::is_mmul_kernel_preferred(m, n, k, b, DataType::F32, best_m0, best_n0)) + { + return CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL; + } + else + { + return default_f32(m, n, k, b, is_rhs_constant); + } +} + +CLGEMMKernelType +CLGEMMDefaultTypeValhall::g715_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +{ + if (!is_rhs_constant) + { + return g78_f16(m, n, k, b, is_rhs_constant); + } + + unsigned int best_m0; + unsigned int best_n0; + + if (opencl::kernels::gemm::is_mmul_kernel_preferred(m, n, k, b, DataType::F16, best_m0, best_n0)) + { + return CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL; + } + else + { + return g78_f16(m, n, k, b, is_rhs_constant); + } +} + +} // namespace cl_gemm +} // namespace arm_compute diff --git a/src/runtime/CL/gemm/CLGEMMDefaultTypeValhall.h b/src/runtime/CL/gemm/CLGEMMDefaultTypeValhall.h new file mode 100644 index 0000000000..e190295ee4 --- /dev/null +++ b/src/runtime/CL/gemm/CLGEMMDefaultTypeValhall.h @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2020-2023 Arm Limited. 
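The g715_f32()/g715_f16() paths above first probe the MMUL-specialised kernel and only then fall back to the older heuristic. A hedged sketch of that pattern, with a hypothetical predicate standing in for opencl::kernels::gemm::is_mmul_kernel_preferred(), whose implementation is outside this hunk:

    #include <functional>

    enum class KernelType
    {
        RESHAPED_ONLY_RHS,
        RESHAPED_ONLY_RHS_MMUL
    };

    // Hypothetical stand-in; the real predicate also reports the preferred
    // block sizes through best_m0/best_n0.
    static bool mmul_preferred(unsigned int m, unsigned int n, unsigned int k, unsigned int b,
                               unsigned int &best_m0, unsigned int &best_n0)
    {
        best_m0 = 4;
        best_n0 = 4;
        return (m % 4 == 0) && (n % 4 == 0) && (k >= 64) && (b >= 1); // invented condition
    }

    static KernelType select_g715_like(unsigned int m, unsigned int n, unsigned int k, unsigned int b,
                                       const std::function<KernelType()> &fallback)
    {
        unsigned int best_m0 = 0, best_n0 = 0;
        // Try the MMUL variant first; otherwise reuse the pre-existing heuristic unchanged.
        return mmul_preferred(m, n, k, b, best_m0, best_n0) ? KernelType::RESHAPED_ONLY_RHS_MMUL : fallback();
    }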
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef SRC_CLGEMMDEFAULTTYPEVALHALL_H +#define SRC_CLGEMMDEFAULTTYPEVALHALL_H + +#include "arm_compute/runtime/CL/ICLGEMMKernelSelection.h" + +namespace arm_compute +{ +namespace cl_gemm +{ +/** Valhall based OpenCL GEMMKernel selection */ +class CLGEMMDefaultTypeValhall final : public ICLGEMMKernelSelection +{ +public: + /** Constructor + * + * @param[in] gpu GPU target + */ + CLGEMMDefaultTypeValhall(GPUTarget gpu); + + // Inherited overridden method + CLGEMMKernelType select_kernel(const CLGEMMKernelSelectionParams &params) override; + +private: + CLGEMMKernelType default_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant); + CLGEMMKernelType default_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant); + CLGEMMKernelType default_q8(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant); + CLGEMMKernelType g77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant); + CLGEMMKernelType g78_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant); + CLGEMMKernelType g78_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant); + CLGEMMKernelType g710_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant); + CLGEMMKernelType g715_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant); + CLGEMMKernelType g715_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant); +}; +} // namespace cl_gemm +} // namespace arm_compute +#endif /* SRC_CLGEMMDEFAULTTYPEVALHALL_H */ diff --git a/src/runtime/CL/gemm/CLGEMMKernelSelection.h b/src/runtime/CL/gemm/CLGEMMKernelSelection.h new file mode 100644 index 0000000000..98dd44b1bf --- /dev/null +++ b/src/runtime/CL/gemm/CLGEMMKernelSelection.h @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2020, 2023 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ACL_SRC_RUNTIME_CL_GEMM_CLGEMMKERNELSELECTION_H +#define ACL_SRC_RUNTIME_CL_GEMM_CLGEMMKERNELSELECTION_H + +#include "arm_compute/runtime/CL/ICLGEMMKernelSelection.h" + +#include "src/runtime/CL/gemm/CLGEMMDefaultTypeBifrost.h" +#include "src/runtime/CL/gemm/CLGEMMDefaultTypeMidgard.h" +#include "src/runtime/CL/gemm/CLGEMMDefaultTypeValhall.h" + +namespace arm_compute +{ +namespace cl_gemm +{ +/** CLGEMMKernelSelection factory class */ +class CLGEMMKernelSelectionFactory final +{ +public: + /** Static method to select the GEMM kernel according to the GPU target and the GEMM's dimensionality + * + * @param[in] gpu GPU target + * + * @return CLGEMMKernelSelection class + */ + static std::unique_ptr<ICLGEMMKernelSelection> create(GPUTarget gpu) + { + switch (get_arch_from_target(gpu)) + { + case GPUTarget::MIDGARD: + return std::make_unique<CLGEMMDefaultTypeMidgard>(gpu); + case GPUTarget::BIFROST: + return std::make_unique<CLGEMMDefaultTypeBifrost>(gpu); + case GPUTarget::VALHALL: + case GPUTarget::FIFTHGEN: + return std::make_unique<CLGEMMDefaultTypeValhall>(gpu); + default: + ARM_COMPUTE_ERROR("Not supported GPU target"); + } + } +}; +} // namespace cl_gemm +} // namespace arm_compute +#endif // ACL_SRC_RUNTIME_CL_GEMM_CLGEMMKERNELSELECTION_H diff --git a/src/runtime/CL/gemm/CLGEMMKernelSelectionBifrost.cpp b/src/runtime/CL/gemm/CLGEMMKernelSelectionBifrost.cpp deleted file mode 100644 index 041e7d6cb4..0000000000 --- a/src/runtime/CL/gemm/CLGEMMKernelSelectionBifrost.cpp +++ /dev/null @@ -1,236 +0,0 @@ -/* - * Copyright (c) 2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
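A hedged usage sketch for the CLGEMMKernelSelectionFactory above; the shape values are invented, and the parameter fields match the ones populated later in CLGEMMAutoHeuristics.cpp:

    #include <memory>

    #include "arm_compute/runtime/CL/ICLGEMMKernelSelection.h"
    #include "src/runtime/CL/gemm/CLGEMMKernelSelection.h"

    using namespace arm_compute;
    using namespace arm_compute::cl_gemm;

    CLGEMMKernelType pick_kernel_type(GPUTarget gpu)
    {
        // Dispatches to the Midgard/Bifrost/Valhall heuristic matching this GPU.
        std::unique_ptr<ICLGEMMKernelSelection> selector = CLGEMMKernelSelectionFactory::create(gpu);

        CLGEMMKernelSelectionParams params;
        params.m               = 128; // invented shape
        params.n               = 128;
        params.k               = 256;
        params.b               = 1;
        params.is_rhs_constant = true; // e.g. convolution weights, reshaped only once
        params.data_type       = DataType::F32;

        return selector->select_kernel(params);
    }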
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/CL/gemm/CLGEMMKernelSelectionBifrost.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/gemm/CLGEMMHelpers.h" - -#include <map> -#include <utility> - -namespace arm_compute -{ -namespace cl_gemm -{ -CLGEMMKernelSelectionBifrost::CLGEMMKernelSelectionBifrost(GPUTarget gpu) - : ICLGEMMKernelSelection(gpu) -{ -} - -CLGEMMKernelType CLGEMMKernelSelectionBifrost::select_kernel(const CLGEMMKernelSelectionParams &params) -{ - // _target could be used in the future to have a dedicated heuristic for each GPU IP - ARM_COMPUTE_UNUSED(_target); - - using FunctionExecutorPtr = CLGEMMKernelType (CLGEMMKernelSelectionBifrost::*)(unsigned int m, unsigned int n, unsigned int k, bool is_rhs_constant); - - // Default configurations for Bifrost architectures - static std::map<DataType, FunctionExecutorPtr> gemm_default_configs = - { - { DataType::F32, &CLGEMMKernelSelectionBifrost::default_f32 }, - { DataType::F16, &CLGEMMKernelSelectionBifrost::default_f16 }, - { DataType::QASYMM8, &CLGEMMKernelSelectionBifrost::default_q8 }, - { DataType::QASYMM8_SIGNED, &CLGEMMKernelSelectionBifrost::default_q8 }, - { DataType::QSYMM8, &CLGEMMKernelSelectionBifrost::default_q8 }, - { DataType::QSYMM8_PER_CHANNEL, &CLGEMMKernelSelectionBifrost::default_q8 } - }; - - // Mali-G71 configurations - static std::map<DataType, FunctionExecutorPtr> gemm_g71_configs = - { - { DataType::F32, &CLGEMMKernelSelectionBifrost::default_f32 }, - { DataType::F16, &CLGEMMKernelSelectionBifrost::g71_f16 }, - { DataType::QASYMM8, &CLGEMMKernelSelectionBifrost::default_q8 }, - { DataType::QASYMM8_SIGNED, &CLGEMMKernelSelectionBifrost::default_q8 }, - { DataType::QSYMM8, &CLGEMMKernelSelectionBifrost::default_q8 }, - { DataType::QSYMM8_PER_CHANNEL, &CLGEMMKernelSelectionBifrost::default_q8 } - }; - - // Mali-G76 configurations - static std::map<DataType, FunctionExecutorPtr> gemm_g76_configs = - { - { DataType::F32, &CLGEMMKernelSelectionBifrost::g76_f32 }, - { DataType::F16, &CLGEMMKernelSelectionBifrost::default_f16 }, - { DataType::QASYMM8, &CLGEMMKernelSelectionBifrost::default_q8 }, - { DataType::QASYMM8_SIGNED, &CLGEMMKernelSelectionBifrost::default_q8 }, - { DataType::QSYMM8, &CLGEMMKernelSelectionBifrost::default_q8 }, - { DataType::QSYMM8_PER_CHANNEL, &CLGEMMKernelSelectionBifrost::default_q8 } - }; - - const DataType data_type = params.data_type; - - switch(_target) - { - case GPUTarget::G71: - if(gemm_g71_configs.find(data_type) != gemm_g71_configs.end()) - { - return (this->*gemm_g71_configs[data_type])(params.m, params.n, params.k, params.is_rhs_constant); - } - ARM_COMPUTE_ERROR("Not supported data type"); - case GPUTarget::G76: - if(gemm_g76_configs.find(data_type) != gemm_g76_configs.end()) - { - return (this->*gemm_g76_configs[data_type])(params.m, params.n, params.k, params.is_rhs_constant); - } - ARM_COMPUTE_ERROR("Not supported data type"); - default: - if(gemm_default_configs.find(data_type) != gemm_default_configs.end()) - { - 
return (this->*gemm_default_configs[data_type])(params.m, params.n, params.k, params.is_rhs_constant); - } - ARM_COMPUTE_ERROR("Not supported data type"); - } -} - -CLGEMMKernelType CLGEMMKernelSelectionBifrost::default_f32(unsigned int m, unsigned int n, unsigned int k, bool is_rhs_constant) -{ - CLGEMMKernelType gemm_type = CLGEMMKernelType::NATIVE_V1; - - if(is_rhs_constant) - { - if((m > 1) && (n < 16)) - { - gemm_type = CLGEMMKernelType::RESHAPED_V1; - } - else if(m == 1) - { - gemm_type = CLGEMMKernelType::RESHAPED_ONLY_RHS; - } - else - { - if((k > 256) && (m > 4)) - { - constexpr float alpha = 3.2f; - constexpr float fact0 = 1.51f; - constexpr float fact1 = 1.66f; - constexpr float ops = 12.0f; - const float scale = k > 1024 ? 1.07f : 1.0f; - gemm_type = (alpha + ((n * fact0) / ops) < ((fact1 * n * scale) / ops)) ? CLGEMMKernelType::RESHAPED_V1 : CLGEMMKernelType::NATIVE_V1; - } - else - { - gemm_type = CLGEMMKernelType::NATIVE_V1; - } - } - - const auto workload = static_cast<float>((m * n) / 20.0f); - - gemm_type = ((workload > 1600.0f) && (gemm_type == CLGEMMKernelType::RESHAPED_V1)) ? CLGEMMKernelType::RESHAPED : gemm_type; - } - - return gemm_type; -} - -CLGEMMKernelType CLGEMMKernelSelectionBifrost::default_f16(unsigned int m, unsigned int n, unsigned int k, bool is_rhs_constant) -{ - ARM_COMPUTE_UNUSED(n, k); - if(is_rhs_constant) - { - if(m == 1) - { - return CLGEMMKernelType::RESHAPED_ONLY_RHS; - } - else - { - return CLGEMMKernelType::RESHAPED; - } - } - else - { - return CLGEMMKernelType::NATIVE_V1; - } -} - -CLGEMMKernelType CLGEMMKernelSelectionBifrost::default_q8(unsigned int m, unsigned int n, unsigned int k, bool is_rhs_constant) -{ - ARM_COMPUTE_UNUSED(m, n, k); - - if(is_rhs_constant) - { - return CLGEMMKernelType::RESHAPED_ONLY_RHS; - } - else - { - return CLGEMMKernelType::NATIVE; - } -} - -CLGEMMKernelType CLGEMMKernelSelectionBifrost::g76_f32(unsigned int m, unsigned int n, unsigned int k, bool is_rhs_constant) -{ - CLGEMMKernelType gemm_type = CLGEMMKernelType::NATIVE_V1; - - if(is_rhs_constant) - { - if((m > 1) && (n < 16)) - { - gemm_type = CLGEMMKernelType::RESHAPED; - } - else if(m == 1) - { - gemm_type = CLGEMMKernelType::RESHAPED_ONLY_RHS; - } - else - { - if((k > 256) && (m > 4)) - { - gemm_type = CLGEMMKernelType::RESHAPED; - } - else - { - gemm_type = CLGEMMKernelType::RESHAPED_ONLY_RHS; - } - } - } - - return gemm_type; -} - -CLGEMMKernelType CLGEMMKernelSelectionBifrost::g71_f16(unsigned int m, unsigned int n, unsigned int k, bool is_rhs_constant) -{ - if(is_rhs_constant) - { - if(m == 1) - { - if(n > k) - { - return CLGEMMKernelType::NATIVE_V1; - } - else - { - return CLGEMMKernelType::RESHAPED_ONLY_RHS; - } - } - else - { - return CLGEMMKernelType::RESHAPED; - } - } - else - { - return CLGEMMKernelType::NATIVE_V1; - } -} -} // namespace cl_gemm -} // namespace arm_compute diff --git a/src/runtime/CL/gemm/CLGEMMKernelSelectionValhall.cpp b/src/runtime/CL/gemm/CLGEMMKernelSelectionValhall.cpp deleted file mode 100644 index 775bb9bffd..0000000000 --- a/src/runtime/CL/gemm/CLGEMMKernelSelectionValhall.cpp +++ /dev/null @@ -1,98 +0,0 @@ -/* - * Copyright (c) 2020 ARM Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/CL/gemm/CLGEMMKernelSelectionValhall.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/gemm/CLGEMMHelpers.h" - -#include <map> -#include <utility> - -namespace arm_compute -{ -namespace cl_gemm -{ -CLGEMMKernelSelectionValhall::CLGEMMKernelSelectionValhall(GPUTarget gpu) - : ICLGEMMKernelSelection(gpu) -{ -} - -CLGEMMKernelType CLGEMMKernelSelectionValhall::select_kernel(const CLGEMMKernelSelectionParams &params) -{ - // _target could be used in the future to have a dedicated heuristic for each GPU IP - ARM_COMPUTE_UNUSED(_target); - - using FunctionExecutorPtr = CLGEMMKernelType (CLGEMMKernelSelectionValhall::*)(unsigned int m, unsigned int n, unsigned int k, bool is_rhs_constant); - - // Configurations for Valhall architectures - static std::map<DataType, FunctionExecutorPtr> gemm_configs = - { - { DataType::F32, &CLGEMMKernelSelectionValhall::default_f32 }, - { DataType::F16, &CLGEMMKernelSelectionValhall::default_f16 }, - { DataType::QASYMM8, &CLGEMMKernelSelectionValhall::default_q8 }, - { DataType::QASYMM8_SIGNED, &CLGEMMKernelSelectionValhall::default_q8 }, - { DataType::QSYMM8, &CLGEMMKernelSelectionValhall::default_q8 }, - { DataType::QSYMM8_PER_CHANNEL, &CLGEMMKernelSelectionValhall::default_q8 } - }; - - const DataType data_type = params.data_type; - - if(gemm_configs.find(data_type) != gemm_configs.end()) - { - return (this->*gemm_configs[data_type])(params.m, params.n, params.k, params.is_rhs_constant); - } - - ARM_COMPUTE_ERROR("Not supported data type"); -} - -CLGEMMKernelType CLGEMMKernelSelectionValhall::default_f32(unsigned int m, unsigned int n, unsigned int k, bool is_rhs_constant) -{ - ARM_COMPUTE_UNUSED(m, n, k); - - return is_rhs_constant ? CLGEMMKernelType::RESHAPED_ONLY_RHS : CLGEMMKernelType::NATIVE_V1; -} - -CLGEMMKernelType CLGEMMKernelSelectionValhall::default_f16(unsigned int m, unsigned int n, unsigned int k, bool is_rhs_constant) -{ - ARM_COMPUTE_UNUSED(m, n, k); - - return is_rhs_constant ? 
CLGEMMKernelType::RESHAPED_ONLY_RHS : CLGEMMKernelType::NATIVE_V1; -} - -CLGEMMKernelType CLGEMMKernelSelectionValhall::default_q8(unsigned int m, unsigned int n, unsigned int k, bool is_rhs_constant) -{ - ARM_COMPUTE_UNUSED(m, n, k); - - if(is_rhs_constant) - { - return CLGEMMKernelType::RESHAPED_ONLY_RHS; - } - else - { - return CLGEMMKernelType::NATIVE; - } -} -} // namespace cl_gemm -} // namespace arm_compute diff --git a/src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.cpp b/src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.cpp new file mode 100644 index 0000000000..8df57197e2 --- /dev/null +++ b/src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.cpp @@ -0,0 +1,205 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.h" + +#include "arm_compute/core/Log.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/CL/CLScheduler.h" +#include "arm_compute/runtime/CL/ICLGEMMKernelSelection.h" + +#include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h" +#include "src/gpu/cl/kernels/gemm/IClGemmKernelConfig.h" +#include "src/gpu/cl/kernels/gemm/native/ClGemmNativeKernelConfig.h" +#include "src/gpu/cl/kernels/gemm/reshaped/ClGemmReshapedKernelConfig.h" +#include "src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmReshapedOnlyRhsKernelConfig.h" +#include "src/runtime/CL/gemm/CLGEMMKernelSelection.h" +#include "src/runtime/CL/mlgo/MLGOHeuristics.h" +#include "src/runtime/CL/mlgo/Utils.h" +#include "utils/TypePrinter.h" + +namespace arm_compute +{ +namespace cl_gemm +{ +namespace auto_heuristics +{ +using namespace arm_compute::opencl::kernels::gemm; + +GEMMTypeResult select_mlgo_gemm_kernel(const CommonQuery &query, bool reshape_b_only_on_first_run) +{ + ARM_COMPUTE_UNUSED(reshape_b_only_on_first_run); + bool valid = false; + CLGEMMKernelType gemm_type{}; + const auto mlgo_heuristics = CLScheduler::get().gemm_heuristics(); + if (mlgo_heuristics != nullptr) + { + std::tie(valid, gemm_type) = mlgo_heuristics->get()->query_gemm_type( + mlgo::Query{string_from_target(query.gpu_target), query.data_type, query.m, query.n, query.k, query.b}); + } + if (valid) + { + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics query returns gemm type: %s.", + to_string(gemm_type).c_str()); + } + else + { + ARM_COMPUTE_LOG_INFO_MSG_CORE("MLGOHeuristics query failed"); + } + return GEMMTypeResult(valid, gemm_type); +} + +GEMMTypeResult select_default_gemm_kernel(const CommonQuery &query, bool reshape_b_only_on_first_run) +{ + std::unique_ptr<ICLGEMMKernelSelection> default_heuristics = CLGEMMKernelSelectionFactory::create(query.gpu_target); + ARM_COMPUTE_ERROR_ON_NULLPTR(default_heuristics.get()); + + CLGEMMKernelSelectionParams params; + params.m = query.m; + params.n = query.n; + params.k = query.k; + params.b = query.b; + params.is_rhs_constant = reshape_b_only_on_first_run; + params.data_type = query.data_type; + + const auto kernel_type = default_heuristics->select_kernel(params); + return GEMMTypeResult(true, kernel_type); +} + +GEMMConfigResult select_default_gemm_config_reshaped_only_rhs(const CommonQuery &query) +{ + GEMMLHSMatrixInfo lhs_info; + GEMMRHSMatrixInfo rhs_info; + std::unique_ptr<IClGemmKernelConfig> gemm_config = + ClGemmReshapedOnlyRhsKernelConfigurationFactory::create(query.gpu_target); + ARM_COMPUTE_ERROR_ON_NULLPTR(gemm_config.get()); + std::tie(lhs_info, rhs_info) = gemm_config->configure(query.m, query.n, query.k, query.b, query.data_type); + return GEMMConfigResult{true, lhs_info, rhs_info}; +} + +GEMMConfigResult select_mlgo_gemm_config_reshaped_only_rhs(const CommonQuery &query) +{ + bool valid = false; + GEMMLHSMatrixInfo lhs_info; + GEMMRHSMatrixInfo rhs_info; + mlgo::GEMMConfigReshapedOnlyRHS config{}; + const auto mlgo_heuristics = CLScheduler::get().gemm_heuristics(); + if (mlgo_heuristics != nullptr) + { + std::tie(valid, config) = mlgo_heuristics->get()->query_gemm_config_reshaped_only_rhs( + mlgo::Query{string_from_target(query.gpu_target), query.data_type, query.m, query.n, query.k, query.b}); + } + if (valid) + { + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics query returns gemm config: %s.", + to_string(config).c_str()); + // Setting irrelevant unsigned int parameters to 1 and bool parameters to 
false as they do not matter + std::tie(lhs_info, rhs_info) = configure_lhs_rhs_info( + query.m, query.n, config.m0, config.n0, config.k0, 1, config.h0, false, config.interleave_rhs, + !config.transpose_rhs, config.transpose_rhs, config.export_cl_image); + } + else + { + ARM_COMPUTE_LOG_INFO_MSG_CORE("MLGOHeuristics query failed"); + } + return GEMMConfigResult{valid, lhs_info, rhs_info}; +} + +GEMMConfigResult select_default_gemm_config_reshaped(const CommonQuery &query) +{ + GEMMLHSMatrixInfo lhs_info; + GEMMRHSMatrixInfo rhs_info; + std::unique_ptr<IClGemmKernelConfig> gemm_config = + ClGemmReshapedKernelConfigurationFactory::create(query.gpu_target); + ARM_COMPUTE_ERROR_ON_NULLPTR(gemm_config.get()); + std::tie(lhs_info, rhs_info) = gemm_config->configure(query.m, query.n, query.k, query.b, query.data_type); + return GEMMConfigResult{true, lhs_info, rhs_info}; +} + +GEMMConfigResult select_mlgo_gemm_config_reshaped(const CommonQuery &query) +{ + bool valid = false; + GEMMLHSMatrixInfo lhs_info; + GEMMRHSMatrixInfo rhs_info; + mlgo::GEMMConfigReshaped config{}; + const auto mlgo_heuristics = CLScheduler::get().gemm_heuristics(); + if (mlgo_heuristics != nullptr) + { + std::tie(valid, config) = mlgo_heuristics->get()->query_gemm_config_reshaped( + mlgo::Query{string_from_target(query.gpu_target), query.data_type, query.m, query.n, query.k, query.b}); + } + if (valid) + { + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics query returns gemm config: %s.", + to_string(config).c_str()); + std::tie(lhs_info, rhs_info) = configure_lhs_rhs_info( + query.m, query.n, config.m0, config.n0, config.k0, config.v0, config.h0, config.interleave_lhs, + config.interleave_rhs, !config.transpose_rhs, config.transpose_rhs, config.export_cl_image); + } + else + { + ARM_COMPUTE_LOG_INFO_MSG_CORE("MLGOHeuristics query failed"); + } + return GEMMConfigResult{valid, lhs_info, rhs_info}; +} + +GEMMConfigResult select_default_gemm_config_native(const CommonQuery &query) +{ + GEMMLHSMatrixInfo lhs_info; + GEMMRHSMatrixInfo rhs_info; + std::unique_ptr<IClGemmKernelConfig> gemm_config = ClGemmNativeKernelConfigurationFactory::create(query.gpu_target); + ARM_COMPUTE_ERROR_ON_NULLPTR(gemm_config.get()); + std::tie(lhs_info, rhs_info) = gemm_config->configure(query.m, query.n, query.k, query.b, query.data_type); + return GEMMConfigResult{true, lhs_info, rhs_info}; +} + +GEMMConfigResult select_mlgo_gemm_config_native(const CommonQuery &query) +{ + bool valid = false; + GEMMLHSMatrixInfo lhs_info; + GEMMRHSMatrixInfo rhs_info; + mlgo::GEMMConfigNative config{}; + const auto mlgo_heuristics = CLScheduler::get().gemm_heuristics(); + if (mlgo_heuristics != nullptr) + { + std::tie(valid, config) = mlgo_heuristics->get()->query_gemm_config_native( + mlgo::Query{string_from_target(query.gpu_target), query.data_type, query.m, query.n, query.k, query.b}); + } + if (valid) + { + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics query returns gemm config: %s.", + to_string(config).c_str()); + // Setting irrelevant unsigned int parameters to 1 and bool parameters to false as they do not matter + std::tie(lhs_info, rhs_info) = opencl::kernels::gemm::configure_lhs_rhs_info( + query.m, query.n, config.m0, config.n0, config.k0, 1, 1, false, false, false, false, false); + } + else + { + ARM_COMPUTE_LOG_INFO_MSG_CORE("MLGOHeuristics query failed"); + } + return GEMMConfigResult{valid, lhs_info, rhs_info}; +} +} // namespace auto_heuristics + +} // namespace cl_gemm +} // namespace arm_compute diff --git 
a/src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.h b/src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.h new file mode 100644 index 0000000000..f544715e03 --- /dev/null +++ b/src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.h @@ -0,0 +1,136 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef SRC_RUNTIME_CL_GEMM_AUTO_HEURISTICS_CL_GEMM_AUTO_HEURISTICS_H +#define SRC_RUNTIME_CL_GEMM_AUTO_HEURISTICS_CL_GEMM_AUTO_HEURISTICS_H + +#include "arm_compute/core/GPUTarget.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/CL/CLTypes.h" + +namespace arm_compute +{ +namespace cl_gemm +{ +namespace auto_heuristics +{ +/** A collection of adaptor functions that enable the auto selection between mlgo-based heuristics and default heuristics */ + +/** Common query */ +struct CommonQuery +{ + GPUTarget gpu_target; /**< Which @ref GPUTarget to query about */ + DataType data_type; /**< Data type */ + unsigned int m; /**< Number of rows for the lhs matrix. Lhs matrix NOT transposed */ + unsigned int n; /**< Number of columns for the rhs matrix. Rhs matrix NOT transposed */ + unsigned int k; /**< Number of rows for the rhs matrix. Rhs matrix NOT transposed */ + unsigned int b; /**< Batch size */ +}; + +/** Result of querying about GEMM type ( @ref CLGEMMKernelType) */ +struct GEMMTypeResult +{ + GEMMTypeResult(bool valid, CLGEMMKernelType gemm_type) : valid{valid}, gemm_type{gemm_type} + { + } + /** Test if the result is valid */ + operator bool() const + { + return valid; + } + bool valid; /** If the result is valid */ + CLGEMMKernelType gemm_type; /** @ref CLGEMMKernelType */ +}; + +/** Result of querying about GEMM config ( @ref GEMMLHSMatrixInfo and @ref GEMMRHSMatrixInfo) */ +struct GEMMConfigResult +{ + GEMMConfigResult(bool valid, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info) + : valid{valid}, lhs_info{lhs_info}, rhs_info{rhs_info} + { + } + /** Test if the result is valid */ + operator bool() const + { + return valid; + } + bool valid; /** If the result is valid */ + GEMMLHSMatrixInfo lhs_info; /** @ref GEMMLHSMatrixInfo */ + GEMMRHSMatrixInfo rhs_info; /** @ref GEMMRHSMatrixInfo */ +}; + +/** Select gemm type based on mlgo heuristics + * @param query Query + * @param reshape_b_only_on_first_run Additional query parameter if reshape b only on first run + * @return GEMMTypeResult. 
Result is valid if bool(GEMMTypeResult) == true and invalid otherwise + */ +GEMMTypeResult select_mlgo_gemm_kernel(const CommonQuery &query, bool reshape_b_only_on_first_run); + +/** Select gemm type based on default heuristics + * @param query Query + * @param reshape_b_only_on_first_run Additional query parameter if reshape b only on first run + * @return GEMMTypeResult. Result is valid if bool(GEMMTypeResult) == true and invalid otherwise + */ +GEMMTypeResult select_default_gemm_kernel(const CommonQuery &query, bool reshape_b_only_on_first_run); + +/** Select gemm config based on mlgo heuristics + * @param query Query + * @return GEMMConfigResult. Result is valid if bool(GEMMConfigResult) == true and invalid otherwise + */ +GEMMConfigResult select_mlgo_gemm_config_reshaped_only_rhs(const CommonQuery &query); + +/** Select gemm config based on default heuristics + * @param query Query + * @return GEMMConfigResult. Result is valid if bool(GEMMConfigResult) == true and invalid otherwise + */ +GEMMConfigResult select_default_gemm_config_reshaped_only_rhs(const CommonQuery &query); + +/** Select gemm config based on mlgo heuristics + * @param query Query + * @return GEMMConfigResult. Result is valid if bool(GEMMConfigResult) == true and invalid otherwise + */ +GEMMConfigResult select_mlgo_gemm_config_reshaped(const CommonQuery &query); + +/** Select gemm config based on default heuristics + * @param query Query + * @return GEMMConfigResult. Result is valid if bool(GEMMConfigResult) == true and invalid otherwise + */ +GEMMConfigResult select_default_gemm_config_reshaped(const CommonQuery &query); + +/** Select gemm config based on mlgo heuristics + * @param query Query + * @return GEMMConfigResult. Result is valid if bool(GEMMConfigResult) == true and invalid otherwise + */ +GEMMConfigResult select_mlgo_gemm_config_native(const CommonQuery &query); + +/** Select gemm config based on default heuristics + * @param query Query + * @return GEMMConfigResult. Result is valid if bool(GEMMConfigResult) == true and invalid otherwise + */ +GEMMConfigResult select_default_gemm_config_native(const CommonQuery &query); + +} // namespace auto_heuristics +} // namespace cl_gemm +} // namespace arm_compute + +#endif // SRC_RUNTIME_CL_GEMM_AUTO_HEURISTICS_CL_GEMM_AUTO_HEURISTICS_H diff --git a/src/runtime/CL/mlgo/Common.h b/src/runtime/CL/mlgo/Common.h new file mode 100644 index 0000000000..08a7ee8c18 --- /dev/null +++ b/src/runtime/CL/mlgo/Common.h @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
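The validity flag on GEMMTypeResult/GEMMConfigResult above exists so that callers can try the MLGO heuristics first and fall back to the hand-written defaults. A sketch of that intended flow, assuming only the declarations above (the real call sites live in the CLGEMM functions, outside this hunk):

    #include "src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.h"

    using namespace arm_compute;
    using namespace arm_compute::cl_gemm::auto_heuristics;

    CLGEMMKernelType choose_gemm_kernel(const CommonQuery &query, bool reshape_b_only_on_first_run)
    {
        // Valid only when an MLGO heuristics file was loaded and matched the query.
        const GEMMTypeResult mlgo_result = select_mlgo_gemm_kernel(query, reshape_b_only_on_first_run);
        if (mlgo_result)
        {
            return mlgo_result.gemm_type;
        }
        // The default per-target heuristics always produce a valid result.
        return select_default_gemm_kernel(query, reshape_b_only_on_first_run).gemm_type;
    }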
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef SRC_RUNTIME_CL_MLGO_COMMON_H +#define SRC_RUNTIME_CL_MLGO_COMMON_H + +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/CL/CLTypes.h" + +namespace arm_compute +{ +namespace mlgo +{ +/** Types of Heuristic (tree) */ +enum class HeuristicType +{ + GEMM_Type, /**< About the type of gemm */ + GEMM_Config_Native, /**< About the gemm config for native kernel */ + GEMM_Config_Reshaped_Only_RHS, /**< About the gemm config for reshaped only rhs kernel */ + GEMM_Config_Reshaped /**< About the gemm config for reshaped kernel */ +}; + +using GEMMType = CLGEMMKernelType; + +/** GEMM Configuration for Native kernel */ +struct GEMMConfigNative +{ + unsigned int m0{1}; /**< Number of rows processed by the matrix multiplication */ + unsigned int n0{1}; /**< Number of columns processed by the matrix multiplication */ + unsigned int k0{1}; /**< Number of partial accumulations performed by the matrix multiplication */ +}; + +/** GEMM Configuration for Reshaped Only RHS kernel */ +struct GEMMConfigReshapedOnlyRHS +{ + unsigned int m0{1}; /**< Number of rows processed by the matrix multiplication */ + unsigned int n0{1}; /**< Number of columns processed by the matrix multiplication */ + unsigned int k0{1}; /**< Number of partial accumulations performed by the matrix multiplication */ + unsigned int h0{1}; /**< Number of horizontal blocks of size (k0xn0) stored on the same output row */ + bool interleave_rhs{false}; /**< True if the h0 (k0xn0) blocks have to be interleaved in the output row */ + bool transpose_rhs{false}; /**< True if the (k0xn0) block has to be transposed before being stored */ + bool export_cl_image{false}; /**< True if the reshaped rhs has to be exported to cl_image. n0 must be equal to 4 */ +}; + +/** GEMM Configuration for Reshaped kernel */ +struct GEMMConfigReshaped +{ + unsigned int m0{1}; /**< Number of rows processed by the matrix multiplication */ + unsigned int n0{1}; /**< Number of columns processed by the matrix multiplication */ + unsigned int k0{1}; /**< Number of partial accumulations performed by the matrix multiplication */ + unsigned int v0{1}; /**< Number of vertical blocks of size (m0xk0) stored on the same output row */ + unsigned int h0{1}; /**< Number of horizontal blocks of size (k0xn0) stored on the same output row */ + bool interleave_lhs{false}; /**< True if the v0 (m0xk0) blocks have to be interleaved in the output row */ + bool interleave_rhs{false}; /**< True if the h0 (k0xn0) blocks have to be interleaved in the output row */ + bool transpose_rhs{false}; /**< True if the (k0xn0) block has to be transposed before being stored */ + bool export_cl_image{false}; /**< True if the reshaped rhs has to be exported to cl_image. n0 must be equal to 4 */ +}; + +} // namespace mlgo +} // namespace arm_compute +#endif // SRC_RUNTIME_CL_MLGO_COMMON_H diff --git a/src/runtime/CL/mlgo/HeuristicTree.cpp b/src/runtime/CL/mlgo/HeuristicTree.cpp new file mode 100644 index 0000000000..f7b706902b --- /dev/null +++ b/src/runtime/CL/mlgo/HeuristicTree.cpp @@ -0,0 +1,248 @@ +/* + * Copyright (c) 2021 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/runtime/CL/mlgo/HeuristicTree.h" + +#include "arm_compute/core/Log.h" + +#include "support/Cast.h" + +#include <algorithm> +#include <deque> +#include <set> +namespace arm_compute +{ +namespace mlgo +{ +namespace +{ +bool evaluate(GEMMShape shape, Condition cond) +{ + // PRE: all features and ConditionalOps are valid + constexpr float eps = 0.0001f; + // Calculate all secondary features + std::vector<std::pair<std::string, float>> cond_values{ + {"m", static_cast<float>(shape.m)}, + {"n", static_cast<float>(shape.n)}, + {"k", static_cast<float>(shape.k)}, + {"b", static_cast<float>(shape.b)}, + {"r_mn", static_cast<float>(shape.m) / shape.n}, + {"r_mk", static_cast<float>(shape.m) / shape.k}, + {"r_nk", static_cast<float>(shape.n) / shape.k}, + {"r_mnk", static_cast<float>(shape.m) / (static_cast<float>(shape.n) / shape.k)}, + {"workload", (static_cast<float>(shape.m) * shape.n * shape.b) / 20.0}}; + auto cond_value_pair_it = + std::find_if(cond_values.begin(), cond_values.end(), + [&cond](decltype(*cond_values.begin()) it) { return it.first == cond.feature; }); + + ARM_COMPUTE_ERROR_ON(cond_value_pair_it == cond_values.end()); + const float cond_value = cond_value_pair_it->second; + switch (cond.op) + { + case ConditionalOp::LT: + { + return cond_value < cond.threshold; + } + case ConditionalOp::LE: + { + return cond_value <= cond.threshold; + } + case ConditionalOp::GT: + { + return cond_value > cond.threshold; + } + case ConditionalOp::GE: + { + return cond_value >= cond.threshold; + } + case ConditionalOp::EQ: + default: + { + return std::abs(cond_value - cond.threshold) < eps; + } + } +} + +} // namespace + +constexpr size_t HeuristicTree::_max_num_nodes; +constexpr size_t HeuristicTree::_max_query_depth; +constexpr HeuristicTree::NodeID HeuristicTree::_root; + +HeuristicTree::HeuristicTree() : HeuristicTree(0, HeuristicType::GEMM_Type, "", DataType::F32) +{ +} + +HeuristicTree::HeuristicTree(TreeID id, HeuristicType h_type, const std::string &ip_target, DataType data_type) + : _id{id}, _heuristic_type{h_type}, _ip_target{ip_target}, _data_type{data_type}, _tree{} +{ +} + +template <typename T> +std::pair<bool, T> HeuristicTree::query(GEMMShape shape) const +{ + // Root ID = 0; + auto cur_node = _tree.at(_root).get(); + size_t depth = 0; + while (cur_node->type() != NodeType::Leaf) + { + if (depth > _max_query_depth) + { + 
ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Exceeding max query depth: %zu. Is the tree too deep?", + _max_query_depth); + return std::make_pair(false, T{}); + } + ARM_COMPUTE_ERROR_ON_MSG(cur_node->type() != NodeType::Branch, "Unexpected NodeType"); + auto br_node = utils::cast::polymorphic_downcast<BranchNode *>(cur_node); + if (evaluate(shape, br_node->condition)) + { + cur_node = _tree.at(br_node->true_node).get(); + } + else + { + cur_node = _tree.at(br_node->false_node).get(); + } + ++depth; + } + ARM_COMPUTE_ERROR_ON_MSG(cur_node->type() != NodeType::Leaf, "Unexpected NodeType"); + auto l_node = utils::cast::polymorphic_downcast<LeafNode<T> *>(cur_node); + return std::make_pair(true, l_node->value); +} + +template <typename T> +bool HeuristicTree::add_leaf(NodeID id, T val) +{ + if (_tree.size() >= _max_num_nodes) + { + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Exceeding the maximum number of nodes allowed %zu", _max_num_nodes); + return false; + } + if (_tree.find(id) != _tree.end()) + { + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Cannot add node; node id %zu already exists", id); + return false; + } + _tree[id] = std::make_unique<LeafNode<T>>(id, val); + return true; +} + +bool HeuristicTree::add_branch(NodeID id, Condition cond, NodeID t_node, NodeID f_node) +{ + if (_tree.size() >= _max_num_nodes) + { + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Exceeding the maximum number of nodes allowed %zu", _max_num_nodes); + return false; + } + + const std::set<std::string> supported_features = {"m", "n", "k", "b", "r_mn", "r_mk", "r_nk", "r_mnk", "workload"}; + const auto orig_feature = cond.feature; + std::transform(cond.feature.begin(), cond.feature.end(), cond.feature.begin(), + [](char c) { return std::tolower(c); }); + if (supported_features.find(cond.feature) == supported_features.end()) + { + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Unsupported feature %s", orig_feature.c_str()); + return false; + } + + if (_tree.find(id) != _tree.end()) + { + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Cannot add node; node id %zu already exists", id); + return false; + } + _tree[id] = std::make_unique<BranchNode>(id, cond, t_node, f_node); + return true; +} + +bool HeuristicTree::check_if_structurally_correct() const +{ + std::set<NodeID> visited; + std::deque<NodeID> to_visit{_root}; + + while (!to_visit.empty()) + { + auto id = to_visit.front(); + to_visit.pop_front(); + if (_tree.find(id) == _tree.end()) + { + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Missing node %zu", id); + return false; + } + auto not_seen_before = visited.insert(id); + if (!not_seen_before.second) + { + ARM_COMPUTE_LOG_INFO_MSG_CORE("Not a tree; contains cycles or loops"); + return false; + } + auto cur_node = _tree.at(id).get(); + if (cur_node->type() == NodeType::Branch) + { + auto br_node = utils::cast::polymorphic_downcast<BranchNode *>(cur_node); + to_visit.push_back(br_node->true_node); + to_visit.push_back(br_node->false_node); + } + } + if (visited.size() != _tree.size()) + { + ARM_COMPUTE_LOG_INFO_MSG_CORE("Contains disjoint nodes"); + return false; + } + return true; +} + +bool HeuristicTree::check() +{ + if (_tree.empty()) + { + ARM_COMPUTE_LOG_INFO_MSG_CORE("Empty tree encountered"); + return false; + } + if (_tree.find(_root) == _tree.end()) + { + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Missing root. 
Root must have a Node ID of %zu", _root); + return false; + } + return check_if_structurally_correct(); +} + +/** Explicit template instantiation @relates HeuristicTree */ +template std::pair<bool, GEMMType> HeuristicTree::query<GEMMType>(GEMMShape shape) const; +/** Explicit template instantiation @relates HeuristicTree */ +template std::pair<bool, GEMMConfigNative> HeuristicTree::query<GEMMConfigNative>(GEMMShape shape) const; +/** Explicit template instantiation @relates HeuristicTree */ +template std::pair<bool, GEMMConfigReshapedOnlyRHS> +HeuristicTree::query<GEMMConfigReshapedOnlyRHS>(GEMMShape shape) const; +/** Explicit template instantiation @relates HeuristicTree */ +template std::pair<bool, GEMMConfigReshaped> HeuristicTree::query<GEMMConfigReshaped>(GEMMShape shape) const; + +/** Explicit template instantiation @relates HeuristicTree */ +template bool HeuristicTree::add_leaf(NodeID id, GEMMType val); +/** Explicit template instantiation @relates HeuristicTree */ +template bool HeuristicTree::add_leaf(NodeID id, GEMMConfigNative val); +/** Explicit template instantiation @relates HeuristicTree */ +template bool HeuristicTree::add_leaf(NodeID id, GEMMConfigReshapedOnlyRHS val); +/** Explicit template instantiation @relates HeuristicTree */ +template bool HeuristicTree::add_leaf(NodeID id, GEMMConfigReshaped val); + +} // namespace mlgo + +} // namespace arm_compute diff --git a/src/runtime/CL/mlgo/HeuristicTree.h b/src/runtime/CL/mlgo/HeuristicTree.h new file mode 100644 index 0000000000..a4f8c116b9 --- /dev/null +++ b/src/runtime/CL/mlgo/HeuristicTree.h @@ -0,0 +1,198 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef SRC_RUNTIME_CL_MLGO_HEURISTIC_TREE_H +#define SRC_RUNTIME_CL_MLGO_HEURISTIC_TREE_H + +#include "arm_compute/core/Types.h" + +#include "src/runtime/CL/mlgo/Common.h" + +#include <map> +#include <memory> +#include <string> +#include <utility> + +namespace arm_compute +{ +namespace mlgo +{ +/** Conditional ops */ +enum class ConditionalOp +{ + EQ, /**< Equal */ + LT, /**< Less than */ + LE, /**< Less than or equal to */ + GT, /**< Greater than */ + GE, /**< Greater than or equal to */ +}; + +/** A branch condition expression evaluating: feature op threshold */ +struct Condition +{ + std::string feature; /**< Feature name */ + ConditionalOp op; /**< Conditional op */ + float threshold; /**< Threshold value */ +}; + +/** GEMM Shape used for query */ +struct GEMMShape +{ + unsigned int m; /**< Number of rows for the lhs matrix. Lhs matrix NOT transposed */ + unsigned int n; /**< Number of columns for the rhs matrix. Rhs matrix NOT transposed */ + unsigned int k; /**< Number of rows for the rhs matrix. Rhs matrix NOT transposed */ + unsigned int b; /**< Batch size */ +}; + +/** A binary decision tree based heuristic */ +class HeuristicTree +{ +public: + using NodeID = size_t; + using TreeID = size_t; + using Index = std::tuple<HeuristicType, std::string, DataType>; + enum class NodeType + { + Branch, + Leaf + }; + struct Node + { + virtual NodeType type() const = 0; + virtual ~Node() = default; + }; + + struct BranchNode : public Node + { + BranchNode(NodeID id, Condition cond, NodeID t_node, NodeID f_node) + : id{id}, condition{cond}, true_node{t_node}, false_node{f_node} + { + } + NodeType type() const override + { + return NodeType::Branch; + } + NodeID id; + Condition condition; + NodeID true_node; + NodeID false_node; + }; + + template <typename T> + struct LeafNode : public Node + { + LeafNode(NodeID id, T val) : id{id}, value{val} + { + } + NodeType type() const override + { + return NodeType::Leaf; + } + NodeID id; + T value; + }; + +public: + /** Constructor */ + HeuristicTree(); + /** Constructor */ + HeuristicTree(TreeID id, HeuristicType h_type, const std::string &ip_target, DataType data_type); + // Since the HeuristicTree is a handle that owns the nodes, it is move-only + /** Prevent copy construction */ + HeuristicTree(const HeuristicTree &) = delete; + /** Prevent copy assignment */ + HeuristicTree &operator=(const HeuristicTree &) = delete; + /** Move constructor */ + HeuristicTree(HeuristicTree &&other) noexcept = default; + /** Move assignment */ + HeuristicTree &operator=(HeuristicTree &&other) = default; + + /** Query a leaf value given a gemm shape + * + * @tparam T Leaf value type + * @param shape A @ref GEMMShape for the query + * @return std::pair<bool, T> The bool signals whether the query succeeded + */ + template <typename T> + std::pair<bool, T> query(GEMMShape shape) const; + + /** Add a leaf node + * + * @tparam T Leaf value type + * @param id Leaf node ID + * @param leaf_value Leaf node value + * @return bool Whether the addition succeeded + */ + template <typename T> + bool add_leaf(NodeID id, T leaf_value); + /** Add a branch node + * + * @param id Branch node ID + * @param cond Branch node @ref Condition + * @param true_node True node's ID + * @param false_node False node's ID + * @return bool Whether the addition succeeded + */ + bool add_branch(NodeID id, Condition cond, NodeID true_node, NodeID false_node); + + /** Get tree ID + * @return TreeID + */ + TreeID id() const + { + return _id; + } + + /** Get tree index + * 
@return Index + */ + Index index() const + { + return std::make_tuple(_heuristic_type, _ip_target, _data_type); + } + + /** Check if tree is valid + * @return bool True if the tree is valid + */ + bool check(); + +private: + static constexpr size_t _max_query_depth{1000}; // Maximum depth of query + static constexpr size_t _max_num_nodes{100000}; // Maximum number of nodes contained by the tree + static constexpr NodeID _root{0}; // Node ID of the tree root + +private: + bool check_if_structurally_correct() const; + +private: + TreeID _id; /**< Heuristic tree ID */ + HeuristicType _heuristic_type; /**< Heuristic type */ + std::string _ip_target; /**< IP target associated with the tree */ + DataType _data_type; /**< Data type associated with the tree */ + std::map<NodeID, std::unique_ptr<Node>> _tree; /**< Tree representation */ +}; +} // namespace mlgo + +} // namespace arm_compute + +#endif //SRC_RUNTIME_CL_MLGO_HEURISTIC_TREE_H diff --git a/src/runtime/CL/mlgo/MLGOHeuristics.cpp b/src/runtime/CL/mlgo/MLGOHeuristics.cpp new file mode 100644 index 0000000000..aed46cd80f --- /dev/null +++ b/src/runtime/CL/mlgo/MLGOHeuristics.cpp @@ -0,0 +1,247 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/runtime/CL/mlgo/MLGOHeuristics.h" + +#include "arm_compute/core/Log.h" + +#include "src/runtime/CL/mlgo/MLGOParser.h" +#include "src/runtime/CL/mlgo/Utils.h" + +#include <fstream> + +namespace arm_compute +{ +namespace mlgo +{ +bool operator==(const GEMMConfigNative &lhs, const GEMMConfigNative &rhs) +{ + return std::tie(lhs.m0, lhs.n0, lhs.k0) == std::tie(rhs.m0, rhs.n0, rhs.k0); +} +bool operator==(const GEMMConfigReshapedOnlyRHS &lhs, const GEMMConfigReshapedOnlyRHS &rhs) +{ + return std::tie(lhs.m0, lhs.n0, lhs.k0, lhs.h0, lhs.interleave_rhs, lhs.transpose_rhs, lhs.export_cl_image) == + std::tie(rhs.m0, rhs.n0, rhs.k0, rhs.h0, rhs.interleave_rhs, rhs.transpose_rhs, rhs.export_cl_image); +} +bool operator==(const GEMMConfigReshaped &lhs, const GEMMConfigReshaped &rhs) +{ + return std::tie(lhs.m0, lhs.n0, lhs.k0, lhs.v0, lhs.h0, lhs.interleave_lhs, lhs.interleave_rhs, lhs.transpose_rhs, + lhs.export_cl_image) == std::tie(rhs.m0, rhs.n0, rhs.k0, rhs.v0, rhs.h0, rhs.interleave_lhs, + rhs.interleave_rhs, rhs.transpose_rhs, rhs.export_cl_image); +} + +constexpr size_t MLGOHeuristics::_max_num_trees; + +MLGOHeuristics::MLGOHeuristics() : _indices{}, _trees{}, _tree_valid{}, _valid{false} +{ +} + +std::pair<bool, GEMMType> MLGOHeuristics::query_gemm_type(const Query &query) const +{ + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics querying gemm type. %s.", to_string(query).c_str()); + const auto invalid = GEMMType::RESHAPED; + if (!_valid) + { + ARM_COMPUTE_LOG_INFO_MSG_CORE("Invalid DotMLGO. Use default heuristics instead"); + return {false, invalid}; + } + auto index = std::make_tuple(HeuristicType::GEMM_Type, query.ip_target, query.data_type); + GEMMShape shape_query{query.m, query.n, query.k, query.b}; + if (_trees.find(index) == _trees.end()) + { + ARM_COMPUTE_LOG_INFO_MSG_CORE("Cannot find tree index"); + return {false, invalid}; + } + return _trees.at(index).query<GEMMType>(shape_query); +} +std::pair<bool, GEMMConfigNative> MLGOHeuristics::query_gemm_config_native(const Query &query) const +{ + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics querying gemm config native. %s.", + to_string(query).c_str()); + const auto invalid = GEMMConfigNative{}; + if (!_valid) + { + ARM_COMPUTE_LOG_INFO_MSG_CORE("Invalid DotMLGO. Use default heuristics instead"); + return {false, invalid}; + } + auto index = std::make_tuple(HeuristicType::GEMM_Config_Native, query.ip_target, query.data_type); + GEMMShape shape_query{query.m, query.n, query.k, query.b}; + if (_trees.find(index) == _trees.end()) + { + ARM_COMPUTE_LOG_INFO_MSG_CORE("Cannot find tree index"); + return {false, invalid}; + } + return _trees.at(index).query<GEMMConfigNative>(shape_query); +} +std::pair<bool, GEMMConfigReshapedOnlyRHS> MLGOHeuristics::query_gemm_config_reshaped_only_rhs(const Query &query) const +{ + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics querying gemm config reshaped only rhs. %s.", + to_string(query).c_str()); + const auto invalid = GEMMConfigReshapedOnlyRHS{}; + if (!_valid) + { + ARM_COMPUTE_LOG_INFO_MSG_CORE("Invalid DotMLGO. 
Use default heuristics instead"); + return {false, invalid}; + } + auto index = std::make_tuple(HeuristicType::GEMM_Config_Reshaped_Only_RHS, query.ip_target, query.data_type); + GEMMShape shape_query{query.m, query.n, query.k, query.b}; + if (_trees.find(index) == _trees.end()) + { + ARM_COMPUTE_LOG_INFO_MSG_CORE("Cannot find tree index"); + return {false, invalid}; + } + return _trees.at(index).query<GEMMConfigReshapedOnlyRHS>(shape_query); +} +std::pair<bool, GEMMConfigReshaped> MLGOHeuristics::query_gemm_config_reshaped(const Query &query) const +{ + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics querying gemm config reshaped. %s.", + to_string(query).c_str()); + const auto invalid = GEMMConfigReshaped{}; + if (!_valid) + { + ARM_COMPUTE_LOG_INFO_MSG_CORE("Invalid DotMLGO. Use default heuristics instead"); + return {false, invalid}; + } + auto index = std::make_tuple(HeuristicType::GEMM_Config_Reshaped, query.ip_target, query.data_type); + GEMMShape shape_query{query.m, query.n, query.k, query.b}; + if (_trees.find(index) == _trees.end()) + { + ARM_COMPUTE_LOG_INFO_MSG_CORE("Cannot find tree index"); + return {false, invalid}; + } + return _trees.at(index).query<GEMMConfigReshaped>(shape_query); +} + +bool MLGOHeuristics::check_heuristic_tree(HeuristicTree::TreeID id) +{ + bool status; + HeuristicTree *tree{nullptr}; + std::tie(status, tree) = get_heuristic_tree(id); + if (!status) + { + return status; + } + status = tree->check(); + if (!status) + { + return status; + } + _tree_valid[id] = true; + return true; +} + +bool MLGOHeuristics::check_all() const +{ + // Tree validities are already checked and cached. + bool all_trees_are_checked = + std::find_if(_tree_valid.begin(), _tree_valid.end(), [](auto v) { return !v.second; }) == _tree_valid.end(); + if (!all_trees_are_checked) + { + ARM_COMPUTE_LOG_INFO_MSG_CORE("Missing checks on some trees. Make sure to call check_heuristic_tree after each " + "tree is completed. This could also indicate there are no trees in the dotmlgo"); + return false; + } + + // Other top level checks... 
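+ // (None at present: uniqueness of tree IDs and indices is already enforced at insertion time by add_heuristic_tree.)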
+ + return true; +} + +std::pair<bool, HeuristicTree *> MLGOHeuristics::get_heuristic_tree(HeuristicTree::TreeID id) +{ + if (_indices.find(id) == _indices.end()) + { + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Cannot find tree with id %zu", id); + return std::make_pair(false, nullptr); + } + const auto index = _indices[id]; + + if (_trees.find(index) == _trees.end()) + { + ARM_COMPUTE_LOG_INFO_MSG_CORE("Cannot find tree index"); + return std::make_pair(false, nullptr); + } + auto &t = _trees[index]; + + return std::make_pair(true, &t); +} + +bool MLGOHeuristics::add_heuristic_tree(HeuristicTree &&t) +{ + if (_indices.size() >= _max_num_trees) + { + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Exceeding the max number of trees allowed: %zu", _max_num_trees); + return false; + } + // PRE: correctness of t is guaranteed by the tree construction process + // Ensure unique id + const auto id = t.id(); + if (_indices.find(id) != _indices.end()) + { + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Cannot add redundant trees; tree id %zu already exists", id); + return false; + } + + // Ensure unique index + const auto index = t.index(); + if (_trees.find(index) != _trees.end()) + { + ARM_COMPUTE_LOG_INFO_MSG_CORE("Cannot add redundant trees; tree index already exists"); + return false; + } + + _indices[id] = index; + _trees[index] = std::move(t); + _tree_valid[id] = false; + return true; +} + +bool MLGOHeuristics::reload_from_file(const std::string &filename) +{ + std::ifstream fs; + fs.exceptions(std::ifstream::badbit); + fs.open(filename, std::ios::in); + if (!fs.is_open()) + { + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Cannot open DotMLGO file %s. Use default heuristics instead", + filename.c_str()); + return _valid = false; + } + return reload_from_stream(fs); +} + +bool MLGOHeuristics::reload_from_stream(std::istream &in) +{ + auto parsed = parser::parse_mlgo(in); + if (!parsed.first) + { + ARM_COMPUTE_LOG_INFO_MSG_CORE("DotMLGO parsing failed. Use default heuristics instead"); + return _valid = false; + } + *this = std::move(parsed.second); + ARM_COMPUTE_LOG_INFO_MSG_CORE("DotMLGO loaded successfully"); + return _valid = true; +} + +} // namespace mlgo +} // namespace arm_compute diff --git a/src/runtime/CL/mlgo/MLGOHeuristics.h b/src/runtime/CL/mlgo/MLGOHeuristics.h new file mode 100644 index 0000000000..6a491c5503 --- /dev/null +++ b/src/runtime/CL/mlgo/MLGOHeuristics.h @@ -0,0 +1,150 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef SRC_RUNTIME_CL_MLGO_MLGO_HEURISTICS_H +#define SRC_RUNTIME_CL_MLGO_MLGO_HEURISTICS_H + +#include "src/runtime/CL/mlgo/Common.h" +#include "src/runtime/CL/mlgo/HeuristicTree.h" + +#include <iostream> +#include <map> +#include <string> +#include <utility> +namespace arm_compute +{ +namespace mlgo +{ +/** Query interface */ +struct Query +{ + std::string ip_target; /**< The name of the IP target */ + DataType data_type; /**< Data type */ + unsigned int m; /**< Number of rows for the lhs matrix. Lhs matrix NOT transposed */ + unsigned int n; /**< Number of columns for the rhs matrix. Rhs matrix NOT transposed */ + unsigned int k; /**< Number of rows for the rhs matrix. Rhs matrix NOT transposed */ + unsigned int b; /**< Batch size */ +}; + +bool operator==(const GEMMConfigNative &lhs, const GEMMConfigNative &rhs); +bool operator==(const GEMMConfigReshapedOnlyRHS &lhs, const GEMMConfigReshapedOnlyRHS &rhs); +bool operator==(const GEMMConfigReshaped &lhs, const GEMMConfigReshaped &rhs); + +/** MLGOHeuristics for configuring GEMM kernels */ +class MLGOHeuristics +{ +public: + /** Constructor */ + MLGOHeuristics(); + /** Default Destructor */ + ~MLGOHeuristics() = default; + /** Prevent Copy Construct */ + MLGOHeuristics(const MLGOHeuristics &) = delete; + /** Prevent Copy Assignment */ + MLGOHeuristics &operator=(const MLGOHeuristics &) = delete; + /** Default Move Constructor */ + MLGOHeuristics(MLGOHeuristics &&) = default; + /** Default Move Assignment */ + MLGOHeuristics &operator=(MLGOHeuristics &&) = default; + /** Query the gemm type + * + * @param[in] query Query + * + * @return std::pair<bool, GEMMType> signals if the query succeeded or failed + */ + std::pair<bool, GEMMType> query_gemm_type(const Query &query) const; + /** Query the gemm configuration for native kernel + * + * @param[in] query Query + * + * @return std::pair<bool, GEMMConfigNative> bool signals if the query succeeded or failed + */ + std::pair<bool, GEMMConfigNative> query_gemm_config_native(const Query &query) const; + /** Query the gemm configuration for reshaped only rhs kernel + * + * @param[in] query Query + * + * @return std::pair<bool, GEMMConfigReshapedOnlyRHS> bool signals if the query succeeded or failed + */ + std::pair<bool, GEMMConfigReshapedOnlyRHS> query_gemm_config_reshaped_only_rhs(const Query &query) const; + /** Query the gemm configuration for reshaped kernel + * + * @param[in] query Query + * + * @return std::pair<bool, GEMMConfigReshaped> bool signals if the query succeeded or failed + */ + std::pair<bool, GEMMConfigReshaped> query_gemm_config_reshaped(const Query &query) const; + /** (Re)Load the heuristics from reading a dotmlgo file + * + * @param[in] filename Path to the dotmlgo file + * + * @return bool Signals if the reload succeeded or failed + */ + bool reload_from_file(const std::string &filename); + /** (Re)Load the heuristics from reading an input stream + * + * @param[in] istream Istream containing mlgo heuristics + * + * @return bool Signals if the reload succeeded or failed + */ + bool reload_from_stream(std::istream &istream); + + /** Get the heuristic tree from tree id + * + * @param[in] id Tree id. 
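+ * (i.e. the entry-id the tree was registered with in the heuristics table)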
+ * + * @return std::pair<bool, HeuristicTree *> The bool signals whether the tree was found + */ + std::pair<bool, HeuristicTree *> get_heuristic_tree(HeuristicTree::TreeID id); + /** Add a heuristic tree + * @param t Heuristic tree to be added + * @return bool Whether the addition succeeded + */ + bool add_heuristic_tree(HeuristicTree &&t); + + /** Check the validity of the heuristic tree. + * + * @param id ID of the tree to be checked + * + * @return bool + */ + bool check_heuristic_tree(HeuristicTree::TreeID id); + + /** Check the overall validity of the heuristics. + * @return bool + */ + bool check_all() const; + +private: + static constexpr size_t _max_num_trees{100}; /**< Max number of trees that can be added */ + +private: + // There exists a one-to-one mapping between TreeID and Index; either can be used to identify a @ref HeuristicTree + std::map<HeuristicTree::TreeID, HeuristicTree::Index> _indices; /**< A mapping from TreeID to Index */ + std::map<HeuristicTree::Index, HeuristicTree> _trees; /**< A mapping from Index to HeuristicTree */ + std::map<HeuristicTree::TreeID, bool> _tree_valid; /**< Result cache of the tree validity checks */ + bool _valid; /**< Overall validity */ +}; + +} // namespace mlgo +} // namespace arm_compute +#endif //SRC_RUNTIME_CL_MLGO_MLGO_HEURISTICS_H diff --git a/src/runtime/CL/mlgo/MLGOParser.cpp b/src/runtime/CL/mlgo/MLGOParser.cpp new file mode 100644 index 0000000000..893daf2ed9 --- /dev/null +++ b/src/runtime/CL/mlgo/MLGOParser.cpp @@ -0,0 +1,806 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/runtime/CL/mlgo/MLGOParser.h" + +#include "arm_compute/core/Log.h" + +#include "src/runtime/CL/mlgo/Utils.h" + +#include <sstream> + +#define CHECK(parser_expr, valid_var) \ + (parser_expr); \ + if (!valid_var) \ + return; + +#define CHECK_DEFAULT(parser_expr, valid_var, default_val) \ + (parser_expr); \ + if (!valid_var) \ + return default_val; + +#ifdef ARM_COMPUTE_LOGGING_ENABLED + +#define FAIL_WITH_MSG(valid_var, pos, msg) \ + std::stringstream ss; \ + ss << "MLGOParser Error: " << pos << " " << msg; \ + ARM_COMPUTE_LOG_INFO_MSG_CORE(ss.str().c_str()); \ + valid_var = false; \ + return; + +#define FAIL_WITH_MSG_DEFAULT(valid_var, default_val, pos, msg) \ + std::stringstream ss; \ + ss << "MLGOParser Error: " << pos << " " << msg; \ + ARM_COMPUTE_LOG_INFO_MSG_CORE(ss.str().c_str()); \ + valid_var = false; \ + return default_val; + +#define LOG_TOKEN_POS(tokens, pos_var) const auto pos_var = tokens.current_pos(); + +#else // ARM_COMPUTE_LOGGING_ENABLED + +#define FAIL_WITH_MSG(valid_var, pos, msg) \ + valid_var = false; \ + return; + +#define FAIL_WITH_MSG_DEFAULT(valid_var, default_val, pos, msg) \ + valid_var = false; \ + return default_val; + +#define LOG_TOKEN_POS(tokens, pos_var) + +#endif // ARM_COMPUTE_LOGGING_ENABLED +namespace +{ +void ltrim(std::string &str) +{ + str.erase(str.begin(), std::find_if(str.begin(), str.end(), [](char ch) { return !std::isspace(ch); })); +} + +void rtrim(std::string &str) +{ + str.erase(std::find_if(str.rbegin(), str.rend(), [](char ch) { return !std::isspace(ch); }).base(), str.end()); +} + +void trim(std::string &str) +{ + ltrim(str); + rtrim(str); +} +} // namespace + +namespace arm_compute +{ +namespace mlgo +{ +namespace parser +{ +enum class ComparatorType +{ + Enum, + Num, + Var +}; + +TokenStream::TokenStream(std::istream &s, const std::string &delims) + : _delims{delims}, _istream{s}, _tokens{}, _lookahead_pos{} +{ + read(); +} + +TokenStream::operator bool() const +{ + ARM_COMPUTE_ERROR_ON_MSG(_tokens.empty(), "TokenStream can never be empty"); + return !reached_end(); +} + +Token TokenStream::take() +{ + ARM_COMPUTE_ERROR_ON_MSG(_tokens.empty(), "TokenStream can never be empty"); + Token t = _tokens.front(); + _tokens.pop_front(); + if (_tokens.empty()) + { + read(); + } + return t; +} +Token TokenStream::peek(size_t i) +{ + ARM_COMPUTE_ERROR_ON_MSG(_tokens.empty(), "TokenStream can never be empty"); + ARM_COMPUTE_ERROR_ON_MSG(i >= max_look_ahead, "TokenStream: Exceeding max look ahead"); + // NOTE: If i exceeds the stream (_istream.eof()), read() automatically appends a End token at the end + while (_istream && _tokens.size() <= i) + { + read(); + } + size_t ind = std::min(i, _tokens.size() - 1); + return _tokens[ind]; +} + +void advance(CharPosition &pos, char ch) +{ + if (ch == '\n') + { + pos.ln += 1; + pos.col = 0; + } + else + { + pos.col += 1; + } +} +void rewind(CharPosition &pos) +{ + pos.col -= 1; +} +void TokenStream::read() +{ + char ch; + // Skip any leading space and delim characters + do + { + // Reached eof + if (!_istream.get(ch)) + { + if (!reached_end()) + { + _tokens.emplace_back(TokenType::End, "", _lookahead_pos); + } + return; + } + advance(_lookahead_pos, ch); + } while (std::isspace(ch) || is_delim(ch)); + // Read chars until we hit a delim or eof + auto orig_pos = _lookahead_pos; + auto tok = recognize_tok(ch); + rewind(orig_pos); + tok.pos = orig_pos; + // Trim leading and trailing white spaces + trim(tok.value); + _tokens.push_back(tok); +} + +Token TokenStream::recognize_tok(char ch) +{ + 
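// Single-character dispatch: '[' and ']' become list tokens, '.' or a digit starts a numeric token, and anything else is scanned as a text token until a delimiter or bracket. + 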
if (ch == '[') + { + return Token{TokenType::L_List, "", _lookahead_pos}; + } + else if (ch == ']') + { + return Token{TokenType::R_List, "", _lookahead_pos}; + } + else if (ch == '.') + { + return float_after_dp_st(std::string{ch}); + } + else if (std::isdigit(ch)) + { + return num_st(std::string{ch}); + } + else + { + return text_st(std::string{ch}); + } +} + +Token TokenStream::num_st(std::string value) +{ + char ch{}; + while (_istream.get(ch)) + { + advance(_lookahead_pos, ch); + if (ch == '.') + { + return float_after_dp_st(value + ch); + } + else if (!std::isdigit(ch)) + { + if (!is_delim(ch) && !std::isspace(ch)) + { + rewind(_lookahead_pos); + _istream.unget(); + } + break; + } + value += ch; + } + return Token{TokenType::Int, value, _lookahead_pos}; +} + +Token TokenStream::float_after_dp_st(std::string value) +{ + char ch{}; + while (_istream.get(ch)) + { + advance(_lookahead_pos, ch); + if (!std::isdigit(ch)) + { + if (!is_delim(ch) && !std::isspace(ch)) + { + rewind(_lookahead_pos); + _istream.unget(); + } + break; + } + value += ch; + } + return Token{TokenType::Float, value, _lookahead_pos}; +} + +Token TokenStream::text_st(std::string value) +{ + char ch{}; + while (_istream.get(ch)) + { + advance(_lookahead_pos, ch); + if (is_delim(ch)) + { + break; + } + if (ch == '[' || ch == ']') + { + rewind(_lookahead_pos); + _istream.unget(); + break; + } + value += ch; + } + return Token{TokenType::Text, value, _lookahead_pos}; +} + +bool TokenStream::reached_end() const +{ + return _tokens.size() == 1 && _tokens.front().type == TokenType::End; +} + +bool TokenStream::is_delim(char ch) const +{ + return _delims.find(ch) != std::string::npos; +} + +void end(TokenStream &in, bool &valid) +{ + LOG_TOKEN_POS(in, pos); + auto tok = in.take(); + if (tok.type != TokenType::End) + { + FAIL_WITH_MSG(valid, pos, "Unexpected token at the end of stream"); + } +} + +bool bool_val(TokenStream &in, bool &valid) +{ + LOG_TOKEN_POS(in, pos); + auto tok = in.take(); + if (tok.type != TokenType::Int) + { + FAIL_WITH_MSG_DEFAULT(valid, false, pos, "Expect bool or int token"); + } + bool val{}; + std::stringstream(tok.value) >> val; + return val; +} + +int int_val(TokenStream &in, bool &valid) +{ + LOG_TOKEN_POS(in, pos); + auto tok = in.take(); + if (tok.type != TokenType::Int) + { + FAIL_WITH_MSG_DEFAULT(valid, -1, pos, "Expect int token"); + } + int val{}; + std::stringstream(tok.value) >> val; + return val; +} + +unsigned int uint_val(TokenStream &in, bool &valid) +{ + LOG_TOKEN_POS(in, pos); + int val = CHECK_DEFAULT(int_val(in, valid), valid, 0); + if (val < 0) + { + FAIL_WITH_MSG_DEFAULT(valid, 0, pos, "Expect unsigned int token"); + } + return static_cast<unsigned int>(val); +} + +float float_val(TokenStream &in, bool &valid) +{ + LOG_TOKEN_POS(in, pos); + auto tok = in.take(); + if (tok.type != TokenType::Float) + { + FAIL_WITH_MSG_DEFAULT(valid, 0.f, pos, "Expect float token"); + } + float val{}; + std::stringstream(tok.value) >> val; + return val; +} + +std::string text_val(TokenStream &in, bool &valid) +{ + LOG_TOKEN_POS(in, pos); + auto tok = in.take(); + if (tok.type != TokenType::Text || tok.value.empty()) + { + FAIL_WITH_MSG_DEFAULT(valid, "", pos, "Expect a non-empty text token"); + } + return tok.value; +} + +bool accept_text(TokenStream &in, const std::string &c_str, bool take = true) +{ + auto tok = in.peek(); + if (tok.type == TokenType::Text && tok.value == c_str) + { + if (take) + { + in.take(); + } + return true; + } + return false; +} + +void expect_text(TokenStream &in, const 
std::string &str, bool &valid) +{ + LOG_TOKEN_POS(in, pos); + if (!accept_text(in, str)) + { + FAIL_WITH_MSG(valid, pos, std::string("Expect text token: ") + str); + } +} + +bool accept_l_list(TokenStream &in) +{ + auto tok = in.peek(); + if (tok.type == TokenType::L_List) + { + in.take(); + return true; + } + return false; +} + +void expect_l_list(TokenStream &in, bool &valid) +{ + LOG_TOKEN_POS(in, pos); + if (!accept_l_list(in)) + { + FAIL_WITH_MSG(valid, pos, "Expect '['"); + } +} + +bool accept_r_list(TokenStream &in) +{ + auto tok = in.peek(); + if (tok.type == TokenType::R_List) + { + in.take(); + return true; + } + return false; +} + +void expect_r_list(TokenStream &in, bool &valid) +{ + LOG_TOKEN_POS(in, pos); + if (!accept_r_list(in)) + { + FAIL_WITH_MSG(valid, pos, "Expect ']'"); + } +} + +ConditionalOp conditional_op(TokenStream &in, bool &valid) +{ + LOG_TOKEN_POS(in, pos); + if (accept_text(in, "<=")) + { + return ConditionalOp::LE; + } + else if (accept_text(in, ">=")) + { + return ConditionalOp::GE; + } + else if (accept_text(in, "==")) + { + return ConditionalOp::EQ; + } + else if (accept_text(in, "<")) + { + return ConditionalOp::LT; + } + else if (accept_text(in, ">")) + { + return ConditionalOp::GT; + } + else + { + FAIL_WITH_MSG_DEFAULT(valid, ConditionalOp::EQ, pos, "Expect conditional op"); + } +} + +void gemm_version(TokenStream &in, bool &valid) +{ + CHECK(expect_text(in, "gemm-version", valid), valid); + CHECK(expect_l_list(in, valid), valid); + CHECK(uint_val(in, valid), valid); + CHECK(uint_val(in, valid), valid); + CHECK(uint_val(in, valid), valid); + CHECK(expect_r_list(in, valid), valid); +} + +void ip_type(TokenStream &in, bool &valid) +{ + CHECK(expect_text(in, "ip-type", valid), valid); + LOG_TOKEN_POS(in, pos); + if (accept_text(in, "gpu")) + { + ; + } + else if (accept_text(in, "cpu")) + { + ; + } + else + { + FAIL_WITH_MSG(valid, pos, "Expect ip type"); + } +} + +void header(TokenStream &in, bool &valid) +{ + CHECK(expect_text(in, "<header>", valid), valid); + CHECK(gemm_version(in, valid), valid); + CHECK(ip_type(in, valid), valid); + CHECK(expect_text(in, "</header>", valid), valid); +} + +DataType data_type(TokenStream &in, bool &valid) +{ + LOG_TOKEN_POS(in, pos); + if (accept_text(in, "f16")) + { + return DataType::F16; + } + else if (accept_text(in, "f32")) + { + return DataType::F32; + } + else if (accept_text(in, "qasymm8")) + { + return DataType::QASYMM8; + } + else + { + FAIL_WITH_MSG_DEFAULT(valid, DataType::QASYMM8, pos, "Expect data type"); + } +} + +ComparatorType comparator_type(TokenStream &in, bool &valid) +{ + LOG_TOKEN_POS(in, pos); + if (accept_text(in, "var")) + { + return ComparatorType::Var; + } + else if (accept_text(in, "num")) + { + return ComparatorType::Num; + } + else if (accept_text(in, "enum")) + { + return ComparatorType::Enum; + } + else + { + FAIL_WITH_MSG_DEFAULT(valid, ComparatorType::Num, pos, "Expect comparator type"); + } +} + +HeuristicType heuristic_type(TokenStream &in, bool &valid, bool take = true) +{ + LOG_TOKEN_POS(in, pos); + if (accept_text(in, "gemm-type", take)) + { + return HeuristicType::GEMM_Type; + } + else if (accept_text(in, "gemm-config-native", take)) + { + return HeuristicType::GEMM_Config_Native; + } + else if (accept_text(in, "gemm-config-reshaped-only-rhs", take)) + { + return HeuristicType::GEMM_Config_Reshaped_Only_RHS; + } + else if (accept_text(in, "gemm-config-reshaped", take)) + { + return HeuristicType::GEMM_Config_Reshaped; + } + else + { + FAIL_WITH_MSG_DEFAULT(valid, 
HeuristicType::GEMM_Config_Reshaped, pos, "Expect heuristic type"); + } +} + +void expect_heuristic_type(TokenStream &in, HeuristicType expected_ht, bool &valid) +{ + LOG_TOKEN_POS(in, pos); + auto ht = CHECK(heuristic_type(in, valid, false), valid); + if (ht != expected_ht) + { + FAIL_WITH_MSG(valid, pos, "Unexpected heuristic type"); + } + CHECK(heuristic_type(in, valid, true), valid); +} + +GEMMType gemm_type(TokenStream &in, bool &valid) +{ + LOG_TOKEN_POS(in, pos); + if (accept_text(in, "native")) + { + return GEMMType::NATIVE; + } + else if (accept_text(in, "reshaped-only-rhs")) + { + return GEMMType::RESHAPED_ONLY_RHS; + } + else if (accept_text(in, "reshaped")) + { + return GEMMType::RESHAPED; + } + else + { + FAIL_WITH_MSG_DEFAULT(valid, GEMMType::RESHAPED_ONLY_RHS, pos, "Expect gemm type"); + } +} + +GEMMConfigNative gemm_config_native(TokenStream &in, bool &valid) +{ + const auto invalid_val = GEMMConfigNative{}; + CHECK_DEFAULT(expect_l_list(in, valid), valid, invalid_val); + const auto m0 = CHECK_DEFAULT(uint_val(in, valid), valid, invalid_val); + const auto n0 = CHECK_DEFAULT(uint_val(in, valid), valid, invalid_val); + const auto k0 = CHECK_DEFAULT(uint_val(in, valid), valid, invalid_val); + CHECK_DEFAULT(expect_r_list(in, valid), valid, invalid_val); + return GEMMConfigNative{m0, n0, k0}; +} + +GEMMConfigReshapedOnlyRHS gemm_config_reshaped_only_rhs(TokenStream &in, bool &valid) +{ + const auto invalid_val = GEMMConfigReshapedOnlyRHS{}; + CHECK_DEFAULT(expect_l_list(in, valid), valid, invalid_val); + const auto m0 = CHECK_DEFAULT(uint_val(in, valid), valid, invalid_val); + const auto n0 = CHECK_DEFAULT(uint_val(in, valid), valid, invalid_val); + const auto k0 = CHECK_DEFAULT(uint_val(in, valid), valid, invalid_val); + const auto h0 = CHECK_DEFAULT(uint_val(in, valid), valid, invalid_val); + const auto ir = CHECK_DEFAULT(bool_val(in, valid), valid, invalid_val); + const auto tr = CHECK_DEFAULT(bool_val(in, valid), valid, invalid_val); + const auto ex = CHECK_DEFAULT(bool_val(in, valid), valid, invalid_val); + CHECK_DEFAULT(expect_r_list(in, valid), valid, invalid_val); + return GEMMConfigReshapedOnlyRHS{m0, n0, k0, h0, ir, tr, ex}; +} + +GEMMConfigReshaped gemm_config_reshaped(TokenStream &in, bool &valid) +{ + const auto invalid_val = GEMMConfigReshaped{}; + CHECK_DEFAULT(expect_l_list(in, valid), valid, invalid_val); + const auto m0 = CHECK_DEFAULT(uint_val(in, valid), valid, invalid_val); + const auto n0 = CHECK_DEFAULT(uint_val(in, valid), valid, invalid_val); + const auto k0 = CHECK_DEFAULT(uint_val(in, valid), valid, invalid_val); + const auto v0 = CHECK_DEFAULT(uint_val(in, valid), valid, invalid_val); + const auto h0 = CHECK_DEFAULT(uint_val(in, valid), valid, invalid_val); + const auto il = CHECK_DEFAULT(bool_val(in, valid), valid, invalid_val); + const auto ir = CHECK_DEFAULT(bool_val(in, valid), valid, invalid_val); + const auto tr = CHECK_DEFAULT(bool_val(in, valid), valid, invalid_val); + const auto ex = CHECK_DEFAULT(bool_val(in, valid), valid, invalid_val); + CHECK_DEFAULT(expect_r_list(in, valid), valid, invalid_val); + return GEMMConfigReshaped{m0, n0, k0, v0, h0, il, ir, tr, ex}; +} + +void gpu_priority(TokenStream &in, bool &valid) +{ + LOG_TOKEN_POS(in, pos); + if (accept_text(in, "best-performance")) + { + ; + } + else if (accept_text(in, "best-memory-usage")) + { + ; + } + else + { + FAIL_WITH_MSG(valid, pos, "Expect gpu priority"); + } +} + +void gpu_behavior(TokenStream &in, bool &valid) +{ + LOG_TOKEN_POS(in, pos); + if (accept_text(in, "static")) + { 
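+ // Token accepted; the behavior value itself is currently discarded, as gpu behavior does not influence tree construction.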
+ ; + } + else if (accept_text(in, "dynamic")) + { + ; + } + else + { + FAIL_WITH_MSG(valid, pos, "Expect gpu behavior"); + } +} + +void free_vars(TokenStream &in, bool &valid) +{ + CHECK(expect_l_list(in, valid), valid); + while (!accept_r_list(in)) + { + CHECK(text_val(in, valid), valid); + } +} + +void heuristics_table_entry(TokenStream &in, MLGOHeuristics &h, bool &valid) +{ + const auto id = CHECK(uint_val(in, valid), valid); + const auto ip = CHECK(text_val(in, valid), valid); + CHECK(uint_val(in, valid), valid); // Num cores + const auto dt = CHECK(data_type(in, valid), valid); + CHECK(gpu_priority(in, valid), valid); + CHECK(gpu_behavior(in, valid), valid); + const auto ht = CHECK(heuristic_type(in, valid), valid); + CHECK(free_vars(in, valid), valid); + HeuristicTree t(id, ht, ip, dt); + valid = CHECK(h.add_heuristic_tree(std::move(t)), valid); +} + +void heuristics_table(TokenStream &in, MLGOHeuristics &h, bool &valid) +{ + CHECK(expect_text(in, "<heuristics-table>", valid), valid); + while (!accept_text(in, "</heuristics-table>")) + { + CHECK(heuristics_table_entry(in, h, valid), valid); + } +} + +Condition condition(TokenStream &in, bool &valid) +{ + LOG_TOKEN_POS(in, pos); + // NOTE: Only simplified Conditions are accepted, which means the lhs comparator type is fixed to Var and that of + // the rhs is fixed to Num (float) + const auto invalid_val = Condition{}; + const auto l_t = CHECK_DEFAULT(comparator_type(in, valid), valid, invalid_val); + const auto l_v = CHECK_DEFAULT(text_val(in, valid), valid, invalid_val); + const auto c_o = CHECK_DEFAULT(conditional_op(in, valid), valid, invalid_val); + const auto r_t = CHECK_DEFAULT(comparator_type(in, valid), valid, invalid_val); + const auto r_v = CHECK_DEFAULT(float_val(in, valid), valid, invalid_val); + if (l_t != ComparatorType::Var || r_t != ComparatorType::Num) + { + FAIL_WITH_MSG_DEFAULT(valid, invalid_val, pos, + "Only accept LHS type to be Var (string) and RHS type to be Num (float)"); + } + return Condition{l_v, c_o, r_v}; +} + +void heuristic_tree(TokenStream &in, MLGOHeuristics &h, bool &valid) +{ + CHECK(expect_text(in, "<heuristic", valid), valid); + const auto tree_id = CHECK(uint_val(in, valid), valid); + CHECK(expect_text(in, ">", valid), valid); + HeuristicTree *t = nullptr; + std::tie(valid, t) = CHECK(h.get_heuristic_tree(tree_id), valid); + const HeuristicType t_heuristic_type = std::get<0>(t->index()); + while (!accept_text(in, "</heuristic>")) + { + LOG_TOKEN_POS(in, pos); + if (accept_text(in, "b")) + { + // Branch node + const auto id = CHECK(uint_val(in, valid), valid); + const auto cond = CHECK(condition(in, valid), valid); + const auto t_id = CHECK(uint_val(in, valid), valid); + const auto f_id = CHECK(uint_val(in, valid), valid); + valid = CHECK(t->add_branch(id, cond, t_id, f_id), valid); + } + else if (accept_text(in, "l")) + { + // Leaf node + const auto id = CHECK(uint_val(in, valid), valid); + // NOTE: Heuristic type within each tree appears to be redundant (same information can be obtained from the + // heuristic table). For now it remains as a step for validation. 
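+ // For illustration only (hypothetical values, following the grammar in MLGOParser.h), a branch and a leaf line in the DotMLGO text could look like: + // b,0,var,m,<=,num,80.0,1,2 + // l,1,gemm-type,reshaped-only-rhs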
+ LOG_TOKEN_POS(in, pos); + CHECK(expect_heuristic_type(in, t_heuristic_type, valid), valid); + switch (t_heuristic_type) + { + case HeuristicType::GEMM_Type: + { + const auto g_type = CHECK(gemm_type(in, valid), valid); + valid = CHECK(t->add_leaf(id, g_type), valid); + break; + } + case HeuristicType::GEMM_Config_Native: + { + const auto g_c = CHECK(gemm_config_native(in, valid), valid); + valid = CHECK(t->add_leaf(id, g_c), valid); + break; + } + case HeuristicType::GEMM_Config_Reshaped_Only_RHS: + { + const auto g_c = CHECK(gemm_config_reshaped_only_rhs(in, valid), valid); + valid = CHECK(t->add_leaf(id, g_c), valid); + break; + } + case HeuristicType::GEMM_Config_Reshaped: + { + const auto g_c = CHECK(gemm_config_reshaped(in, valid), valid); + valid = CHECK(t->add_leaf(id, g_c), valid); + break; + } + default: + { + FAIL_WITH_MSG(valid, pos, "Unexpected heuristic type"); + } + } + } + else + { + FAIL_WITH_MSG(valid, pos, "Expect tree node type"); + } + } + // Perform semantic checks in the middle of parsing so that it can fail fast should there be any invalidities + valid = CHECK(h.check_heuristic_tree(tree_id), valid); +} + +MLGOHeuristics mlgo(TokenStream &in, bool &valid) +{ + MLGOHeuristics h; + CHECK_DEFAULT(header(in, valid), valid, h); + CHECK_DEFAULT(heuristics_table(in, h, valid), valid, h); + while (accept_text(in, "<heuristic", false)) + { + CHECK_DEFAULT(heuristic_tree(in, h, valid), valid, h); + } + CHECK_DEFAULT(end(in, valid), valid, h); + valid = CHECK_DEFAULT(h.check_all(), valid, h); + return h; +} + +std::pair<bool, MLGOHeuristics> parse_mlgo(std::istream &in) +{ + auto tokens = TokenStream(in); + bool valid = true; + auto h = mlgo(tokens, valid); + return std::make_pair(std::move(valid), std::move(h)); +} +} // namespace parser +} // namespace mlgo +} // namespace arm_compute + +#undef CHECK +#undef CHECK_DEFAULT +#undef FAIL_WITH_MSG +#undef FAIL_WITH_MSG_DEFAULT diff --git a/src/runtime/CL/mlgo/MLGOParser.h b/src/runtime/CL/mlgo/MLGOParser.h new file mode 100644 index 0000000000..cffce8d6a1 --- /dev/null +++ b/src/runtime/CL/mlgo/MLGOParser.h @@ -0,0 +1,198 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef SRC_RUNTIME_CL_MLGO_MLGO_PARSER_H +#define SRC_RUNTIME_CL_MLGO_MLGO_PARSER_H + +#include "src/runtime/CL/mlgo/MLGOHeuristics.h" + +#include <deque> +#include <istream> +#include <string> +#include <utility> + +/** A DotMLGO file parser (LL(k) parser) + * + * The grammar of DotMLGO is defined as the following EBNF: + * + * delim = "," | "\n"; // Note that delimiters are omitted from the definition below + * + * mlgo = header, heuristics-table, {heuristic-tree}; + * + * header = "<header>", gemm-version, ip-type, "</header>"; + * gemm-version = "gemm-version", "[", int, int, int, "]"; + * ip-type = "ip-type", ("gpu" | "cpu"); + * + * heuristics-table = "<heuristics-table>", {heuristics-table-entry}, "</heuristics-table>"; + * heuristics-table-entry = entry-id, ip-name, num-cores, data-type, gpu-priority, gpu-behavior, heuristic-type, free-vars; + * entry-id = int; + * ip-name = char-sequence; + * num-cores = int; + * data-type = "f32" | "f16" | "qasymm8"; + * gpu-priority = "best-performance" | "best-memory-usage"; + * gpu-behavior = "static" | "dynamic"; + * heuristic-type = "gemm-type" | "gemm-config-native" | "gemm-config-reshaped-only-rhs" | "gemm-config-reshaped"; + * free-vars = "[", {char-sequence}, "]"; + * + * heuristic-tree = "<heuristic", entry-id, ">", {tree-node}, "</heuristic>"; + * tree-node = branch-node | leaf-node; + * branch-node = "b", entry-id, lhs-type, lhs-value, conditional-op, rhs-type, rhs-value, true-node, false-node; + * lhs-type = comparator-type; + * lhs-value = comparator-value; + * rhs-type = comparator-type; + * rhs-value = comparator-value; + * comparator-type = "var" | "num" | "enum"; + * comparator-value = char-sequence | float; + * conditional-op = "<" | "<=" | "==" | ">=" | ">"; + * true-node = entry-id; + * false-node = entry-id; + * leaf-node = "l", entry-id, heuristic-type, leaf-value; + * leaf-value = gemm-type | gemm-config-native | gemm-config-reshaped-only-rhs | gemm-config-reshaped; + * gemm-type = "native" | "reshaped-only-rhs" | "reshaped"; + * gemm-config-native = "[", int, int, int, "]"; + * gemm-config-reshaped-only-rhs = "[", int, int, int, int, bool, bool, bool, "]"; + * gemm-config-reshaped = "[", int, int, int, int, int, bool, bool, bool, bool, "]"; + */ + +namespace arm_compute +{ +namespace mlgo +{ +namespace parser +{ +/** Type of Token */ +enum class TokenType +{ + L_List = '[', /**< List open */ + R_List = ']', /**< List close */ + Int, /**< Integral */ + Float, /**< Floating */ + Text, /**< Text/String */ + End, /**< End of stream */ +}; + +struct CharPosition +{ + bool operator==(const CharPosition &other) const + { + return ln == other.ln && col == other.col; + } + + size_t ln{0}; + size_t col{0}; +}; + +/** Token */ +struct Token +{ + Token(TokenType t, std::string v, CharPosition pos) : type{t}, value{v}, pos{pos} + { + } + + bool operator==(const Token &other) const + { + return type == other.type && value == other.value && pos == other.pos; + } + + TokenType type; /**< Token type */ + std::string value; /**< Token value */ + CharPosition pos; +}; + +/** A stream of tokens */ +class TokenStream +{ + // NOTE: _tokens is never empty. The end of token stream is signalled by the End Token +public: + static constexpr size_t max_look_ahead = 10; + +public: + /** Constructor + * + * @param[in] s Input stream + * @param[in] delims Delimiter characters packed in a string. 
Each char from the string can be used as a delim on its own + */ + TokenStream(std::istream &s, const std::string &delims = ",\n"); + + /** Check if there're more (non-End) Tokens + * @return true If there are more tokens + * @return false If reached end of stream (only End token) + */ + explicit operator bool() const; + + /** Get and pop off the current token + * + * @return Token + */ + Token take(); + + /** Peek the next ith token + * + * @param[in] i The next ith token. i < @ref max_look_ahead. + * + * @return Token + */ + Token peek(size_t i = 0); + + /** Get the position of the current token + * + * @return CharPosition + */ + CharPosition current_pos() const + { + return _tokens.front().pos; + } + +private: + void read(); + + Token recognize_tok(char ch); + + Token num_st(std::string value = ""); + + Token float_after_dp_st(std::string value = ""); + + Token text_st(std::string value = ""); + + bool reached_end() const; + + bool is_delim(char ch) const; + + std::string _delims; + std::istream &_istream; + std::deque<Token> _tokens; + CharPosition _lookahead_pos; +}; + +/** Parse and construct a @ref MLGOHeuristics from input stream + * + * @param[in] in Input stream + * + * @return MLGOHeuristics + */ +std::pair<bool, MLGOHeuristics> parse_mlgo(std::istream &in); + +} // namespace parser +} // namespace mlgo +} // namespace arm_compute +#endif //SRC_RUNTIME_CL_MLGO_MLGO_PARSER_H diff --git a/src/runtime/CL/mlgo/Utils.cpp b/src/runtime/CL/mlgo/Utils.cpp new file mode 100644 index 0000000000..c7e0100b3c --- /dev/null +++ b/src/runtime/CL/mlgo/Utils.cpp @@ -0,0 +1,185 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/runtime/CL/mlgo/Utils.h" + +#include <sstream> + +namespace arm_compute +{ +namespace mlgo +{ +namespace +{ +template <typename T> +inline std::string to_str(const T &val) +{ + std::stringstream ss; + ss << val; + return ss.str(); +} +} // namespace + +std::ostream &operator<<(std::ostream &os, const GEMMConfigNative &config) +{ + return os << "Native:{" + << "m0: " << config.m0 << ", " + << "n0: " << config.n0 << ", " + << "k0: " << config.k0 + << "}"; +} +std::ostream &operator<<(std::ostream &os, const GEMMConfigReshapedOnlyRHS &config) +{ + return os << "ReshapedOnlyRHS:{" + << "m0: " << config.m0 << ", " + << "n0: " << config.n0 << ", " + << "k0: " << config.k0 << ", " + << "h0: " << config.h0 << ", " + << "interleave_rhs: " << config.interleave_rhs << ", " + << "transpose_rhs: " << config.transpose_rhs << ", " + << "export_cl_image: " << config.export_cl_image << "}"; +} +std::ostream &operator<<(std::ostream &os, const GEMMConfigReshaped &config) +{ + return os << "Reshaped:{" + << "m0: " << config.m0 << ", " + << "n0: " << config.n0 << ", " + << "k0: " << config.k0 << ", " + << "v0: " << config.v0 << ", " + << "h0: " << config.h0 << ", " + << "interleave_lhs: " << config.interleave_lhs << ", " + << "interleave_rhs: " << config.interleave_rhs << ", " + << "transpose_rhs: " << config.transpose_rhs << ", " + << "export_cl_image: " << config.export_cl_image << "}"; +} +std::ostream &operator<<(std::ostream &os, HeuristicType ht) +{ + switch (ht) + { + case HeuristicType::GEMM_Type: + { + os << "GEMM_Type"; + break; + } + case HeuristicType::GEMM_Config_Native: + { + os << "GEMM_Config_Native"; + break; + } + case HeuristicType::GEMM_Config_Reshaped_Only_RHS: + { + os << "GEMM_Config_Reshaped_Only_RHS"; + break; + } + case HeuristicType::GEMM_Config_Reshaped: + { + os << "GEMM_Config_Reshaped"; + break; + } + default: + { + os << "Unknown"; + break; + } + } + return os; +} +std::ostream &operator<<(std::ostream &os, DataType dt) +{ + switch (dt) + { + case DataType::F32: + { + os << "F32"; + break; + } + case DataType::F16: + { + os << "F16"; + break; + } + case DataType::QASYMM8: + { + os << "QASYMM8"; + break; + } + default: + { + os << "Unknown"; + break; + } + } + return os; +} +std::ostream &operator<<(std::ostream &os, const HeuristicTree::Index &index) +{ + HeuristicType ht; + std::string ip; + DataType dt; + std::tie(ht, ip, dt) = index; + os << "Index("; + os << "HeuristicType=" << ht << ","; + os << "IP=" << ip << ","; + os << "DataType=" << dt; + os << ")"; + return os; +} +std::ostream &operator<<(std::ostream &os, const Query &query) +{ + os << "Query("; + os << "IP=" << query.ip_target << ","; + os << "DataType=" << query.data_type << ","; + os << "M=" << query.m << ","; + os << "N=" << query.n << ","; + os << "K=" << query.k << ","; + os << "B=" << query.b << ")"; + return os; +} + +std::string to_string(const GEMMConfigNative &config) +{ + return to_str(config); +} + +std::string to_string(const GEMMConfigReshapedOnlyRHS &config) +{ + return to_str(config); +} + +std::string to_string(const GEMMConfigReshaped &config) +{ + return to_str(config); +} + +std::string to_string(const Query &query) +{ + return to_str(query); +} + +namespace parser +{ +std::ostream &operator<<(std::ostream &os, const CharPosition &pos) +{ + os << "(Ln: " << pos.ln << ", Col: " << pos.col << ")"; + return os; +} +} // namespace parser + +} // namespace mlgo + +} // namespace arm_compute diff --git a/src/runtime/CL/mlgo/Utils.h b/src/runtime/CL/mlgo/Utils.h new file mode 100644 index 0000000000..73b537f476 --- /dev/null +++ b/src/runtime/CL/mlgo/Utils.h @@ 
-0,0 +1,57 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef SRC_RUNTIME_CL_MLGO_UTILS_H +#define SRC_RUNTIME_CL_MLGO_UTILS_H + +#include "src/runtime/CL/mlgo/Common.h" +#include "src/runtime/CL/mlgo/HeuristicTree.h" +#include "src/runtime/CL/mlgo/MLGOHeuristics.h" +#include "src/runtime/CL/mlgo/MLGOParser.h" + +#include <ostream> +#include <string> + +namespace arm_compute +{ +namespace mlgo +{ +std::ostream &operator<<(std::ostream &os, const GEMMConfigNative &config); +std::ostream &operator<<(std::ostream &os, const GEMMConfigReshapedOnlyRHS &config); +std::ostream &operator<<(std::ostream &os, const GEMMConfigReshaped &config); +std::ostream &operator<<(std::ostream &os, HeuristicType ht); +std::ostream &operator<<(std::ostream &os, DataType dt); +std::ostream &operator<<(std::ostream &os, const HeuristicTree::Index &index); +std::ostream &operator<<(std::ostream &os, const Query &query); +std::string to_string(const GEMMConfigNative &config); +std::string to_string(const GEMMConfigReshapedOnlyRHS &config); +std::string to_string(const GEMMConfigReshaped &config); +std::string to_string(const Query &query); +namespace parser +{ +std::ostream &operator<<(std::ostream &os, const CharPosition &pos); +} // namespace parser +} // namespace mlgo +} // namespace arm_compute + +#endif //SRC_RUNTIME_CL_MLGO_UTILS_H diff --git a/src/runtime/CL/tuners/BifrostTuner.cpp b/src/runtime/CL/tuners/BifrostTuner.cpp deleted file mode 100644 index 5b23baaed3..0000000000 --- a/src/runtime/CL/tuners/BifrostTuner.cpp +++ /dev/null @@ -1,319 +0,0 @@ -/* - * Copyright (c) 2018-2019 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/CL/tuners/BifrostTuner.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernels.h" -#include "arm_compute/core/utils/misc/Cast.h" - -namespace arm_compute -{ -namespace tuners -{ -namespace -{ -/** Tunes a @ref CLDirectConvolutionLayerKernel for a bifrost target - * - * @param[in] k Kernels to tune - */ -void tune_direct_convolution_kernel(CLDirectConvolutionLayerKernel &k) -{ - cl::NDRange lws_hint = k.lws_hint(); - - const GPUTarget gpu_target = k.get_target(); - const DataType dt = k._input->info()->data_type(); - const TensorShape weights_shape = k._weights->info()->tensor_shape(); - const TensorShape inputs_shape = k._input->info()->tensor_shape(); - const size_t kernel_size = weights_shape.x(); - const unsigned int stride_x = k._conv_stride_x; - const unsigned int stride_y = k._conv_stride_y; - - if(gpu_target_is_in(gpu_target, GPUTarget::G71, GPUTarget::G72) && (kernel_size <= 5) && (stride_x == 1) && (stride_y == 1) && (dt == DataType::F32)) - { - // Through extensive experimentation with over 30 representative tensor - // shapes, we found a small number of local work size configurations - // that result in nearly optimal execution times. Selecting the right - // lws for a given shape, however, required a complex decision tree, - // until we constructed a simple feature as described below. - // - // We started from the number of multiply-accumulate operations for a - // convolution layer, which is equal to the product of the input - // dimensions 0..2 and the weights dimensions 0..2. Unfortunately, - // this resulted in ties between distinct shapes that required distinct - // lws configurations. Replacing the width of the input with the kernel - // size, however, resulted in nearly optimal predictions. We use underscores - // in variable names to indicate when they are intentionally misleading. 
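-            //
-            // For illustration only (hypothetical shape, not part of the original
-            // comment): a 3x3 kernel over 64 input channels gives a weights product
-            // of 3 * 3 * 64 = 576, and a 56x56x64 input gives an input product of
-            // 56 * 56 * 64 = 200704, so mega_ops_ = 1e-6 * 576 * 200704 ~= 115.6,
-            // which selects lws_hint = cl::NDRange(2, 1, 6) in the kernel_size == 3
-            // case below.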
- const size_t product_of_weights_dimensions = weights_shape[0] * weights_shape[1] * weights_shape[2]; - const size_t product_of_input_dimensions_ = inputs_shape[0] * inputs_shape[1] * inputs_shape[2]; - const float mega_ops_ = 1e-6 * product_of_weights_dimensions * product_of_input_dimensions_; - - switch(kernel_size) - { - case 1: - { - if(mega_ops_ < 1.f) - { - lws_hint = cl::NDRange(1, 1, 8); - } - else if(mega_ops_ < 7.f) - { - lws_hint = cl::NDRange(1, 1, 4); - } - else - { - lws_hint = cl::NDRange(1, 1, 2); - } - break; - } - case 3: - { - if(mega_ops_ < 1.f) - { - lws_hint = cl::NDRange(1, 1, 8); - } - else if(mega_ops_ < 13.f) - { - lws_hint = cl::NDRange(2, 1, 4); - } - else if(mega_ops_ < 50.f) - { - lws_hint = cl::NDRange(3, 1, 4); - } - else - { - lws_hint = cl::NDRange(2, 1, 6); - } - break; - } - case 5: - { - if(mega_ops_ < 2.f || mega_ops_ > 80.f) - { - lws_hint = cl::NDRange(2, 1, 4); - } - else - { - lws_hint = cl::NDRange(2, 1, 8); - } - break; - } - default: - break; - } - k.set_lws_hint(lws_hint); - } -} - -void tune_col2im_kernel(CLCol2ImKernel &k) -{ - cl::NDRange lws_hint = k.lws_hint(); - const GPUTarget gpu_target = k.get_target(); - - // Configure the local work size for Bifrost with a value obtained - // via exhaustive autotuning over 30 representative tensor shapes. - if(gpu_target_is_in(gpu_target, - GPUTarget::G71, GPUTarget::G72, GPUTarget::G76, - GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT, - GPUTarget::G52, GPUTarget::G52LIT)) - { - if((k._convolved_dims.width == 7) || (k._convolved_dims.width == 14)) - { - lws_hint = cl::NDRange(1, 7, 1); - } - else - { - lws_hint = cl::NDRange(1, 8, 1); - } - } - - k.set_lws_hint(lws_hint); -} - -void tune_im2col_kernel(CLIm2ColKernel &k) -{ - cl::NDRange lws_hint = k.lws_hint(); - const GPUTarget gpu_target = k.get_target(); - - // Local work size optimized for the 11x11 AlexNet convolution on Bifrost. - if(gpu_target_is_in(gpu_target, - GPUTarget::G71, GPUTarget::G72, GPUTarget::G76, - GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT, - GPUTarget::G52, GPUTarget::G52LIT) - && k._kernel_dims.width == 11) - { - const bool is_square_kernel = (k._kernel_dims.width == k._kernel_dims.height); - if(!is_square_kernel && k._kernel_dims.width > 1 && !k._conv_info.has_padding()) - { - lws_hint = cl::NDRange(1, 1, 1); - } - } - k.set_lws_hint(lws_hint); -} - -void tune_gemv_kernel(CLGEMMMatrixVectorMultiplyKernel &k) -{ - cl::NDRange lws_hint = k.lws_hint(); - const GPUTarget gpu_target = k.get_target(); - - // Configure the local work size for Bifrost with a value obtained - // via exhaustive autotuning for the MobileNets tensor shapes. - if(gpu_target_is_in(gpu_target, - GPUTarget::G71, GPUTarget::G72, GPUTarget::G76, - GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT, - GPUTarget::G52, GPUTarget::G52LIT)) - { - lws_hint = cl::NDRange(1, 1, 1); - } - - k.set_lws_hint(lws_hint); -} - -void tune_gemm_kernel(CLGEMMMatrixMultiplyKernel &k) -{ - cl::NDRange lws_hint = k.lws_hint(); - const GPUTarget gpu_target = k.get_target(); - - // Configure LWS hint - switch(gpu_target) - { - case GPUTarget::G71: - case GPUTarget::G72: - case GPUTarget::G51: - case GPUTarget::G51BIG: - case GPUTarget::G51LIT: - case GPUTarget::G52: - case GPUTarget::G52LIT: - case GPUTarget::G76: - if(k._input1->info()->dimension(1) == 24) - { - // LWS optimized for the 11x11 AlexNet convolution on Bifrost. 
- lws_hint = cl::NDRange(2, 2); - } - else if(k._output->info()->dimension(1) == 196) - { - lws_hint = cl::NDRange(1, 7); - } - else - { - lws_hint = cl::NDRange(8, 8); - } - break; - default: - lws_hint = cl::NullRange; - } - - k.set_lws_hint(lws_hint); -} - -void tune_pooling_kernel(CLPoolingLayerKernel &k) -{ - cl::NDRange lws_hint = k.lws_hint(); - const GPUTarget gpu_target = k.get_target(); - - // Configure the local work size (hint) from the first two dimensions of the global work size. - // On Bifrost, this works for up to 35x35xC filters, for which the pooling_layer_3_optimized - // kernel is launched with gws=(9, 33, C). In any case, the hint will be ignored if it is - // invalid (e.g. exceeds the maximum workgroup size that the kernel can be launched with). - if(k._input->info()->data_layout() == DataLayout::NCHW) - { - if(gpu_target_is_in(gpu_target, - GPUTarget::G71, GPUTarget::G72, GPUTarget::G76, - GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT, - GPUTarget::G52, GPUTarget::G52LIT)) - { - cl::NDRange gws = ICLKernel::gws_from_window(k.window()); - lws_hint = cl::NDRange(gws[0], gws[1], 1); - } - } - - k.set_lws_hint(lws_hint); -} - -void tune_scale_kernel(CLScaleKernel &k) -{ - cl::NDRange lws_hint = k.lws_hint(); - const GPUTarget gpu_target = k.get_target(); - const DataType dt = k.input()->info()->data_type(); - const InterpolationPolicy interpolation = k._interpolationPolicy; - - // Configure the local work size for Bifrost, interpolation (bilinear) and datatype F32. - // The value are obtained via exhaustive autotuning. - if(gpu_target_is_in(gpu_target, GPUTarget::G71, GPUTarget::G72) && (dt == DataType::F32) && (interpolation == InterpolationPolicy::BILINEAR)) - { - auto dim_0 = k.output()->info()->dimension(0); - if(dim_0 == 480) - { - lws_hint = cl::NDRange(2, 1); - } - else if(dim_0 == 3120) - { - lws_hint = cl::NDRange(2, 8); - } - else if(dim_0 == 4160) - { - lws_hint = cl::NDRange(4, 8); - } - k.set_lws_hint(lws_hint); - } -} -} // namespace - -void BifrostTuner::tune_kernel_static(ICLKernel &kernel) -{ - if(dynamic_cast<CLDirectConvolutionLayerKernel *>(&kernel) != nullptr) - { - tune_direct_convolution_kernel(*utils::cast::polymorphic_downcast<CLDirectConvolutionLayerKernel *>(&kernel)); - } - else if(dynamic_cast<CLCol2ImKernel *>(&kernel) != nullptr) - { - tune_col2im_kernel(*utils::cast::polymorphic_downcast<CLCol2ImKernel *>(&kernel)); - } - else if(dynamic_cast<CLIm2ColKernel *>(&kernel) != nullptr) - { - tune_im2col_kernel(*utils::cast::polymorphic_downcast<CLIm2ColKernel *>(&kernel)); - } - else if(dynamic_cast<CLGEMMMatrixVectorMultiplyKernel *>(&kernel) != nullptr) - { - tune_gemv_kernel(*utils::cast::polymorphic_downcast<CLGEMMMatrixVectorMultiplyKernel *>(&kernel)); - } - else if(dynamic_cast<CLGEMMMatrixMultiplyKernel *>(&kernel) != nullptr) - { - tune_gemm_kernel(*utils::cast::polymorphic_downcast<CLGEMMMatrixMultiplyKernel *>(&kernel)); - } - else if(dynamic_cast<CLPoolingLayerKernel *>(&kernel) != nullptr) - { - tune_pooling_kernel(*utils::cast::polymorphic_downcast<CLPoolingLayerKernel *>(&kernel)); - } - else if(dynamic_cast<CLScaleKernel *>(&kernel) != nullptr) - { - tune_scale_kernel(*utils::cast::polymorphic_downcast<CLScaleKernel *>(&kernel)); - } -} - -void BifrostTuner::tune_kernel_dynamic(ICLKernel &kernel) -{ - ARM_COMPUTE_UNUSED(kernel); -} -} // namespace tuners -} // namespace arm_compute
\ No newline at end of file diff --git a/src/runtime/CL/tuners/CLLWSList.cpp b/src/runtime/CL/tuners/CLLWSList.cpp deleted file mode 100644 index 30fd558ef3..0000000000 --- a/src/runtime/CL/tuners/CLLWSList.cpp +++ /dev/null @@ -1,114 +0,0 @@ -/* - * Copyright (c) 2019 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/CL/tuners/CLLWSList.h" - -namespace arm_compute -{ -namespace cl_tuner -{ -size_t CLLWSList::size() -{ - return search_space_shape.total_size(); -} - -cl::NDRange CLLWSListExhaustive::operator[](size_t index) -{ - ARM_COMPUTE_ERROR_ON(index >= size()); - auto coords = index2coords(search_space_shape, index); - return cl::NDRange{ coords[0] + 1U, coords[1] + 1U, coords[2] + 1U }; -} - -CLLWSListExhaustive::CLLWSListExhaustive(const cl::NDRange &gws) -{ - ARM_COMPUTE_UNUSED(gws); - search_space_shape = TensorShape(max_lws_supported_x, - max_lws_supported_y, - max_lws_supported_z); -} - -cl::NDRange CLLWSListNormal::operator[](size_t index) -{ - ARM_COMPUTE_ERROR_ON(index >= size()); - auto coords = index2coords(search_space_shape, index); - return cl::NDRange{ _lws_x[coords[0]], _lws_y[coords[1]], _lws_z[coords[2]] }; -} - -CLLWSListNormal::CLLWSListNormal(const cl::NDRange &gws) -{ - auto lws_x_max = std::min(static_cast<unsigned int>(gws[0]), max_lws_supported_x); - auto lws_y_max = std::min(static_cast<unsigned int>(gws[1]), max_lws_supported_y); - auto lws_z_max = std::min(static_cast<unsigned int>(gws[2]), max_lws_supported_z); - - // Initialize the LWS values to test - initialize_lws_values(_lws_x, gws[0], lws_x_max, gws[2] > 16); // Explore lws that are not factors of gws only when gws[2] > 16 - initialize_lws_values(_lws_y, gws[1], lws_y_max, gws[2] > 16); // Explore lws that are not factors of gws only when gws[2] > 16 - initialize_lws_values(_lws_z, gws[2], lws_z_max, false); - - search_space_shape = TensorShape(_lws_x.size(), _lws_y.size(), _lws_z.size()); -} - -void CLLWSListNormal::initialize_lws_values(std::vector<unsigned int> &lws, unsigned int gws, unsigned int lws_max, bool mod_let_one) -{ - lws.push_back(1); - - for(unsigned int i = 2; i <= lws_max; ++i) - { - // Power of two condition - const bool is_power_of_two = (i & (i - 1)) == 0; - - // Condition for the module accordingly with the mod_let_one flag - const bool mod_cond = mod_let_one ? 
(gws % i) <= 1 : (gws % i) == 0;
-
-        if(mod_cond || is_power_of_two)
-        {
-            lws.push_back(i);
-        }
-    }
-}
-
-CLLWSListRapid::CLLWSListRapid(const cl::NDRange &gws)
-{
-    auto lws_x_max = std::min(static_cast<unsigned int>(gws[0]), 8u); // Limit exploration to 1 - 8
-    auto lws_y_max = std::min(static_cast<unsigned int>(gws[1]), 4u); // Limit exploration to 1 - 4
-    auto lws_z_max = std::min(static_cast<unsigned int>(gws[2]), 4u); // Limit exploration to 1 - 4
-
-    // Initialize the LWS values to test
-    initialize_lws_values(_lws_x, lws_x_max);
-    initialize_lws_values(_lws_y, lws_y_max);
-    initialize_lws_values(_lws_z, lws_z_max);
-
-    search_space_shape = TensorShape(_lws_x.size(), _lws_y.size(), _lws_z.size());
-}
-
-void CLLWSListRapid::initialize_lws_values(std::vector<unsigned int> &lws, unsigned int lws_max)
-{
-    lws.push_back(1);
-
-    for(unsigned int i = 2; i <= lws_max; i *= 4)
-    {
-        lws.push_back(i);
-    }
-}
-} // namespace cl_tuner
-} // namespace arm_compute
diff --git a/src/runtime/CL/tuners/CLTuningParametersList.cpp b/src/runtime/CL/tuners/CLTuningParametersList.cpp
new file mode 100644
index 0000000000..5e3907f1ea
--- /dev/null
+++ b/src/runtime/CL/tuners/CLTuningParametersList.cpp
@@ -0,0 +1,287 @@
+/*
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/tuners/CLTuningParametersList.h"
+
+namespace arm_compute
+{
+namespace cl_tuner
+{
+constexpr unsigned int max_lws_supported_x{64u};
+constexpr unsigned int max_lws_supported_y{32u};
+constexpr unsigned int max_lws_supported_z{32u};
+
+/** Non-instantiable base class for Tuning parameters combinations that use Index2Coord mapping */
+class CLTuningParametersList : public ICLTuningParametersList
+{
+protected:
+    /* Shape of 4-D search space */
+    TensorShape               search_space_shape{0, 0, 0, 0};
+    std::vector<unsigned int> _lws_x{0};
+    std::vector<unsigned int> _lws_y{0};
+    std::vector<unsigned int> _lws_z{0};
+    std::vector<int>          _wbsm{0}; /* Modifies the batch size of workgroups distributed to compute units.
+                                           The value is in the range [-31,+31].
+                                           When 0, the runtime-selected wbs is used unmodified.
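+                                           For illustration (derived from the constructors below): when
+                                           CLTuningInfo::tune_wbsm is enabled, the candidate modifiers are
+                                           {-3,...,+3} in exhaustive mode, {-2,...,+2} in normal mode and
+                                           {-1, 0, 1} in rapid mode.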
 */
+
+    /** Constructor */
+    CLTuningParametersList() = default;
+    /** Copy Constructor */
+    CLTuningParametersList(const CLTuningParametersList &) = default;
+    /** Move Constructor */
+    CLTuningParametersList(CLTuningParametersList &&) noexcept(true) = default;
+    /** Assignment */
+    CLTuningParametersList &operator=(const CLTuningParametersList &) = default;
+    /** Move Assignment */
+    CLTuningParametersList &operator=(CLTuningParametersList &&) noexcept(true) = default;
+    /** Destructor */
+    virtual ~CLTuningParametersList() = default;
+
+    // Inherited methods overridden:
+    virtual size_t size() override;
+};
+
+/** Exhaustive list of all possible Tuning parameters (lws) values */
+class CLTuningParametersListExhaustive : public CLTuningParametersList
+{
+public:
+    /** Prevent default constructor calls */
+    CLTuningParametersListExhaustive() = delete;
+    /** Constructor */
+    CLTuningParametersListExhaustive(const cl::NDRange &gws, CLTuningInfo tuning_info);
+    /** Copy Constructor */
+    CLTuningParametersListExhaustive(const CLTuningParametersListExhaustive &) = default;
+    /** Move Constructor */
+    CLTuningParametersListExhaustive(CLTuningParametersListExhaustive &&) noexcept(true) = default;
+    /** Assignment */
+    CLTuningParametersListExhaustive &operator=(const CLTuningParametersListExhaustive &) = default;
+    /** Move Assignment */
+    CLTuningParametersListExhaustive &operator=(CLTuningParametersListExhaustive &&) noexcept(true) = default;
+    /** Destructor */
+    ~CLTuningParametersListExhaustive() = default;
+
+    // Inherited methods overridden:
+    CLTuningParams operator[](size_t) override;
+};
+
+/** A subset of LWS values that are either factors of gws when gws[2] < 16 or powers of 2 */
+class CLTuningParametersListNormal : public CLTuningParametersList
+{
+public:
+    /** Constructor */
+    CLTuningParametersListNormal(const cl::NDRange &gws, CLTuningInfo tuning_info);
+    /** Copy Constructor */
+    CLTuningParametersListNormal(const CLTuningParametersListNormal &) = default;
+    /** Move Constructor */
+    CLTuningParametersListNormal(CLTuningParametersListNormal &&) noexcept(true) = default;
+    /** Assignment */
+    CLTuningParametersListNormal &operator=(const CLTuningParametersListNormal &) = default;
+    /** Move Assignment */
+    CLTuningParametersListNormal &operator=(CLTuningParametersListNormal &&) noexcept(true) = default;
+    /** Destructor */
+    ~CLTuningParametersListNormal() = default;
+
+    // Inherited methods overridden:
+    CLTuningParams operator[](size_t) override;
+
+    /** Default constructor (needed by the derived rapid list) */
+    CLTuningParametersListNormal() = default;
+
+private:
+    /** Utility function used to initialize the LWS values to test.
+     * Only the LWS values which are powers of 2 or satisfy the modulo conditions with GWS are taken into account by the CLTuner
+     *
+     * @param[in, out] lws         Vector of LWS to test
+     * @param[in]      gws         Size of the specific GWS
+     * @param[in]      lws_max     Max LWS value allowed to be tested
+     * @param[in]      mod_let_one True if the result of the modulo operation between gws and the lws can be less than or equal to one.
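+     *
+     * Illustrative example (not part of the original documentation; values derived
+     * from the implementation below):
+     * @code
+     * std::vector<unsigned int> lws;
+     * initialize_lws_values(lws, 12, 8, false); // gws = 12, lws_max = 8, mod_let_one = false
+     * // lws now holds {1, 2, 3, 4, 6, 8}: each i either divides 12 exactly or is a power of two
+     * @endcode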
+     */
+    void
+    initialize_lws_values(std::vector<unsigned int> &lws, unsigned int gws, unsigned int lws_max, bool mod_let_one);
+};
+
+/** A minimal subset of LWS values containing only 1, 2 and 4/8 per dimension */
+class CLTuningParametersListRapid : public CLTuningParametersListNormal
+{
+public:
+    /** Prevent default constructor calls */
+    CLTuningParametersListRapid() = delete;
+    /** Constructor */
+    CLTuningParametersListRapid(const cl::NDRange &gws, CLTuningInfo tuning_info);
+    /** Copy Constructor */
+    CLTuningParametersListRapid(const CLTuningParametersListRapid &) = default;
+    /** Move Constructor */
+    CLTuningParametersListRapid(CLTuningParametersListRapid &&) noexcept(true) = default;
+    /** Assignment */
+    CLTuningParametersListRapid &operator=(const CLTuningParametersListRapid &) = default;
+    /** Move Assignment */
+    CLTuningParametersListRapid &operator=(CLTuningParametersListRapid &&) noexcept(true) = default;
+    /** Destructor */
+    virtual ~CLTuningParametersListRapid() = default;
+
+private:
+    /** Utility function used to initialize the LWS values to test.
+     * Only LWS values of 1, 2 and 4/8 in each dimension are taken into account by the CLTuner
+     *
+     * @param[in, out] lws     Vector of LWS to test
+     * @param[in]      lws_max Max LWS value allowed to be tested
+     */
+    void initialize_lws_values(std::vector<unsigned int> &lws, unsigned int lws_max);
+};
+
+size_t CLTuningParametersList::size()
+{
+    return search_space_shape.total_size();
+}
+
+CLTuningParams CLTuningParametersListExhaustive::operator[](size_t index)
+{
+    ARM_COMPUTE_ERROR_ON(index >= size());
+    auto coords = index2coords(search_space_shape, index);
+    return CLTuningParams(coords[0] + 1U, coords[1] + 1U, coords[2] + 1U, static_cast<int>(coords[3]));
+}
+
+CLTuningParametersListExhaustive::CLTuningParametersListExhaustive(const cl::NDRange &gws, CLTuningInfo tuning_info)
+{
+    const auto lws_x_max = std::min(static_cast<unsigned int>(gws[0]), max_lws_supported_x);
+    const auto lws_y_max = std::min(static_cast<unsigned int>(gws[1]), max_lws_supported_y);
+    const auto lws_z_max = std::min(static_cast<unsigned int>(gws[2]), max_lws_supported_z);
+
+    search_space_shape[0] = lws_x_max;
+    search_space_shape[1] = lws_y_max;
+    search_space_shape[2] = lws_z_max;
+    search_space_shape[3] = 1;
+    if (tuning_info.tune_wbsm)
+    {
+        _wbsm                 = {-3, -2, -1, 0, 1, 2, 3};
+        search_space_shape[3] = _wbsm.size();
+    }
+}
+
+CLTuningParams CLTuningParametersListNormal::operator[](size_t index)
+{
+    ARM_COMPUTE_ERROR_ON(index >= size());
+    auto coords = index2coords(search_space_shape, index);
+    return CLTuningParams(_lws_x[coords[0]], _lws_y[coords[1]], _lws_z[coords[2]], _wbsm[coords[3]]);
+}
+
+CLTuningParametersListNormal::CLTuningParametersListNormal(const cl::NDRange &gws, CLTuningInfo tuning_info)
+{
+    const auto lws_x_max = std::min(static_cast<unsigned int>(gws[0]), max_lws_supported_x);
+    const auto lws_y_max = std::min(static_cast<unsigned int>(gws[1]), max_lws_supported_y);
+    const auto lws_z_max = std::min(static_cast<unsigned int>(gws[2]), max_lws_supported_z);
+
+    // Initialize the tuning parameters values to test
+    _lws_x = {};
+    _lws_y = {};
+    _lws_z = {};
+    initialize_lws_values(_lws_x, gws[0], lws_x_max,
+                          gws[2] > 16); // Explore lws that are not factors of gws only when gws[2] > 16
+    initialize_lws_values(_lws_y, gws[1], lws_y_max,
+                          gws[2] > 16); // Explore lws that are not factors of gws only when gws[2] > 16
+    initialize_lws_values(_lws_z, gws[2], lws_z_max, false);
+
+    search_space_shape[0] = _lws_x.size();
    search_space_shape[1] = _lws_y.size();
+    search_space_shape[2] = _lws_z.size();
+    search_space_shape[3] = 1;
+    if (tuning_info.tune_wbsm)
+    {
+        _wbsm                 = {-2, -1, 0, 1, 2};
+        search_space_shape[3] = _wbsm.size();
+    }
+}
+
+void CLTuningParametersListNormal::initialize_lws_values(std::vector<unsigned int> &lws,
+                                                         unsigned int               gws,
+                                                         unsigned int               lws_max,
+                                                         bool                       mod_let_one)
+{
+    lws.push_back(1);
+
+    for (unsigned int i = 2; i <= lws_max; ++i)
+    {
+        // Power of two condition
+        const bool is_power_of_two = (i & (i - 1)) == 0;
+
+        // Modulo condition according to the mod_let_one flag
+        const bool mod_cond = mod_let_one ? (gws % i) <= 1 : (gws % i) == 0;
+
+        if (mod_cond || is_power_of_two)
+        {
+            lws.push_back(i);
+        }
+    }
+}
+
+CLTuningParametersListRapid::CLTuningParametersListRapid(const cl::NDRange &gws, CLTuningInfo tuning_info)
+{
+    const auto lws_x_max = std::min(static_cast<unsigned int>(gws[0]), 8u); // Limit exploration to 1 - 8
+    const auto lws_y_max = std::min(static_cast<unsigned int>(gws[1]), 4u); // Limit exploration to 1 - 4
+    const auto lws_z_max = std::min(static_cast<unsigned int>(gws[2]), 4u); // Limit exploration to 1 - 4
+
+    // Initialize the LWS values to test
+    _lws_x = {};
+    _lws_y = {};
+    _lws_z = {};
+    initialize_lws_values(_lws_x, lws_x_max);
+    initialize_lws_values(_lws_y, lws_y_max);
+    initialize_lws_values(_lws_z, lws_z_max);
+
+    search_space_shape[0] = _lws_x.size();
+    search_space_shape[1] = _lws_y.size();
+    search_space_shape[2] = _lws_z.size();
+    search_space_shape[3] = 1;
+    if (tuning_info.tune_wbsm)
+    {
+        _wbsm                 = {-1, 0, 1};
+        search_space_shape[3] = _wbsm.size();
+    }
+}
+
+void CLTuningParametersListRapid::initialize_lws_values(std::vector<unsigned int> &lws, unsigned int lws_max)
+{
+    lws.push_back(1);
+
+    for (unsigned int i = 2; i <= lws_max; i *= 4)
+    {
+        lws.push_back(i);
+    }
+}
+
+std::unique_ptr<ICLTuningParametersList> get_tuning_parameters_list(CLTuningInfo tuning_info, const cl::NDRange &gws)
+{
+    switch (tuning_info.tuner_mode)
+    {
+        case CLTunerMode::EXHAUSTIVE:
+            return std::make_unique<CLTuningParametersListExhaustive>(gws, tuning_info);
+        case CLTunerMode::NORMAL:
+            return std::make_unique<CLTuningParametersListNormal>(gws, tuning_info);
+        case CLTunerMode::RAPID:
+            return std::make_unique<CLTuningParametersListRapid>(gws, tuning_info);
+        default:
+            return nullptr;
+    }
+}
+} // namespace cl_tuner
+} // namespace arm_compute
diff --git a/src/runtime/CPP/CPPScheduler.cpp b/src/runtime/CPP/CPPScheduler.cpp
index 0a03497cb9..9fbdc3a4dd 100644
--- a/src/runtime/CPP/CPPScheduler.cpp
+++ b/src/runtime/CPP/CPPScheduler.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2023 Arm Limited.
* * SPDX-License-Identifier: MIT * @@ -26,17 +26,21 @@ #include "arm_compute/core/CPP/ICPPKernel.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Log.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/runtime/CPUUtils.h" +#include "arm_compute/core/utils/misc/Utility.h" + #include "support/Mutex.h" #include <atomic> #include <condition_variable> #include <iostream> #include <list> +#include <memory> #include <mutex> #include <system_error> #include <thread> +#include <vector> namespace arm_compute { @@ -50,8 +54,7 @@ public: * @param[in] start First value that will be returned by the feeder * @param[in] end End condition (The last value returned by get_next() will be end - 1) */ - explicit ThreadFeeder(unsigned int start = 0, unsigned int end = 0) - : _atomic_counter(start), _end(end) + explicit ThreadFeeder(unsigned int start = 0, unsigned int end = 0) : _atomic_counter(start), _end(end) { } /** Return the next element in the range if there is one. @@ -71,61 +74,6 @@ private: const unsigned int _end; }; -/** Given two dimensions and a maxium number of threads to utilise, calcualte the best - * combination of threads that fit in (mutliplied together) max_threads. - * - * This algorithm assumes that work in either of the dimensions is equally difficult - * to compute - * - * @returns [m_nthreads, n_nthreads] A pair of the threads that should be used in each dimension - */ -std::pair<unsigned, unsigned> split_2d(unsigned max_threads, std::size_t m, std::size_t n) -{ - /* - * We want the same ratio of threads in M & N to the ratio of m and n problem size - * - * Therefore: mt/nt == m/n where mt*nt == max_threads - * - * max_threads/nt = mt & (max_threads/nt) * (m/n) = nt - * nt^2 = max_threads * (m/n) - * nt = sqrt( max_threads * (m/n) ) - */ - //ratio of m to n in problem dimensions - double ratio = m / static_cast<double>(n); - - // nt = sqrt(max_threads * (m / n) ) - const unsigned adjusted = std::round( - std::sqrt(max_threads * ratio)); - - //find the nearest factor of max_threads - for(unsigned i = 0; i!= adjusted; ++i) - { - //try down - const unsigned adj_down = adjusted - i; - if(max_threads % adj_down == 0) - { - return { adj_down, max_threads / adj_down }; - } - - //try up - const unsigned adj_up = adjusted + i; - if(max_threads % adj_up == 0) - { - return { adj_up, max_threads / adj_up }; - } - } - - //we didn't find anything so lets bail out with maxes biased to the largest dimension - if(m > n) - { - return{ std::min<unsigned>(m, max_threads), 1 }; - } - else - { - return{ 1, std::min<unsigned>(n, max_threads) }; - } -} - /** Execute workloads[info.thread_id] first, then call the feeder to get the index of the next workload to run. * * Will run workloads until the feeder reaches the end of its range. @@ -141,51 +89,77 @@ void process_workloads(std::vector<IScheduler::Workload> &workloads, ThreadFeede { ARM_COMPUTE_ERROR_ON(workload_index >= workloads.size()); workloads[workload_index](info); - } - while(feeder.get_next(workload_index)); + } while (feeder.get_next(workload_index)); } -} //namespace - -struct CPPScheduler::Impl final +/** Set thread affinity. 
Pin current thread to a particular core
+ *
+ * @param[in] core_id ID of the core to which the current thread is pinned
+ */
+void set_thread_affinity(int core_id)
 {
-    explicit Impl(unsigned int thread_hint)
-        : _num_threads(thread_hint), _threads(_num_threads - 1)
-    {
-    }
-    void set_num_threads(unsigned int num_threads, unsigned int thead_hint)
+    if (core_id < 0)
     {
-        _num_threads = num_threads == 0 ? thead_hint : num_threads;
-        _threads.resize(_num_threads - 1);
-    }
-    unsigned int num_threads() const
-    {
-        return _num_threads;
+        return;
     }
-    void run_workloads(std::vector<IScheduler::Workload> &workloads);
-
-    class Thread;
+#if !defined(_WIN64) && !defined(__APPLE__) && !defined(__OpenBSD__)
+    cpu_set_t set;
+    CPU_ZERO(&set);
+    CPU_SET(core_id, &set);
+    ARM_COMPUTE_EXIT_ON_MSG(sched_setaffinity(0, sizeof(set), &set), "Error setting thread affinity");
+#endif /* !defined(_WIN64) && !defined(__APPLE__) && !defined(__OpenBSD__) */
+}

-    unsigned int       _num_threads;
-    std::list<Thread>  _threads;
-    arm_compute::Mutex _run_workloads_mutex{};
-};
+/** There are currently 2 scheduling modes supported by CPPScheduler
+ *
+ * Linear:
+ *  The default mode where all the scheduling is carried out by the main thread linearly (in a loop).
+ *  E.g. if there are 8 threads in total, there will be 1 main thread + 7 threads in the thread pool, and it is the
+ *  main thread's responsibility to start all the other threads in the thread pool.
+ *
+ * Fanout:
+ *  In fanout mode, the scheduling (starting other threads) task is distributed across many threads instead of just
+ *  the main thread.
+ *
+ *  The scheduler has a fixed parameter: wake_fanout, and the scheduling sequence goes like this:
+ *  1. Main thread wakes the first wake_fanout - 1 FanoutThreads from the thread pool
+ *      From thread: 0
+ *      To thread (non-inclusive): wake_fanout - 1
+ *  2. Each FanoutThread then wakes up to wake_fanout FanoutThreads from the thread pool:
+ *      From thread: (i + 1) * wake_fanout - 1
+ *      To thread (non-inclusive): (i + 2) * wake_fanout - 1
+ *      where i is the current thread's thread id
+ *      The end is clamped at the size of the thread pool and at the number of threads in use - 1
+ *
+ *  E.g. for a total number of 8 threads (1 main thread, 7 FanoutThreads in the thread pool) with a fanout of 3:
+ *  1. Main thread wakes FanoutThreads 0, 1
+ *  2. FanoutThread 0 wakes FanoutThreads 2, 3, 4
+ *  3. FanoutThread 1 wakes FanoutThreads 5, 6
+ */

-class CPPScheduler::Impl::Thread final
+class Thread final
 {
 public:
-    /** Start a new thread. */
-    Thread();
+    /** Start a new thread
+     *
+     * Thread will be pinned to a given core id if value is non-negative
+     *
+     * @param[in] core_pin Core id to pin the thread on. If negative, no thread pinning will take place
+     */
+    explicit Thread(int core_pin = -1);

-    Thread(const Thread &) = delete;
+    Thread(const Thread &)            = delete;
     Thread &operator=(const Thread &) = delete;
     Thread(Thread &&)                 = delete;
-    Thread &operator=(Thread &&) = delete;
+    Thread &operator=(Thread &&)      = delete;

     /** Destructor. Make the thread join. */
     ~Thread();

+    /** Set workloads */
+    void set_workload(std::vector<IScheduler::Workload> *workloads, ThreadFeeder &feeder, const ThreadInfo &info);
+
     /** Request the worker thread to start executing workloads.
      *
      * The thread will start by executing workloads[info.thread_id] and will then call the feeder to
@@ -194,47 +168,72 @@ public:
      * @note This function will return as soon as the workloads have been sent to the worker thread.
      * wait() needs to be called to ensure the execution is complete.
*/ - void start(std::vector<IScheduler::Workload> *workloads, ThreadFeeder &feeder, const ThreadInfo &info); + void start(); /** Wait for the current kernel execution to complete. */ - void wait(); + std::exception_ptr wait(); /** Function ran by the worker thread. */ void worker_thread(); + /** Set the scheduling strategy to be linear */ + void set_linear_mode() + { + _thread_pool = nullptr; + _wake_beg = 0; + _wake_end = 0; + } + + /** Set the scheduling strategy to be fanout */ + void set_fanout_mode(std::list<Thread> *thread_pool, unsigned int wake_beg, unsigned int wake_end) + { + _thread_pool = thread_pool; + _wake_beg = wake_beg; + _wake_end = wake_end; + } + private: std::thread _thread{}; ThreadInfo _info{}; - std::vector<IScheduler::Workload> *_workloads{ nullptr }; - ThreadFeeder *_feeder{ nullptr }; + std::vector<IScheduler::Workload> *_workloads{nullptr}; + ThreadFeeder *_feeder{nullptr}; std::mutex _m{}; std::condition_variable _cv{}; - bool _wait_for_work{ false }; - bool _job_complete{ true }; - std::exception_ptr _current_exception{ nullptr }; + bool _wait_for_work{false}; + bool _job_complete{true}; + std::exception_ptr _current_exception{nullptr}; + int _core_pin{-1}; + std::list<Thread> *_thread_pool{nullptr}; + unsigned int _wake_beg{0}; + unsigned int _wake_end{0}; }; -CPPScheduler::Impl::Thread::Thread() +Thread::Thread(int core_pin) : _core_pin(core_pin) { _thread = std::thread(&Thread::worker_thread, this); } -CPPScheduler::Impl::Thread::~Thread() +Thread::~Thread() { // Make sure worker thread has ended - if(_thread.joinable()) + if (_thread.joinable()) { ThreadFeeder feeder; - start(nullptr, feeder, ThreadInfo()); + set_workload(nullptr, feeder, ThreadInfo()); + start(); _thread.join(); } } -void CPPScheduler::Impl::Thread::start(std::vector<IScheduler::Workload> *workloads, ThreadFeeder &feeder, const ThreadInfo &info) +void Thread::set_workload(std::vector<IScheduler::Workload> *workloads, ThreadFeeder &feeder, const ThreadInfo &info) { _workloads = workloads; _feeder = &feeder; _info = info; +} + +void Thread::start() +{ { std::lock_guard<std::mutex> lock(_m); _wait_for_work = true; @@ -243,22 +242,20 @@ void CPPScheduler::Impl::Thread::start(std::vector<IScheduler::Workload> *worklo _cv.notify_one(); } -void CPPScheduler::Impl::Thread::wait() +std::exception_ptr Thread::wait() { { std::unique_lock<std::mutex> lock(_m); _cv.wait(lock, [&] { return _job_complete; }); } - - if(_current_exception) - { - std::rethrow_exception(_current_exception); - } + return _current_exception; } -void CPPScheduler::Impl::Thread::worker_thread() +void Thread::worker_thread() { - while(true) + set_thread_affinity(_core_pin); + + while (true) { std::unique_lock<std::mutex> lock(_m); _cv.wait(lock, [&] { return _wait_for_work; }); @@ -266,12 +263,24 @@ void CPPScheduler::Impl::Thread::worker_thread() _current_exception = nullptr; - // Time to exit - if(_workloads == nullptr) + // Exit if the worker thread has not been fed with workloads + if (_workloads == nullptr || _feeder == nullptr) { return; } + // Wake up more peer threads from thread pool if this job has been delegated to the current thread + if (_thread_pool != nullptr) + { + auto thread_it = _thread_pool->begin(); + std::advance(thread_it, std::min(static_cast<unsigned int>(_thread_pool->size()), _wake_beg)); + auto wake_end = std::min(_wake_end, static_cast<unsigned int>(_info.num_threads - 1)); + for (unsigned int t = _wake_beg; t < wake_end; ++t, ++thread_it) + { + thread_it->start(); + } + } + #ifndef 
ARM_COMPUTE_EXCEPTIONS_DISABLED
        try
        {
@@ -280,19 +289,142 @@
 #ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED
         }
-        catch(...)
+        catch (...)
         {
             _current_exception = std::current_exception();
         }
 #endif /* ARM_COMPUTE_EXCEPTIONS_DISABLED */
+        _workloads    = nullptr;
         _job_complete = true;
         lock.unlock();
         _cv.notify_one();
     }
 }
+} //namespace
+
+struct CPPScheduler::Impl final
+{
+    constexpr static unsigned int m_default_wake_fanout = 4;
+    enum class Mode
+    {
+        Linear,
+        Fanout
+    };
+    enum class ModeToggle
+    {
+        None,
+        Linear,
+        Fanout
+    };
+    explicit Impl(unsigned int thread_hint)
+        : _num_threads(thread_hint), _threads(_num_threads - 1), _mode(Mode::Linear), _wake_fanout(0U)
+    {
+        const auto mode_env_v = utility::tolower(utility::getenv("ARM_COMPUTE_CPP_SCHEDULER_MODE"));
+        if (mode_env_v == "linear")
+        {
+            _forced_mode = ModeToggle::Linear;
+        }
+        else if (mode_env_v == "fanout")
+        {
+            _forced_mode = ModeToggle::Fanout;
+        }
+        else
+        {
+            _forced_mode = ModeToggle::None;
+        }
+    }
+    void set_num_threads(unsigned int num_threads, unsigned int thread_hint)
+    {
+        _num_threads = num_threads == 0 ? thread_hint : num_threads;
+        _threads.resize(_num_threads - 1);
+        auto_switch_mode(_num_threads);
+    }
+    void set_num_threads_with_affinity(unsigned int num_threads, unsigned int thread_hint, BindFunc func)
+    {
+        _num_threads = num_threads == 0 ? thread_hint : num_threads;
+
+        // Set affinity on main thread
+        set_thread_affinity(func(0, thread_hint));
+
+        // Set affinity on worker threads
+        _threads.clear();
+        for (auto i = 1U; i < _num_threads; ++i)
+        {
+            _threads.emplace_back(func(i, thread_hint));
+        }
+        auto_switch_mode(_num_threads);
+    }
+    void auto_switch_mode(unsigned int num_threads_to_use)
+    {
+        // If the environment variable is set to either mode, it overrides the mode that would be selected based on num_threads_to_use
+        if (_forced_mode == ModeToggle::Fanout || (_forced_mode == ModeToggle::None && num_threads_to_use > 8))
+        {
+            set_fanout_mode(m_default_wake_fanout, num_threads_to_use);
+            ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE(
+                "Set CPPScheduler to Fanout mode, with wake up fanout : %d and %d threads to use\n",
+                this->wake_fanout(), num_threads_to_use);
+        }
+        else // Equivalent to (_forced_mode == ModeToggle::Linear || (_forced_mode == ModeToggle::None && num_threads_to_use <= 8))
+        {
+            set_linear_mode();
+            ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Set CPPScheduler to Linear mode, with %d threads to use\n",
+                                                      num_threads_to_use);
+        }
+    }
+    void set_linear_mode()
+    {
+        for (auto &thread : _threads)
+        {
+            thread.set_linear_mode();
+        }
+        _mode        = Mode::Linear;
+        _wake_fanout = 0U;
+    }
+    void set_fanout_mode(unsigned int wake_fanout, unsigned int num_threads_to_use)
+    {
+        ARM_COMPUTE_ERROR_ON(num_threads_to_use > _threads.size() + 1);
+        const auto actual_wake_fanout = std::max(2U, std::min(wake_fanout, num_threads_to_use - 1));
+        auto       thread_it          = _threads.begin();
+        for (auto i = 1U; i < num_threads_to_use; ++i, ++thread_it)
+        {
+            const auto wake_begin = i * actual_wake_fanout - 1;
+            const auto wake_end   = std::min((i + 1) * actual_wake_fanout - 1, num_threads_to_use - 1);
+            thread_it->set_fanout_mode(&_threads, wake_begin, wake_end);
+        }
+        // Reset the remaining threads' wake-up schedule
+        while (thread_it != _threads.end())
+        {
+            thread_it->set_fanout_mode(&_threads, 0U, 0U);
+            ++thread_it;
+        }
+        _mode        = Mode::Fanout;
+        _wake_fanout = actual_wake_fanout;
+    }
+    unsigned int num_threads() const
+    {
+        return _num_threads;
+    }
+    unsigned int wake_fanout() const
+    {
+        return
_wake_fanout; + } + Mode mode() const + { + return _mode; + } + + void run_workloads(std::vector<IScheduler::Workload> &workloads); + + unsigned int _num_threads; + std::list<Thread> _threads; + arm_compute::Mutex _run_workloads_mutex{}; + Mode _mode{Mode::Linear}; + ModeToggle _forced_mode{ModeToggle::None}; + unsigned int _wake_fanout{0}; +}; /* - * This singleton has been deprecated and will be removed in the next release + * This singleton has been deprecated and will be removed in future releases */ CPPScheduler &CPPScheduler::get() { @@ -300,8 +432,7 @@ CPPScheduler &CPPScheduler::get() return scheduler; } -CPPScheduler::CPPScheduler() - : _impl(support::cpp14::make_unique<Impl>(num_threads_hint())) +CPPScheduler::CPPScheduler() : _impl(std::make_unique<Impl>(num_threads_hint())) { } @@ -314,6 +445,13 @@ void CPPScheduler::set_num_threads(unsigned int num_threads) _impl->set_num_threads(num_threads, num_threads_hint()); } +void CPPScheduler::set_num_threads_with_affinity(unsigned int num_threads, BindFunc func) +{ + // No changes in the number of threads while current workloads are running + arm_compute::lock_guard<std::mutex> lock(_impl->_run_workloads_mutex); + _impl->set_num_threads_with_affinity(num_threads, num_threads_hint(), func); +} + unsigned int CPPScheduler::num_threads() const { return _impl->num_threads(); @@ -327,137 +465,93 @@ void CPPScheduler::run_workloads(std::vector<IScheduler::Workload> &workloads) // This is not great because different threads workloads won't run in parallel but at least they // won't interfere each other and deadlock. arm_compute::lock_guard<std::mutex> lock(_impl->_run_workloads_mutex); - const unsigned int num_threads = std::min(_impl->num_threads(), static_cast<unsigned int>(workloads.size())); - if(num_threads < 1) + const unsigned int num_threads_to_use = std::min(_impl->num_threads(), static_cast<unsigned int>(workloads.size())); + if (num_threads_to_use < 1) { return; } - ThreadFeeder feeder(num_threads, workloads.size()); + // Re-adjust the mode if the actual number of threads to use is different from the number of threads created + _impl->auto_switch_mode(num_threads_to_use); + int num_threads_to_start = 0; + switch (_impl->mode()) + { + case CPPScheduler::Impl::Mode::Fanout: + { + num_threads_to_start = static_cast<int>(_impl->wake_fanout()) - 1; + break; + } + case CPPScheduler::Impl::Mode::Linear: + default: + { + num_threads_to_start = static_cast<int>(num_threads_to_use) - 1; + break; + } + } + ThreadFeeder feeder(num_threads_to_use, workloads.size()); ThreadInfo info; - info.cpu_info = &_cpu_info; - info.num_threads = num_threads; + info.cpu_info = &cpu_info(); + info.num_threads = num_threads_to_use; unsigned int t = 0; auto thread_it = _impl->_threads.begin(); - for(; t < num_threads - 1; ++t, ++thread_it) + // Set num_threads_to_use - 1 workloads to the threads as the remaining 1 is left to the main thread + for (; t < num_threads_to_use - 1; ++t, ++thread_it) { info.thread_id = t; - thread_it->start(&workloads, feeder, info); + thread_it->set_workload(&workloads, feeder, info); } - - info.thread_id = t; - process_workloads(workloads, feeder, info); + thread_it = _impl->_threads.begin(); + for (int i = 0; i < num_threads_to_start; ++i, ++thread_it) + { + thread_it->start(); + } + info.thread_id = t; // Set main thread's thread_id + std::exception_ptr last_exception = nullptr; #ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED try { -#endif /* ARM_COMPUTE_EXCEPTIONS_DISABLED */ - for(auto &thread : _impl->_threads) - { - thread.wait(); - 
} +#endif /* ARM_COMPUTE_EXCEPTIONS_DISABLED */ + process_workloads(workloads, feeder, info); // Main thread processes workloads #ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED } - catch(const std::system_error &e) + catch (...) { - std::cerr << "Caught system_error with code " << e.code() << " meaning " << e.what() << '\n'; + last_exception = std::current_exception(); } -#endif /* ARM_COMPUTE_EXCEPTIONS_DISABLED */ -} -#endif /* DOXYGEN_SKIP_THIS */ -void CPPScheduler::schedule(ICPPKernel *kernel, const Hints &hints) -{ - ARM_COMPUTE_ERROR_ON_MSG(!kernel, "The child class didn't set the kernel"); - - const Window &max_window = kernel->window(); - - if(hints.split_dimension() == IScheduler::split_dimensions_all) + try { - /* - * if the split dim is size_t max then this signals we should parallelise over - * all dimensions - */ - const std::size_t m = max_window.num_iterations(Window::DimX); - const std::size_t n = max_window.num_iterations(Window::DimY); - - //in c++17 this can be swapped for auto [ m_threads, n_threads ] = split_2d(... - unsigned m_threads, n_threads; - std::tie(m_threads, n_threads) = split_2d(_impl->_num_threads, m, n); - - std::vector<IScheduler::Workload> workloads; - for(unsigned int ni = 0; ni != n_threads; ++ni) +#endif /* ARM_COMPUTE_EXCEPTIONS_DISABLED */ + thread_it = _impl->_threads.begin(); + for (unsigned int i = 0; i < num_threads_to_use - 1; ++i, ++thread_it) { - for(unsigned int mi = 0; mi != m_threads; ++mi) + std::exception_ptr current_exception = thread_it->wait(); + if (current_exception) { - workloads.push_back( - [ ni, mi, m_threads, n_threads, &max_window, &kernel ] - (const ThreadInfo & info) - { - //narrow the window to our mi-ni workload - Window win = max_window.split_window(Window::DimX, mi, m_threads) - .split_window(Window::DimY, ni, n_threads); - - win.validate(); - - Window thread_locator; - thread_locator.set(Window::DimX, Window::Dimension(mi, m_threads)); - thread_locator.set(Window::DimY, Window::Dimension(ni, n_threads)); - - thread_locator.validate(); - - kernel->run_nd(win, info, thread_locator); - } - ); + last_exception = current_exception; } } - run_workloads(workloads); + if (last_exception) + { + std::rethrow_exception(last_exception); + } +#ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED } - else + catch (const std::system_error &e) { - const unsigned int num_iterations = max_window.num_iterations(hints.split_dimension()); - const unsigned int num_threads = std::min(num_iterations, _impl->_num_threads); + std::cerr << "Caught system_error with code " << e.code() << " meaning " << e.what() << '\n'; + } +#endif /* ARM_COMPUTE_EXCEPTIONS_DISABLED */ +} +#endif /* DOXYGEN_SKIP_THIS */ - if(num_iterations == 0) - { - return; - } +void CPPScheduler::schedule_op(ICPPKernel *kernel, const Hints &hints, const Window &window, ITensorPack &tensors) +{ + schedule_common(kernel, hints, window, tensors); +} - if(!kernel->is_parallelisable() || num_threads == 1) - { - ThreadInfo info; - info.cpu_info = &_cpu_info; - kernel->run(max_window, info); - } - else - { - unsigned int num_windows = 0; - switch(hints.strategy()) - { - case StrategyHint::STATIC: - num_windows = num_threads; - break; - case StrategyHint::DYNAMIC: - { - const unsigned int granule_threshold = (hints.threshold() <= 0) ? num_threads : static_cast<unsigned int>(hints.threshold()); - // Make sure we don't use some windows which are too small as this might create some contention on the ThreadFeeder - num_windows = num_iterations > granule_threshold ? 
granule_threshold : num_iterations; - break; - } - default: - ARM_COMPUTE_ERROR("Unknown strategy"); - } - std::vector<IScheduler::Workload> workloads(num_windows); - for(unsigned int t = 0; t < num_windows; t++) - { - //Capture 't' by copy, all the other variables by reference: - workloads[t] = [t, &hints, &max_window, &num_windows, &kernel](const ThreadInfo & info) - { - Window win = max_window.split_window(hints.split_dimension(), t, num_windows); - win.validate(); - kernel->run(win, info); - }; - } - run_workloads(workloads); - } - } +void CPPScheduler::schedule(ICPPKernel *kernel, const Hints &hints) +{ + ITensorPack tensors; + schedule_common(kernel, hints, kernel->window(), tensors); } } // namespace arm_compute diff --git a/src/runtime/CPP/ICPPSimpleFunction.cpp b/src/runtime/CPP/ICPPSimpleFunction.cpp index 42a2d2228c..f4fef11acc 100644 --- a/src/runtime/CPP/ICPPSimpleFunction.cpp +++ b/src/runtime/CPP/ICPPSimpleFunction.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 ARM Limited. + * Copyright (c) 2017 Arm Limited. * * SPDX-License-Identifier: MIT * diff --git a/src/runtime/CPP/SingleThreadScheduler.cpp b/src/runtime/CPP/SingleThreadScheduler.cpp index 660a79652c..c46a2731d8 100644 --- a/src/runtime/CPP/SingleThreadScheduler.cpp +++ b/src/runtime/CPP/SingleThreadScheduler.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 ARM Limited. + * Copyright (c) 2017-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -37,23 +37,38 @@ void SingleThreadScheduler::set_num_threads(unsigned int num_threads) void SingleThreadScheduler::schedule(ICPPKernel *kernel, const Hints &hints) { - const Window &max_window = kernel->window(); - const unsigned int num_iterations = max_window.num_iterations(hints.split_dimension()); - if(num_iterations < 1) + const Window &max_window = kernel->window(); + + if (hints.split_dimension() != IScheduler::split_dimensions_all) { - return; + const unsigned int num_iterations = max_window.num_iterations(hints.split_dimension()); + if (num_iterations < 1) + { + return; + } } ThreadInfo info; - info.cpu_info = &_cpu_info; + info.cpu_info = &cpu_info(); kernel->run(kernel->window(), info); } +void SingleThreadScheduler::schedule_op(ICPPKernel *kernel, + const Hints &hints, + const Window &window, + ITensorPack &tensors) +{ + ARM_COMPUTE_UNUSED(hints); + ThreadInfo info; + info.cpu_info = &cpu_info(); + kernel->run_op(tensors, window, info); +} + void SingleThreadScheduler::run_workloads(std::vector<Workload> &workloads) { ThreadInfo info; - info.cpu_info = &_cpu_info; - for(auto &wl : workloads) + info.cpu_info = &cpu_info(); + for (auto &wl : workloads) { wl(info); } diff --git a/src/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.cpp b/src/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.cpp index 232f71dbea..94a1673d59 100644 --- a/src/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.cpp +++ b/src/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 ARM Limited. + * Copyright (c) 2018-2021 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -26,6 +26,8 @@ #include "arm_compute/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.h" #include "arm_compute/runtime/Scheduler.h" +#include "src/common/utils/Log.h" + namespace arm_compute { namespace @@ -40,28 +42,37 @@ void dequantize_tensor(const ITensor *input, ITensor *output) Iterator input_it(input, window); Iterator output_it(output, window); - switch(data_type) + switch (data_type) { case DataType::QASYMM8: - execute_window_loop(window, [&](const Coordinates &) - { - *reinterpret_cast<float *>(output_it.ptr()) = dequantize(*reinterpret_cast<const uint8_t *>(input_it.ptr()), qinfo.scale, qinfo.offset); - }, - input_it, output_it); + execute_window_loop( + window, + [&](const Coordinates &) + { + *reinterpret_cast<float *>(output_it.ptr()) = + dequantize(*reinterpret_cast<const uint8_t *>(input_it.ptr()), qinfo.scale, qinfo.offset); + }, + input_it, output_it); break; case DataType::QASYMM8_SIGNED: - execute_window_loop(window, [&](const Coordinates &) - { - *reinterpret_cast<float *>(output_it.ptr()) = dequantize_qasymm8_signed(*reinterpret_cast<const int8_t *>(input_it.ptr()), qinfo); - }, - input_it, output_it); + execute_window_loop( + window, + [&](const Coordinates &) + { + *reinterpret_cast<float *>(output_it.ptr()) = + dequantize_qasymm8_signed(*reinterpret_cast<const int8_t *>(input_it.ptr()), qinfo); + }, + input_it, output_it); break; case DataType::QASYMM16: - execute_window_loop(window, [&](const Coordinates &) - { - *reinterpret_cast<float *>(output_it.ptr()) = dequantize(*reinterpret_cast<const uint16_t *>(input_it.ptr()), qinfo.scale, qinfo.offset); - }, - input_it, output_it); + execute_window_loop( + window, + [&](const Coordinates &) + { + *reinterpret_cast<float *>(output_it.ptr()) = + dequantize(*reinterpret_cast<const uint16_t *>(input_it.ptr()), qinfo.scale, qinfo.offset); + }, + input_it, output_it); break; default: ARM_COMPUTE_ERROR("Unsupported data type"); @@ -78,28 +89,37 @@ void quantize_tensor(const ITensor *input, ITensor *output) Iterator input_it(input, window); Iterator output_it(output, window); - switch(data_type) + switch (data_type) { case DataType::QASYMM8: - execute_window_loop(window, [&](const Coordinates &) - { - *reinterpret_cast<uint8_t *>(output_it.ptr()) = quantize_qasymm8(*reinterpret_cast<const float *>(input_it.ptr()), qinfo); - }, - input_it, output_it); + execute_window_loop( + window, + [&](const Coordinates &) + { + *reinterpret_cast<uint8_t *>(output_it.ptr()) = + quantize_qasymm8(*reinterpret_cast<const float *>(input_it.ptr()), qinfo); + }, + input_it, output_it); break; case DataType::QASYMM8_SIGNED: - execute_window_loop(window, [&](const Coordinates &) - { - *reinterpret_cast<int8_t *>(output_it.ptr()) = quantize_qasymm8_signed(*reinterpret_cast<const float *>(input_it.ptr()), qinfo); - }, - input_it, output_it); + execute_window_loop( + window, + [&](const Coordinates &) + { + *reinterpret_cast<int8_t *>(output_it.ptr()) = + quantize_qasymm8_signed(*reinterpret_cast<const float *>(input_it.ptr()), qinfo); + }, + input_it, output_it); break; case DataType::QASYMM16: - execute_window_loop(window, [&](const Coordinates &) - { - *reinterpret_cast<uint16_t *>(output_it.ptr()) = quantize_qasymm16(*reinterpret_cast<const float *>(input_it.ptr()), qinfo); - }, - input_it, output_it); + execute_window_loop( + window, + [&](const Coordinates &) + { + *reinterpret_cast<uint16_t *>(output_it.ptr()) = + quantize_qasymm16(*reinterpret_cast<const float *>(input_it.ptr()), qinfo); + }, + 
input_it, output_it); break; default: ARM_COMPUTE_ERROR("Unsupported data type"); @@ -130,12 +150,23 @@ CPPBoxWithNonMaximaSuppressionLimit::CPPBoxWithNonMaximaSuppressionLimit(std::sh { } -void CPPBoxWithNonMaximaSuppressionLimit::configure(const ITensor *scores_in, const ITensor *boxes_in, const ITensor *batch_splits_in, ITensor *scores_out, ITensor *boxes_out, ITensor *classes, - ITensor *batch_splits_out, ITensor *keeps, ITensor *keeps_size, const BoxNMSLimitInfo info) +void CPPBoxWithNonMaximaSuppressionLimit::configure(const ITensor *scores_in, + const ITensor *boxes_in, + const ITensor *batch_splits_in, + ITensor *scores_out, + ITensor *boxes_out, + ITensor *classes, + ITensor *batch_splits_out, + ITensor *keeps, + ITensor *keeps_size, + const BoxNMSLimitInfo info) { ARM_COMPUTE_ERROR_ON_NULLPTR(scores_in, boxes_in, scores_out, boxes_out, classes); + ARM_COMPUTE_LOG_PARAMS(scores_in, boxes_in, batch_splits_in, scores_out, boxes_out, classes, batch_splits_out, + keeps, keeps_size, info); - _is_qasymm8 = scores_in->info()->data_type() == DataType::QASYMM8 || scores_in->info()->data_type() == DataType::QASYMM8_SIGNED; + _is_qasymm8 = scores_in->info()->data_type() == DataType::QASYMM8 || + scores_in->info()->data_type() == DataType::QASYMM8_SIGNED; _scores_in = scores_in; _boxes_in = boxes_in; @@ -146,7 +177,7 @@ void CPPBoxWithNonMaximaSuppressionLimit::configure(const ITensor *scores_in, co _batch_splits_out = batch_splits_out; _keeps = keeps; - if(_is_qasymm8) + if (_is_qasymm8) { // Manage intermediate buffers _memory_group.manage(&_scores_in_f32); @@ -156,7 +187,7 @@ void CPPBoxWithNonMaximaSuppressionLimit::configure(const ITensor *scores_in, co _memory_group.manage(&_classes_f32); _scores_in_f32.allocator()->init(scores_in->info()->clone()->set_data_type(DataType::F32)); _boxes_in_f32.allocator()->init(boxes_in->info()->clone()->set_data_type(DataType::F32)); - if(batch_splits_in != nullptr) + if (batch_splits_in != nullptr) { _memory_group.manage(&_batch_splits_in_f32); _batch_splits_in_f32.allocator()->init(batch_splits_in->info()->clone()->set_data_type(DataType::F32)); @@ -164,58 +195,70 @@ void CPPBoxWithNonMaximaSuppressionLimit::configure(const ITensor *scores_in, co _scores_out_f32.allocator()->init(scores_out->info()->clone()->set_data_type(DataType::F32)); _boxes_out_f32.allocator()->init(boxes_out->info()->clone()->set_data_type(DataType::F32)); _classes_f32.allocator()->init(classes->info()->clone()->set_data_type(DataType::F32)); - if(batch_splits_out != nullptr) + if (batch_splits_out != nullptr) { _memory_group.manage(&_batch_splits_out_f32); _batch_splits_out_f32.allocator()->init(batch_splits_out->info()->clone()->set_data_type(DataType::F32)); } - if(keeps != nullptr) + if (keeps != nullptr) { _memory_group.manage(&_keeps_f32); _keeps_f32.allocator()->init(keeps->info()->clone()->set_data_type(DataType::F32)); } - _box_with_nms_limit_kernel.configure(&_scores_in_f32, &_boxes_in_f32, (batch_splits_in != nullptr) ? &_batch_splits_in_f32 : nullptr, + _box_with_nms_limit_kernel.configure(&_scores_in_f32, &_boxes_in_f32, + (batch_splits_in != nullptr) ? &_batch_splits_in_f32 : nullptr, &_scores_out_f32, &_boxes_out_f32, &_classes_f32, - (batch_splits_out != nullptr) ? &_batch_splits_out_f32 : nullptr, (keeps != nullptr) ? &_keeps_f32 : nullptr, - keeps_size, info); + (batch_splits_out != nullptr) ? &_batch_splits_out_f32 : nullptr, + (keeps != nullptr) ? 
&_keeps_f32 : nullptr, keeps_size, info); } else { - _box_with_nms_limit_kernel.configure(scores_in, boxes_in, batch_splits_in, scores_out, boxes_out, classes, batch_splits_out, keeps, keeps_size, info); + _box_with_nms_limit_kernel.configure(scores_in, boxes_in, batch_splits_in, scores_out, boxes_out, classes, + batch_splits_out, keeps, keeps_size, info); } - if(_is_qasymm8) + if (_is_qasymm8) { _scores_in_f32.allocator()->allocate(); _boxes_in_f32.allocator()->allocate(); - if(_batch_splits_in != nullptr) + if (_batch_splits_in != nullptr) { _batch_splits_in_f32.allocator()->allocate(); } _scores_out_f32.allocator()->allocate(); _boxes_out_f32.allocator()->allocate(); _classes_f32.allocator()->allocate(); - if(batch_splits_out != nullptr) + if (batch_splits_out != nullptr) { _batch_splits_out_f32.allocator()->allocate(); } - if(keeps != nullptr) + if (keeps != nullptr) { _keeps_f32.allocator()->allocate(); } } } -Status validate(const ITensorInfo *scores_in, const ITensorInfo *boxes_in, const ITensorInfo *batch_splits_in, const ITensorInfo *scores_out, const ITensorInfo *boxes_out, const ITensorInfo *classes, - const ITensorInfo *batch_splits_out, const ITensorInfo *keeps, const ITensorInfo *keeps_size, const BoxNMSLimitInfo info) +Status validate(const ITensorInfo *scores_in, + const ITensorInfo *boxes_in, + const ITensorInfo *batch_splits_in, + const ITensorInfo *scores_out, + const ITensorInfo *boxes_out, + const ITensorInfo *classes, + const ITensorInfo *batch_splits_out, + const ITensorInfo *keeps, + const ITensorInfo *keeps_size, + const BoxNMSLimitInfo info) { ARM_COMPUTE_UNUSED(batch_splits_in, batch_splits_out, keeps, keeps_size, info); ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(scores_in, boxes_in, scores_out, boxes_out, classes); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(scores_in, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(scores_in, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F16, DataType::F32); - const bool is_qasymm8 = scores_in->data_type() == DataType::QASYMM8 || scores_in->data_type() == DataType::QASYMM8_SIGNED; - if(is_qasymm8) + const bool is_qasymm8 = + scores_in->data_type() == DataType::QASYMM8 || scores_in->data_type() == DataType::QASYMM8_SIGNED; + if (is_qasymm8) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(boxes_in, 1, DataType::QASYMM16); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(boxes_in, boxes_out); @@ -233,11 +276,11 @@ void CPPBoxWithNonMaximaSuppressionLimit::run() // Acquire all the temporaries MemoryGroupResourceScope scope_mg(_memory_group); - if(_is_qasymm8) + if (_is_qasymm8) { dequantize_tensor(_scores_in, &_scores_in_f32); dequantize_tensor(_boxes_in, &_boxes_in_f32); - if(_batch_splits_in != nullptr) + if (_batch_splits_in != nullptr) { dequantize_tensor(_batch_splits_in, &_batch_splits_in_f32); } @@ -245,16 +288,16 @@ void CPPBoxWithNonMaximaSuppressionLimit::run() Scheduler::get().schedule(&_box_with_nms_limit_kernel, Window::DimY); - if(_is_qasymm8) + if (_is_qasymm8) { quantize_tensor(&_scores_out_f32, _scores_out); quantize_tensor(&_boxes_out_f32, _boxes_out); quantize_tensor(&_classes_f32, _classes); - if(_batch_splits_out != nullptr) + if (_batch_splits_out != nullptr) { quantize_tensor(&_batch_splits_out_f32, _batch_splits_out); } - if(_keeps != nullptr) + if (_keeps != nullptr) { quantize_tensor(&_keeps_f32, _keeps); } diff --git a/src/runtime/CPP/functions/CPPDetectionOutputLayer.cpp 
b/src/runtime/CPP/functions/CPPDetectionOutputLayer.cpp
index 4ec0ab6c1a..e6291f973e 100644
--- a/src/runtime/CPP/functions/CPPDetectionOutputLayer.cpp
+++ b/src/runtime/CPP/functions/CPPDetectionOutputLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -27,31 +27,44 @@
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/Validate.h"

+#include "src/common/utils/Log.h"
+#include "src/core/helpers/AutoConfiguration.h"
+
 #include <list>

 namespace arm_compute
 {
 namespace
 {
-Status validate_arguments(const ITensorInfo *input_loc, const ITensorInfo *input_conf, const ITensorInfo *input_priorbox, const ITensorInfo *output, DetectionOutputLayerInfo info)
+Status validate_arguments(const ITensorInfo *input_loc,
+                          const ITensorInfo *input_conf,
+                          const ITensorInfo *input_priorbox,
+                          const ITensorInfo *output,
+                          DetectionOutputLayerInfo info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input_loc, input_conf, input_priorbox, output);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_loc, 1, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_loc, input_conf, input_priorbox);
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_loc->num_dimensions() > 2, "The location input tensor should be [C1, N].");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_conf->num_dimensions() > 2, "The location input tensor should be [C2, N].");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_priorbox->num_dimensions() > 3, "The priorbox input tensor should be [C3, 2, N].");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_priorbox->num_dimensions() > 3,
+                                    "The priorbox input tensor should be [C3, 2, N].");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.eta() <= 0.f && info.eta() > 1.f, "Eta should be between 0 and 1");
     const int num_priors = input_priorbox->tensor_shape()[0] / 4;
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(static_cast<size_t>((num_priors * info.num_loc_classes() * 4)) != input_loc->tensor_shape()[0], "Number of priors must match number of location predictions.");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(static_cast<size_t>((num_priors * info.num_classes())) != input_conf->tensor_shape()[0], "Number of priors must match number of confidence predictions.");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(static_cast<size_t>((num_priors * info.num_loc_classes() * 4)) !=
+                                        input_loc->tensor_shape()[0],
+                                    "Number of priors must match number of location predictions.");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(static_cast<size_t>((num_priors * info.num_classes())) !=
+                                        input_conf->tensor_shape()[0],
+                                    "Number of priors must match number of confidence predictions.");
     // Validate configured output
-    if(output->total_size() != 0)
+    if (output->total_size() != 0)
     {
-        const unsigned int max_size = info.keep_top_k() * (input_loc->num_dimensions() > 1 ? input_loc->dimension(1) : 1);
+        const unsigned int max_size =
+            info.keep_top_k() * (input_loc->num_dimensions() > 1 ? input_loc->dimension(1) : 1);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), TensorShape(7U, max_size));
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_loc, output);
     }
@@ -62,8 +75,7 @@ Status validate_arguments(const ITensorInfo *input_loc, const ITensorInfo *input
 /** Function used to sort pair<float, T> in descending order based on the score (first) value.
*/ template <typename T> -bool SortScorePairDescend(const std::pair<float, T> &pair1, - const std::pair<float, T> &pair2) +bool SortScorePairDescend(const std::pair<float, T> &pair1, const std::pair<float, T> &pair2) { return pair1.first > pair2.first; } @@ -79,16 +91,19 @@ bool SortScorePairDescend(const std::pair<float, T> &pair1, * @param[out] all_location_predictions All the location predictions. * */ -void retrieve_all_loc_predictions(const ITensor *input_loc, const int num, - const int num_priors, const int num_loc_classes, - const bool share_location, std::vector<LabelBBox> &all_location_predictions) +void retrieve_all_loc_predictions(const ITensor *input_loc, + const int num, + const int num_priors, + const int num_loc_classes, + const bool share_location, + std::vector<LabelBBox> &all_location_predictions) { - for(int i = 0; i < num; ++i) + for (int i = 0; i < num; ++i) { - for(int c = 0; c < num_loc_classes; ++c) + for (int c = 0; c < num_loc_classes; ++c) { int label = share_location ? -1 : c; - if(all_location_predictions[i].find(label) == all_location_predictions[i].end()) + if (all_location_predictions[i].find(label) == all_location_predictions[i].end()) { all_location_predictions[i][label].resize(num_priors); } @@ -99,19 +114,23 @@ void retrieve_all_loc_predictions(const ITensor *input_loc, const int num, } } } - for(int i = 0; i < num; ++i) + for (int i = 0; i < num; ++i) { - for(int p = 0; p < num_priors; ++p) + for (int p = 0; p < num_priors; ++p) { - for(int c = 0; c < num_loc_classes; ++c) + for (int c = 0; c < num_loc_classes; ++c) { const int label = share_location ? -1 : c; const int base_ptr = i * num_priors * num_loc_classes * 4 + p * num_loc_classes * 4 + c * 4; //xmin, ymin, xmax, ymax - all_location_predictions[i][label][p][0] = *reinterpret_cast<float *>(input_loc->ptr_to_element(Coordinates(base_ptr))); - all_location_predictions[i][label][p][1] = *reinterpret_cast<float *>(input_loc->ptr_to_element(Coordinates(base_ptr + 1))); - all_location_predictions[i][label][p][2] = *reinterpret_cast<float *>(input_loc->ptr_to_element(Coordinates(base_ptr + 2))); - all_location_predictions[i][label][p][3] = *reinterpret_cast<float *>(input_loc->ptr_to_element(Coordinates(base_ptr + 3))); + all_location_predictions[i][label][p][0] = + *reinterpret_cast<float *>(input_loc->ptr_to_element(Coordinates(base_ptr))); + all_location_predictions[i][label][p][1] = + *reinterpret_cast<float *>(input_loc->ptr_to_element(Coordinates(base_ptr + 1))); + all_location_predictions[i][label][p][2] = + *reinterpret_cast<float *>(input_loc->ptr_to_element(Coordinates(base_ptr + 2))); + all_location_predictions[i][label][p][3] = + *reinterpret_cast<float *>(input_loc->ptr_to_element(Coordinates(base_ptr + 3))); } } } @@ -127,26 +146,28 @@ void retrieve_all_loc_predictions(const ITensor *input_loc, const int num, * @param[out] all_location_predictions All the location predictions. 
* */ -void retrieve_all_conf_scores(const ITensor *input_conf, const int num, - const int num_priors, const int num_classes, +void retrieve_all_conf_scores(const ITensor *input_conf, + const int num, + const int num_priors, + const int num_classes, std::vector<std::map<int, std::vector<float>>> &all_confidence_scores) { std::vector<float> tmp_buffer; tmp_buffer.resize(num * num_priors * num_classes); - for(int i = 0; i < num; ++i) + for (int i = 0; i < num; ++i) { - for(int c = 0; c < num_classes; ++c) + for (int c = 0; c < num_classes; ++c) { - for(int p = 0; p < num_priors; ++p) + for (int p = 0; p < num_priors; ++p) { - tmp_buffer[i * num_classes * num_priors + c * num_priors + p] = - *reinterpret_cast<float *>(input_conf->ptr_to_element(Coordinates(i * num_classes * num_priors + p * num_classes + c))); + tmp_buffer[i * num_classes * num_priors + c * num_priors + p] = *reinterpret_cast<float *>( + input_conf->ptr_to_element(Coordinates(i * num_classes * num_priors + p * num_classes + c))); } } } - for(int i = 0; i < num; ++i) + for (int i = 0; i < num; ++i) { - for(int c = 0; c < num_classes; ++c) + for (int c = 0; c < num_classes; ++c) { all_confidence_scores[i][c].resize(num_priors); all_confidence_scores[i][c].assign(&tmp_buffer[i * num_classes * num_priors + c * num_priors], @@ -165,28 +186,23 @@ void retrieve_all_conf_scores(const ITensor *input_conf, const int num, * @param[out] all_location_predictions All the location predictions. * */ -void retrieve_all_priorbox(const ITensor *input_priorbox, - const int num_priors, - std::vector<BBox> &all_prior_bboxes, +void retrieve_all_priorbox(const ITensor *input_priorbox, + const int num_priors, + std::vector<BBox> &all_prior_bboxes, std::vector<std::array<float, 4>> &all_prior_variances) { - for(int i = 0; i < num_priors; ++i) + for (int i = 0; i < num_priors; ++i) { - all_prior_bboxes[i] = - { - { - *reinterpret_cast<float *>(input_priorbox->ptr_to_element(Coordinates(i * 4))), - *reinterpret_cast<float *>(input_priorbox->ptr_to_element(Coordinates(i * 4 + 1))), - *reinterpret_cast<float *>(input_priorbox->ptr_to_element(Coordinates(i * 4 + 2))), - *reinterpret_cast<float *>(input_priorbox->ptr_to_element(Coordinates(i * 4 + 3))) - } - }; + all_prior_bboxes[i] = {{*reinterpret_cast<float *>(input_priorbox->ptr_to_element(Coordinates(i * 4))), + *reinterpret_cast<float *>(input_priorbox->ptr_to_element(Coordinates(i * 4 + 1))), + *reinterpret_cast<float *>(input_priorbox->ptr_to_element(Coordinates(i * 4 + 2))), + *reinterpret_cast<float *>(input_priorbox->ptr_to_element(Coordinates(i * 4 + 3)))}}; } - std::array<float, 4> var({ { 0, 0, 0, 0 } }); - for(int i = 0; i < num_priors; ++i) + std::array<float, 4> var({{0, 0, 0, 0}}); + for (int i = 0; i < num_priors; ++i) { - for(int j = 0; j < 4; ++j) + for (int j = 0; j < 4; ++j) { var[j] = *reinterpret_cast<float *>(input_priorbox->ptr_to_element(Coordinates((num_priors + i) * 4 + j))); } @@ -205,13 +221,17 @@ void retrieve_all_priorbox(const ITensor *input_priorbox, * @param[out] decode_bbox The decoded bboxes. 
* */ -void DecodeBBox(const BBox &prior_bbox, const std::array<float, 4> &prior_variance, - const DetectionOutputLayerCodeType code_type, const bool variance_encoded_in_target, - const bool clip_bbox, const BBox &bbox, BBox &decode_bbox) +void DecodeBBox(const BBox &prior_bbox, + const std::array<float, 4> &prior_variance, + const DetectionOutputLayerCodeType code_type, + const bool variance_encoded_in_target, + const bool clip_bbox, + const BBox &bbox, + BBox &decode_bbox) { // if the variance is encoded in target, we simply need to add the offset predictions // otherwise we need to scale the offset accordingly. - switch(code_type) + switch (code_type) { case DetectionOutputLayerCodeType::CORNER: { @@ -234,10 +254,14 @@ void DecodeBBox(const BBox &prior_bbox, const std::array<float, 4> &prior_varian const float prior_center_x = (prior_bbox[0] + prior_bbox[2]) / 2.; const float prior_center_y = (prior_bbox[1] + prior_bbox[3]) / 2.; - const float decode_bbox_center_x = (variance_encoded_in_target ? bbox[0] : prior_variance[0] * bbox[0]) * prior_width + prior_center_x; - const float decode_bbox_center_y = (variance_encoded_in_target ? bbox[1] : prior_variance[1] * bbox[1]) * prior_height + prior_center_y; - const float decode_bbox_width = (variance_encoded_in_target ? std::exp(bbox[2]) : std::exp(prior_variance[2] * bbox[2])) * prior_width; - const float decode_bbox_height = (variance_encoded_in_target ? std::exp(bbox[3]) : std::exp(prior_variance[3] * bbox[3])) * prior_height; + const float decode_bbox_center_x = + (variance_encoded_in_target ? bbox[0] : prior_variance[0] * bbox[0]) * prior_width + prior_center_x; + const float decode_bbox_center_y = + (variance_encoded_in_target ? bbox[1] : prior_variance[1] * bbox[1]) * prior_height + prior_center_y; + const float decode_bbox_width = + (variance_encoded_in_target ? std::exp(bbox[2]) : std::exp(prior_variance[2] * bbox[2])) * prior_width; + const float decode_bbox_height = + (variance_encoded_in_target ? std::exp(bbox[3]) : std::exp(prior_variance[3] * bbox[3])) * prior_height; decode_bbox[0] = (decode_bbox_center_x - decode_bbox_width / 2.f); decode_bbox[1] = (decode_bbox_center_y - decode_bbox_height / 2.f); @@ -255,10 +279,14 @@ void DecodeBBox(const BBox &prior_bbox, const std::array<float, 4> &prior_varian ARM_COMPUTE_ERROR_ON(prior_width <= 0.f); ARM_COMPUTE_ERROR_ON(prior_height <= 0.f); - decode_bbox[0] = prior_bbox[0] + (variance_encoded_in_target ? bbox[0] : prior_variance[0] * bbox[0]) * prior_width; - decode_bbox[1] = prior_bbox[1] + (variance_encoded_in_target ? bbox[1] : prior_variance[1] * bbox[1]) * prior_height; - decode_bbox[2] = prior_bbox[2] + (variance_encoded_in_target ? bbox[2] : prior_variance[2] * bbox[2]) * prior_width; - decode_bbox[3] = prior_bbox[3] + (variance_encoded_in_target ? bbox[3] : prior_variance[3] * bbox[3]) * prior_height; + decode_bbox[0] = + prior_bbox[0] + (variance_encoded_in_target ? bbox[0] : prior_variance[0] * bbox[0]) * prior_width; + decode_bbox[1] = + prior_bbox[1] + (variance_encoded_in_target ? bbox[1] : prior_variance[1] * bbox[1]) * prior_height; + decode_bbox[2] = + prior_bbox[2] + (variance_encoded_in_target ? bbox[2] : prior_variance[2] * bbox[2]) * prior_width; + decode_bbox[3] = + prior_bbox[3] + (variance_encoded_in_target ? 
bbox[3] : prior_variance[3] * bbox[3]) * prior_height; break; } @@ -266,9 +294,9 @@ void DecodeBBox(const BBox &prior_bbox, const std::array<float, 4> &prior_varian ARM_COMPUTE_ERROR("Unsupported Detection Output Code Type."); } - if(clip_bbox) + if (clip_bbox) { - for(auto &d_bbox : decode_bbox) + for (auto &d_bbox : decode_bbox) { d_bbox = utility::clamp(d_bbox, 0.f, 1.f); } @@ -286,10 +314,13 @@ void DecodeBBox(const BBox &prior_bbox, const std::array<float, 4> &prior_varian * @param[out] indices The kept indices of bboxes after nms. * */ -void ApplyNMSFast(const std::vector<BBox> &bboxes, - const std::vector<float> &scores, const float score_threshold, - const float nms_threshold, const float eta, const int top_k, - std::vector<int> &indices) +void ApplyNMSFast(const std::vector<BBox> &bboxes, + const std::vector<float> &scores, + const float score_threshold, + const float nms_threshold, + const float eta, + const int top_k, + std::vector<int> &indices) { ARM_COMPUTE_ERROR_ON_MSG(bboxes.size() != scores.size(), "bboxes and scores have different size."); @@ -297,9 +328,9 @@ void ApplyNMSFast(const std::vector<BBox> &bboxes, std::list<std::pair<float, int>> score_index_vec; // Generate index score pairs. - for(size_t i = 0; i < scores.size(); ++i) + for (size_t i = 0; i < scores.size(); ++i) { - if(scores[i] > score_threshold) + if (scores[i] > score_threshold) { score_index_vec.emplace_back(std::make_pair(scores[i], i)); } @@ -310,7 +341,7 @@ void ApplyNMSFast(const std::vector<BBox> &bboxes, // Keep top_k scores if needed. const int score_index_vec_size = score_index_vec.size(); - if(top_k > -1 && top_k < score_index_vec_size) + if (top_k > -1 && top_k < score_index_vec_size) { score_index_vec.resize(top_k); } @@ -319,46 +350,45 @@ void ApplyNMSFast(const std::vector<BBox> &bboxes, float adaptive_threshold = nms_threshold; indices.clear(); - while(!score_index_vec.empty()) + while (!score_index_vec.empty()) { const int idx = score_index_vec.front().second; bool keep = true; - for(int kept_idx : indices) + for (int kept_idx : indices) { - if(keep) + if (keep) { // Compute the jaccard (intersection over union IoU) overlap between two bboxes. 
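The hunk above only reflows ApplyNMSFast's jaccard (IoU) computation; the behaviour is unchanged, and the intersection arithmetic it describes continues just below. As a compact reference for what that code computes, here is a standalone sketch under the same conventions, where BBox is std::array<float, 4> in [xmin, ymin, xmax, ymax] order as elsewhere in this diff; the helper name jaccard_overlap is illustrative and not part of the library:

#include <algorithm>
#include <array>

using BBox = std::array<float, 4>; // [xmin, ymin, xmax, ymax]

// Intersection-over-union of two corner-encoded boxes, mirroring the
// intersect/size arithmetic in the hunk that continues below.
float jaccard_overlap(const BBox &a, const BBox &b)
{
    const float iw = std::min(a[2], b[2]) - std::max(a[0], b[0]); // intersection width
    const float ih = std::min(a[3], b[3]) - std::max(a[1], b[1]); // intersection height
    if (iw <= 0.f || ih <= 0.f)
    {
        return 0.f; // disjoint boxes do not overlap
    }
    const float inter  = iw * ih;
    const float size_a = (a[2] - a[0]) * (a[3] - a[1]);
    const float size_b = (b[2] - b[0]) * (b[3] - b[1]);
    return inter / (size_a + size_b - inter);
}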
- BBox intersect_bbox = std::array<float, 4>({ 0, 0, 0, 0 }); - if(bboxes[kept_idx][0] > bboxes[idx][2] || bboxes[kept_idx][2] < bboxes[idx][0] || bboxes[kept_idx][1] > bboxes[idx][3] || bboxes[kept_idx][3] < bboxes[idx][1]) + BBox intersect_bbox = std::array<float, 4>({0, 0, 0, 0}); + if (bboxes[kept_idx][0] > bboxes[idx][2] || bboxes[kept_idx][2] < bboxes[idx][0] || + bboxes[kept_idx][1] > bboxes[idx][3] || bboxes[kept_idx][3] < bboxes[idx][1]) { - intersect_bbox = std::array<float, 4>({ { 0, 0, 0, 0 } }); + intersect_bbox = std::array<float, 4>({{0, 0, 0, 0}}); } else { - intersect_bbox = std::array<float, 4>({ { - std::max(bboxes[idx][0], bboxes[kept_idx][0]), - std::max(bboxes[idx][1], bboxes[kept_idx][1]), - std::min(bboxes[idx][2], bboxes[kept_idx][2]), - std::min(bboxes[idx][3], bboxes[kept_idx][3]) - } - }); + intersect_bbox = std::array<float, 4>( + {{std::max(bboxes[idx][0], bboxes[kept_idx][0]), std::max(bboxes[idx][1], bboxes[kept_idx][1]), + std::min(bboxes[idx][2], bboxes[kept_idx][2]), + std::min(bboxes[idx][3], bboxes[kept_idx][3])}}); } float intersect_width = intersect_bbox[2] - intersect_bbox[0]; float intersect_height = intersect_bbox[3] - intersect_bbox[1]; float overlap = 0.f; - if(intersect_width > 0 && intersect_height > 0) + if (intersect_width > 0 && intersect_height > 0) { float intersect_size = intersect_width * intersect_height; - float bbox1_size = (bboxes[idx][2] < bboxes[idx][0] - || bboxes[idx][3] < bboxes[idx][1]) ? - 0.f : - (bboxes[idx][2] - bboxes[idx][0]) * (bboxes[idx][3] - bboxes[idx][1]); //BBoxSize(bboxes[idx]); - float bbox2_size = (bboxes[kept_idx][2] < bboxes[kept_idx][0] - || bboxes[kept_idx][3] < bboxes[kept_idx][1]) ? - 0.f : - (bboxes[kept_idx][2] - bboxes[kept_idx][0]) * (bboxes[kept_idx][3] - bboxes[kept_idx][1]); // BBoxSize(bboxes[kept_idx]); + float bbox1_size = (bboxes[idx][2] < bboxes[idx][0] || bboxes[idx][3] < bboxes[idx][1]) + ? 0.f + : (bboxes[idx][2] - bboxes[idx][0]) * + (bboxes[idx][3] - bboxes[idx][1]); //BBoxSize(bboxes[idx]); + float bbox2_size = + (bboxes[kept_idx][2] < bboxes[kept_idx][0] || bboxes[kept_idx][3] < bboxes[kept_idx][1]) + ? 
0.f + : (bboxes[kept_idx][2] - bboxes[kept_idx][0]) * + (bboxes[kept_idx][3] - bboxes[kept_idx][1]); // BBoxSize(bboxes[kept_idx]); overlap = intersect_size / (bbox1_size + bbox2_size - intersect_size); } keep = (overlap <= adaptive_threshold); @@ -368,12 +398,12 @@ void ApplyNMSFast(const std::vector<BBox> &bboxes, break; } } - if(keep) + if (keep) { indices.push_back(idx); } score_index_vec.erase(score_index_vec.begin()); - if(keep && eta < 1.f && adaptive_threshold > 0.5f) + if (keep && eta < 1.f && adaptive_threshold > 0.5f) { adaptive_threshold *= eta; } @@ -382,23 +412,42 @@ void ApplyNMSFast(const std::vector<BBox> &bboxes, } // namespace CPPDetectionOutputLayer::CPPDetectionOutputLayer() - : _input_loc(nullptr), _input_conf(nullptr), _input_priorbox(nullptr), _output(nullptr), _info(), _num_priors(), _num(), _all_location_predictions(), _all_confidence_scores(), _all_prior_bboxes(), - _all_prior_variances(), _all_decode_bboxes(), _all_indices() + : _input_loc(nullptr), + _input_conf(nullptr), + _input_priorbox(nullptr), + _output(nullptr), + _info(), + _num_priors(), + _num(), + _all_location_predictions(), + _all_confidence_scores(), + _all_prior_bboxes(), + _all_prior_variances(), + _all_decode_bboxes(), + _all_indices() { } -void CPPDetectionOutputLayer::configure(const ITensor *input_loc, const ITensor *input_conf, const ITensor *input_priorbox, ITensor *output, DetectionOutputLayerInfo info) +void CPPDetectionOutputLayer::configure(const ITensor *input_loc, + const ITensor *input_conf, + const ITensor *input_priorbox, + ITensor *output, + DetectionOutputLayerInfo info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input_loc, input_conf, input_priorbox, output); + ARM_COMPUTE_LOG_PARAMS(input_loc, input_conf, input_priorbox, output, info); + // Output auto initialization if not yet initialized // Since the number of bboxes to kept is unknown before nms, the shape is set to the maximum // The maximum is keep_top_k * input_loc_size[1] // Each row is a 7 dimension std::vector, which stores [image_id, label, confidence, xmin, ymin, xmax, ymax] - const unsigned int max_size = info.keep_top_k() * (input_loc->info()->num_dimensions() > 1 ? input_loc->info()->dimension(1) : 1); + const unsigned int max_size = + info.keep_top_k() * (input_loc->info()->num_dimensions() > 1 ? input_loc->info()->dimension(1) : 1); auto_init_if_empty(*output->info(), input_loc->info()->clone()->set_tensor_shape(TensorShape(7U, max_size))); // Perform validation step - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input_loc->info(), input_conf->info(), input_priorbox->info(), output->info(), info)); + ARM_COMPUTE_ERROR_THROW_ON( + validate_arguments(input_loc->info(), input_conf->info(), input_priorbox->info(), output->info(), info)); _input_loc = input_loc; _input_conf = input_conf; @@ -414,12 +463,12 @@ void CPPDetectionOutputLayer::configure(const ITensor *input_loc, const ITensor _all_prior_variances.resize(_num_priors); _all_decode_bboxes.resize(_num); - for(int i = 0; i < _num; ++i) + for (int i = 0; i < _num; ++i) { - for(int c = 0; c < _info.num_loc_classes(); ++c) + for (int c = 0; c < _info.num_loc_classes(); ++c) { const int label = _info.share_location() ? -1 : c; - if(label == _info.background_label_id()) + if (label == _info.background_label_id()) { // Ignore background class. 
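With the overlap test in hand, the control flow of ApplyNMSFast, whose reformatted body ends above, is a greedy keep-loop over score-sorted candidates in which the suppression threshold is decayed by eta once boxes start being kept. A minimal sketch of that loop, reusing the BBox alias and jaccard_overlap helper sketched earlier; nms_keep and its parameters are illustrative:

#include <list>
#include <utility>
#include <vector>

// candidates holds (score, box index) pairs already sorted by descending
// score, as SortScorePairDescend produces in the code above.
std::vector<int> nms_keep(const std::vector<BBox> &boxes,
                          std::list<std::pair<float, int>> candidates,
                          float nms_threshold,
                          float eta)
{
    std::vector<int> kept;
    float adaptive_threshold = nms_threshold;
    while (!candidates.empty())
    {
        const int idx  = candidates.front().second;
        bool      keep = true;
        for (int kept_idx : kept)
        {
            if (jaccard_overlap(boxes[idx], boxes[kept_idx]) > adaptive_threshold)
            {
                keep = false; // suppressed by a higher-scoring box already kept
                break;
            }
        }
        if (keep)
        {
            kept.push_back(idx);
            if (eta < 1.f && adaptive_threshold > 0.5f)
            {
                adaptive_threshold *= eta; // suppression becomes stricter over time
            }
        }
        candidates.pop_front();
    }
    return kept;
}

The eta decay matches the `if (keep && eta < 1.f && adaptive_threshold > 0.5f)` branch in the diff; with eta == 1 the threshold stays fixed and this reduces to plain greedy NMS.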
continue; @@ -434,7 +483,11 @@ void CPPDetectionOutputLayer::configure(const ITensor *input_loc, const ITensor output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape())); } -Status CPPDetectionOutputLayer::validate(const ITensorInfo *input_loc, const ITensorInfo *input_conf, const ITensorInfo *input_priorbox, const ITensorInfo *output, DetectionOutputLayerInfo info) +Status CPPDetectionOutputLayer::validate(const ITensorInfo *input_loc, + const ITensorInfo *input_conf, + const ITensorInfo *input_priorbox, + const ITensorInfo *output, + DetectionOutputLayerInfo info) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input_loc, input_conf, input_priorbox, output, info)); return Status{}; @@ -443,7 +496,8 @@ Status CPPDetectionOutputLayer::validate(const ITensorInfo *input_loc, const ITe void CPPDetectionOutputLayer::run() { // Retrieve all location predictions. - retrieve_all_loc_predictions(_input_loc, _num, _num_priors, _info.num_loc_classes(), _info.share_location(), _all_location_predictions); + retrieve_all_loc_predictions(_input_loc, _num, _num_priors, _info.num_loc_classes(), _info.share_location(), + _all_location_predictions); // Retrieve all confidences. retrieve_all_conf_scores(_input_conf, _num, _num_priors, _info.num_classes(), _all_confidence_scores); @@ -453,75 +507,79 @@ void CPPDetectionOutputLayer::run() // Decode all loc predictions to bboxes const bool clip_bbox = false; - for(int i = 0; i < _num; ++i) + for (int i = 0; i < _num; ++i) { - for(int c = 0; c < _info.num_loc_classes(); ++c) + for (int c = 0; c < _info.num_loc_classes(); ++c) { const int label = _info.share_location() ? -1 : c; - if(label == _info.background_label_id()) + if (label == _info.background_label_id()) { // Ignore background class. continue; } - ARM_COMPUTE_ERROR_ON_MSG_VAR(_all_location_predictions[i].find(label) == _all_location_predictions[i].end(), "Could not find location predictions for label %d.", label); + ARM_COMPUTE_ERROR_ON_MSG_VAR(_all_location_predictions[i].find(label) == _all_location_predictions[i].end(), + "Could not find location predictions for label %d.", label); const std::vector<BBox> &label_loc_preds = _all_location_predictions[i].find(label)->second; const int num_bboxes = _all_prior_bboxes.size(); ARM_COMPUTE_ERROR_ON(_all_prior_variances[i].size() != 4); - for(int j = 0; j < num_bboxes; ++j) + for (int j = 0; j < num_bboxes; ++j) { - DecodeBBox(_all_prior_bboxes[j], _all_prior_variances[j], _info.code_type(), _info.variance_encoded_in_target(), clip_bbox, label_loc_preds[j], _all_decode_bboxes[i][label][j]); + DecodeBBox(_all_prior_bboxes[j], _all_prior_variances[j], _info.code_type(), + _info.variance_encoded_in_target(), clip_bbox, label_loc_preds[j], + _all_decode_bboxes[i][label][j]); } } } int num_kept = 0; - for(int i = 0; i < _num; ++i) + for (int i = 0; i < _num; ++i) { - const LabelBBox &decode_bboxes = _all_decode_bboxes[i]; - const std::map<int, std::vector<float>> &conf_scores = _all_confidence_scores[i]; + const LabelBBox &decode_bboxes = _all_decode_bboxes[i]; + const std::map<int, std::vector<float>> &conf_scores = _all_confidence_scores[i]; std::map<int, std::vector<int>> indices; - int num_det = 0; - for(int c = 0; c < _info.num_classes(); ++c) + int num_det = 0; + for (int c = 0; c < _info.num_classes(); ++c) { - if(c == _info.background_label_id()) + if (c == _info.background_label_id()) { // Ignore background class continue; } const int label = _info.share_location() ? 
-1 : c; - if(conf_scores.find(c) == conf_scores.end() || decode_bboxes.find(label) == decode_bboxes.end()) + if (conf_scores.find(c) == conf_scores.end() || decode_bboxes.find(label) == decode_bboxes.end()) { ARM_COMPUTE_ERROR_VAR("Could not find predictions for label %d.", label); } const std::vector<float> &scores = conf_scores.find(c)->second; - const std::vector<BBox> &bboxes = decode_bboxes.find(label)->second; + const std::vector<BBox> &bboxes = decode_bboxes.find(label)->second; - ApplyNMSFast(bboxes, scores, _info.confidence_threshold(), _info.nms_threshold(), _info.eta(), _info.top_k(), indices[c]); + ApplyNMSFast(bboxes, scores, _info.confidence_threshold(), _info.nms_threshold(), _info.eta(), + _info.top_k(), indices[c]); num_det += indices[c].size(); } int num_to_add = 0; - if(_info.keep_top_k() > -1 && num_det > _info.keep_top_k()) + if (_info.keep_top_k() > -1 && num_det > _info.keep_top_k()) { std::vector<std::pair<float, std::pair<int, int>>> score_index_pairs; - for(auto const &it : indices) + for (auto const &it : indices) { const int label = it.first; const std::vector<int> &label_indices = it.second; - if(conf_scores.find(label) == conf_scores.end()) + if (conf_scores.find(label) == conf_scores.end()) { ARM_COMPUTE_ERROR_VAR("Could not find predictions for label %d.", label); } const std::vector<float> &scores = conf_scores.find(label)->second; - for(auto idx : label_indices) + for (auto idx : label_indices) { ARM_COMPUTE_ERROR_ON(idx > static_cast<int>(scores.size())); score_index_pairs.emplace_back(std::make_pair(scores[idx], std::make_pair(label, idx))); @@ -535,7 +593,7 @@ void CPPDetectionOutputLayer::run() // Store the new indices. std::map<int, std::vector<int>> new_indices; - for(auto score_index_pair : score_index_pairs) + for (auto score_index_pair : score_index_pairs) { int label = score_index_pair.second.first; int idx = score_index_pair.second.second; @@ -556,25 +614,25 @@ void CPPDetectionOutputLayer::run() _output->info()->set_valid_region(ValidRegion(Coordinates(0, 0), TensorShape(7, num_kept))); int count = 0; - for(int i = 0; i < _num; ++i) + for (int i = 0; i < _num; ++i) { - const std::map<int, std::vector<float>> &conf_scores = _all_confidence_scores[i]; - const LabelBBox &decode_bboxes = _all_decode_bboxes[i]; - for(auto &it : _all_indices[i]) + const std::map<int, std::vector<float>> &conf_scores = _all_confidence_scores[i]; + const LabelBBox &decode_bboxes = _all_decode_bboxes[i]; + for (auto &it : _all_indices[i]) { const int label = it.first; const std::vector<float> &scores = conf_scores.find(label)->second; const int loc_label = _info.share_location() ? -1 : label; - if(conf_scores.find(label) == conf_scores.end() || decode_bboxes.find(loc_label) == decode_bboxes.end()) + if (conf_scores.find(label) == conf_scores.end() || decode_bboxes.find(loc_label) == decode_bboxes.end()) { // Either if there are no confidence predictions // or there are no location predictions for current label. 
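The writes rewrapped below serialize each kept detection as one seven-float row, [image_id, label, confidence, xmin, ymin, xmax, ymax], matching the layout comment in configure(). A hedged sketch of that row layout; write_detection and its signature are illustrative, with row assumed to point at one 7-element row of the output tensor:

#include <array>

using BBox = std::array<float, 4>; // [xmin, ymin, xmax, ymax]

void write_detection(float *row, int image_id, int label, float score, const BBox &box)
{
    row[0] = static_cast<float>(image_id); // index of the image in the batch
    row[1] = static_cast<float>(label);    // detected class
    row[2] = score;                        // post-NMS confidence
    row[3] = box[0];                       // xmin
    row[4] = box[1];                       // ymin
    row[5] = box[2];                       // xmax
    row[6] = box[3];                       // ymax
}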
ARM_COMPUTE_ERROR_VAR("Could not find predictions for the label %d.", label); } const std::vector<BBox> &bboxes = decode_bboxes.find(loc_label)->second; - const std::vector<int> &indices = it.second; + const std::vector<int> &indices = it.second; - for(auto idx : indices) + for (auto idx : indices) { *(reinterpret_cast<float *>(_output->ptr_to_element(Coordinates(count * 7)))) = i; *(reinterpret_cast<float *>(_output->ptr_to_element(Coordinates(count * 7 + 1)))) = label; diff --git a/src/runtime/CPP/functions/CPPDetectionPostProcessLayer.cpp b/src/runtime/CPP/functions/CPPDetectionPostProcessLayer.cpp index b3fc9c776d..2861d6cacb 100644 --- a/src/runtime/CPP/functions/CPPDetectionPostProcessLayer.cpp +++ b/src/runtime/CPP/functions/CPPDetectionPostProcessLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 ARM Limited. + * Copyright (c) 2019-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -27,6 +27,9 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/Validate.h" +#include "src/common/utils/Log.h" +#include "src/core/helpers/AutoConfiguration.h" + #include <cstddef> #include <ios> #include <list> @@ -35,53 +38,76 @@ namespace arm_compute { namespace { -Status validate_arguments(const ITensorInfo *input_box_encoding, const ITensorInfo *input_class_score, const ITensorInfo *input_anchors, - ITensorInfo *output_boxes, ITensorInfo *output_classes, ITensorInfo *output_scores, ITensorInfo *num_detection, - DetectionPostProcessLayerInfo info, const unsigned int kBatchSize, const unsigned int kNumCoordBox) +Status validate_arguments(const ITensorInfo *input_box_encoding, + const ITensorInfo *input_class_score, + const ITensorInfo *input_anchors, + ITensorInfo *output_boxes, + ITensorInfo *output_classes, + ITensorInfo *output_scores, + ITensorInfo *num_detection, + DetectionPostProcessLayerInfo info, + const unsigned int kBatchSize, + const unsigned int kNumCoordBox) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input_box_encoding, input_class_score, input_anchors); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_box_encoding, 1, DataType::F32, DataType::QASYMM8, DataType::QASYMM8_SIGNED); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_box_encoding, 1, DataType::F32, DataType::QASYMM8, + DataType::QASYMM8_SIGNED); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_box_encoding, input_anchors); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_box_encoding->num_dimensions() > 3, "The location input tensor shape should be [4, N, kBatchSize]."); - if(input_box_encoding->num_dimensions() > 2) + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_box_encoding->num_dimensions() > 3, + "The location input tensor shape should be [4, N, kBatchSize]."); + if (input_box_encoding->num_dimensions() > 2) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG_VAR(input_box_encoding->dimension(2) != kBatchSize, "The third dimension of the input box_encoding tensor should be equal to %d.", kBatchSize); + ARM_COMPUTE_RETURN_ERROR_ON_MSG_VAR( + input_box_encoding->dimension(2) != kBatchSize, + "The third dimension of the input box_encoding tensor should be equal to %d.", kBatchSize); } - ARM_COMPUTE_RETURN_ERROR_ON_MSG_VAR(input_box_encoding->dimension(0) != kNumCoordBox, "The first dimension of the input box_encoding tensor should be equal to %d.", kNumCoordBox); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_class_score->dimension(0) != (info.num_classes() + 1), - "The first dimension of the input class_prediction should be equal to the number of classes plus one."); - - 
ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_anchors->num_dimensions() > 3, "The anchors input tensor shape should be [4, N, kBatchSize]."); - if(input_anchors->num_dimensions() > 2) + ARM_COMPUTE_RETURN_ERROR_ON_MSG_VAR(input_box_encoding->dimension(0) != kNumCoordBox, + "The first dimension of the input box_encoding tensor should be equal to %d.", + kNumCoordBox); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + input_class_score->dimension(0) != (info.num_classes() + 1), + "The first dimension of the input class_prediction should be equal to the number of classes plus one."); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_anchors->num_dimensions() > 3, + "The anchors input tensor shape should be [4, N, kBatchSize]."); + if (input_anchors->num_dimensions() > 2) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG_VAR(input_anchors->dimension(0) != kNumCoordBox, "The first dimension of the input anchors tensor should be equal to %d.", kNumCoordBox); + ARM_COMPUTE_RETURN_ERROR_ON_MSG_VAR(input_anchors->dimension(0) != kNumCoordBox, + "The first dimension of the input anchors tensor should be equal to %d.", + kNumCoordBox); } - ARM_COMPUTE_RETURN_ERROR_ON_MSG((input_box_encoding->dimension(1) != input_class_score->dimension(1)) - || (input_box_encoding->dimension(1) != input_anchors->dimension(1)), + ARM_COMPUTE_RETURN_ERROR_ON_MSG((input_box_encoding->dimension(1) != input_class_score->dimension(1)) || + (input_box_encoding->dimension(1) != input_anchors->dimension(1)), "The second dimension of the inputs should be the same."); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_detection->num_dimensions() > 1, "The num_detection output tensor shape should be [M]."); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((info.iou_threshold() <= 0.0f) || (info.iou_threshold() > 1.0f), "The intersection over union should be positive and less than 1."); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.max_classes_per_detection() <= 0, "The number of max classes per detection should be positive."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_detection->num_dimensions() > 1, + "The num_detection output tensor shape should be [M]."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((info.iou_threshold() <= 0.0f) || (info.iou_threshold() > 1.0f), + "The intersection over union should be positive and less than 1."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.max_classes_per_detection() <= 0, + "The number of max classes per detection should be positive."); const unsigned int num_detected_boxes = info.max_detections() * info.max_classes_per_detection(); // Validate configured outputs - if(output_boxes->total_size() != 0) + if (output_boxes->total_size() != 0) { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output_boxes->tensor_shape(), TensorShape(4U, num_detected_boxes, 1U)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output_boxes->tensor_shape(), + TensorShape(4U, num_detected_boxes, 1U)); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_boxes, 1, DataType::F32); } - if(output_classes->total_size() != 0) + if (output_classes->total_size() != 0) { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output_classes->tensor_shape(), TensorShape(num_detected_boxes, 1U)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output_classes->tensor_shape(), + TensorShape(num_detected_boxes, 1U)); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_classes, 1, DataType::F32); } - if(output_scores->total_size() != 0) + if (output_scores->total_size() != 0) { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output_scores->tensor_shape(), TensorShape(num_detected_boxes, 1U)); 
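For orientation, the shape checks being rewrapped in this validate_arguments all derive from one quantity, max_detections * max_classes_per_detection. A small sketch of the output geometry they enforce, assuming kBatchSize == 1 and kNumCoordBox == 4 as declared in this file; the struct and function names are illustrative:

#include <array>

struct OutputShapes
{
    std::array<unsigned int, 3> boxes;         // [4, M, 1], corner coordinates per detection
    std::array<unsigned int, 2> classes;       // [M, 1]
    std::array<unsigned int, 2> scores;        // [M, 1]
    unsigned int                num_detection; // [1], count of valid detections
};

OutputShapes expected_shapes(unsigned int max_detections, unsigned int max_classes_per_detection)
{
    const unsigned int m = max_detections * max_classes_per_detection;
    return OutputShapes{{4U, m, 1U}, {m, 1U}, {m, 1U}, 1U};
}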
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output_scores->tensor_shape(),
+                                                           TensorShape(num_detected_boxes, 1U));
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_scores, 1, DataType::F32);
     }
-    if(num_detection->total_size() != 0)
+    if (num_detection->total_size() != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(num_detection->tensor_shape(), TensorShape(1U));
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(num_detection, 1, DataType::F32);
@@ -90,15 +116,18 @@ Status validate_arguments(const ITensorInfo *input_box_encoding, const ITensorIn
     return Status{};
 }

-inline void DecodeBoxCorner(BBox &box_centersize, BBox &anchor, Iterator &decoded_it, DetectionPostProcessLayerInfo info)
+inline void
+DecodeBoxCorner(BBox &box_centersize, BBox &anchor, Iterator &decoded_it, DetectionPostProcessLayerInfo info)
 {
     const float half_factor = 0.5f;

     // BBox is equivalent to CenterSizeEncoding [y,x,h,w]
     const float y_center = box_centersize[0] / info.scale_value_y() * anchor[2] + anchor[0];
     const float x_center = box_centersize[1] / info.scale_value_x() * anchor[3] + anchor[1];
-    const float half_h = half_factor * static_cast<float>(std::exp(box_centersize[2] / info.scale_value_h())) * anchor[2];
-    const float half_w = half_factor * static_cast<float>(std::exp(box_centersize[3] / info.scale_value_w())) * anchor[3];
+    const float half_h =
+        half_factor * static_cast<float>(std::exp(box_centersize[2] / info.scale_value_h())) * anchor[2];
+    const float half_w =
+        half_factor * static_cast<float>(std::exp(box_centersize[3] / info.scale_value_w())) * anchor[3];

     // Box Corner encoding boxes are saved as [xmin, ymin, xmax, ymax]
     auto decoded_ptr = reinterpret_cast<float *>(decoded_it.ptr());
@@ -115,12 +144,15 @@ inline void DecodeBoxCorner(BBox &box_centersize, BBox &anchor, Iterator &decode
 * @param[in]  info               The detection information
 * @param[out] decoded_boxes      The decoded bboxes.
*/ -void DecodeCenterSizeBoxes(const ITensor *input_box_encoding, const ITensor *input_anchors, DetectionPostProcessLayerInfo info, Tensor *decoded_boxes) +void DecodeCenterSizeBoxes(const ITensor *input_box_encoding, + const ITensor *input_anchors, + DetectionPostProcessLayerInfo info, + Tensor *decoded_boxes) { const QuantizationInfo &qi_box = input_box_encoding->info()->quantization_info(); const QuantizationInfo &qi_anchors = input_anchors->info()->quantization_info(); - BBox box_centersize{ {} }; - BBox anchor{ {} }; + BBox box_centersize{{}}; + BBox anchor{{}}; Window win; win.use_tensor_dimensions(input_box_encoding->info()->tensor_shape()); @@ -130,103 +162,155 @@ void DecodeCenterSizeBoxes(const ITensor *input_box_encoding, const ITensor *inp Iterator anchor_it(input_anchors, win); Iterator decoded_it(decoded_boxes, win); - if(input_box_encoding->info()->data_type() == DataType::QASYMM8) + if (input_box_encoding->info()->data_type() == DataType::QASYMM8) { - execute_window_loop(win, [&](const Coordinates &) - { - const auto box_ptr = reinterpret_cast<const qasymm8_t *>(box_it.ptr()); - const auto anchor_ptr = reinterpret_cast<const qasymm8_t *>(anchor_it.ptr()); - box_centersize = BBox({ dequantize_qasymm8(*box_ptr, qi_box), dequantize_qasymm8(*(box_ptr + 1), qi_box), - dequantize_qasymm8(*(2 + box_ptr), qi_box), dequantize_qasymm8(*(3 + box_ptr), qi_box) - }); - anchor = BBox({ dequantize_qasymm8(*anchor_ptr, qi_anchors), dequantize_qasymm8(*(anchor_ptr + 1), qi_anchors), - dequantize_qasymm8(*(2 + anchor_ptr), qi_anchors), dequantize_qasymm8(*(3 + anchor_ptr), qi_anchors) - }); - DecodeBoxCorner(box_centersize, anchor, decoded_it, info); - }, - box_it, anchor_it, decoded_it); + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto box_ptr = reinterpret_cast<const qasymm8_t *>(box_it.ptr()); + const auto anchor_ptr = reinterpret_cast<const qasymm8_t *>(anchor_it.ptr()); + box_centersize = + BBox({dequantize_qasymm8(*box_ptr, qi_box), dequantize_qasymm8(*(box_ptr + 1), qi_box), + dequantize_qasymm8(*(2 + box_ptr), qi_box), dequantize_qasymm8(*(3 + box_ptr), qi_box)}); + anchor = BBox({dequantize_qasymm8(*anchor_ptr, qi_anchors), + dequantize_qasymm8(*(anchor_ptr + 1), qi_anchors), + dequantize_qasymm8(*(2 + anchor_ptr), qi_anchors), + dequantize_qasymm8(*(3 + anchor_ptr), qi_anchors)}); + DecodeBoxCorner(box_centersize, anchor, decoded_it, info); + }, + box_it, anchor_it, decoded_it); } - else if(input_box_encoding->info()->data_type() == DataType::QASYMM8_SIGNED) + else if (input_box_encoding->info()->data_type() == DataType::QASYMM8_SIGNED) { - execute_window_loop(win, [&](const Coordinates &) - { - const auto box_ptr = reinterpret_cast<const qasymm8_signed_t *>(box_it.ptr()); - const auto anchor_ptr = reinterpret_cast<const qasymm8_signed_t *>(anchor_it.ptr()); - box_centersize = BBox({ dequantize_qasymm8_signed(*box_ptr, qi_box), dequantize_qasymm8_signed(*(box_ptr + 1), qi_box), - dequantize_qasymm8_signed(*(2 + box_ptr), qi_box), dequantize_qasymm8_signed(*(3 + box_ptr), qi_box) - }); - anchor = BBox({ dequantize_qasymm8_signed(*anchor_ptr, qi_anchors), dequantize_qasymm8_signed(*(anchor_ptr + 1), qi_anchors), - dequantize_qasymm8_signed(*(2 + anchor_ptr), qi_anchors), dequantize_qasymm8_signed(*(3 + anchor_ptr), qi_anchors) - }); - DecodeBoxCorner(box_centersize, anchor, decoded_it, info); - }, - box_it, anchor_it, decoded_it); + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto box_ptr = reinterpret_cast<const qasymm8_signed_t 
*>(box_it.ptr()); + const auto anchor_ptr = reinterpret_cast<const qasymm8_signed_t *>(anchor_it.ptr()); + box_centersize = BBox({dequantize_qasymm8_signed(*box_ptr, qi_box), + dequantize_qasymm8_signed(*(box_ptr + 1), qi_box), + dequantize_qasymm8_signed(*(2 + box_ptr), qi_box), + dequantize_qasymm8_signed(*(3 + box_ptr), qi_box)}); + anchor = BBox({dequantize_qasymm8_signed(*anchor_ptr, qi_anchors), + dequantize_qasymm8_signed(*(anchor_ptr + 1), qi_anchors), + dequantize_qasymm8_signed(*(2 + anchor_ptr), qi_anchors), + dequantize_qasymm8_signed(*(3 + anchor_ptr), qi_anchors)}); + DecodeBoxCorner(box_centersize, anchor, decoded_it, info); + }, + box_it, anchor_it, decoded_it); } else { - execute_window_loop(win, [&](const Coordinates &) - { - const auto box_ptr = reinterpret_cast<const float *>(box_it.ptr()); - const auto anchor_ptr = reinterpret_cast<const float *>(anchor_it.ptr()); - box_centersize = BBox({ *box_ptr, *(box_ptr + 1), *(2 + box_ptr), *(3 + box_ptr) }); - anchor = BBox({ *anchor_ptr, *(anchor_ptr + 1), *(2 + anchor_ptr), *(3 + anchor_ptr) }); - DecodeBoxCorner(box_centersize, anchor, decoded_it, info); - }, - box_it, anchor_it, decoded_it); + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto box_ptr = reinterpret_cast<const float *>(box_it.ptr()); + const auto anchor_ptr = reinterpret_cast<const float *>(anchor_it.ptr()); + box_centersize = BBox({*box_ptr, *(box_ptr + 1), *(2 + box_ptr), *(3 + box_ptr)}); + anchor = BBox({*anchor_ptr, *(anchor_ptr + 1), *(2 + anchor_ptr), *(3 + anchor_ptr)}); + DecodeBoxCorner(box_centersize, anchor, decoded_it, info); + }, + box_it, anchor_it, decoded_it); } } -void SaveOutputs(const Tensor *decoded_boxes, const std::vector<int> &result_idx_boxes_after_nms, const std::vector<float> &result_scores_after_nms, const std::vector<int> &result_classes_after_nms, - std::vector<unsigned int> &sorted_indices, const unsigned int num_output, const unsigned int max_detections, ITensor *output_boxes, ITensor *output_classes, ITensor *output_scores, - ITensor *num_detection) +void SaveOutputs(const Tensor *decoded_boxes, + const std::vector<int> &result_idx_boxes_after_nms, + const std::vector<float> &result_scores_after_nms, + const std::vector<int> &result_classes_after_nms, + std::vector<unsigned int> &sorted_indices, + const unsigned int num_output, + const unsigned int max_detections, + ITensor *output_boxes, + ITensor *output_classes, + ITensor *output_scores, + ITensor *num_detection) { // xmin,ymin,xmax,ymax -> ymin,xmin,ymax,xmax unsigned int i = 0; - for(; i < num_output; ++i) + for (; i < num_output; ++i) { const unsigned int box_in_idx = result_idx_boxes_after_nms[sorted_indices[i]]; - *(reinterpret_cast<float *>(output_boxes->ptr_to_element(Coordinates(0, i)))) = *(reinterpret_cast<float *>(decoded_boxes->ptr_to_element(Coordinates(1, box_in_idx)))); - *(reinterpret_cast<float *>(output_boxes->ptr_to_element(Coordinates(1, i)))) = *(reinterpret_cast<float *>(decoded_boxes->ptr_to_element(Coordinates(0, box_in_idx)))); - *(reinterpret_cast<float *>(output_boxes->ptr_to_element(Coordinates(2, i)))) = *(reinterpret_cast<float *>(decoded_boxes->ptr_to_element(Coordinates(3, box_in_idx)))); - *(reinterpret_cast<float *>(output_boxes->ptr_to_element(Coordinates(3, i)))) = *(reinterpret_cast<float *>(decoded_boxes->ptr_to_element(Coordinates(2, box_in_idx)))); - *(reinterpret_cast<float *>(output_classes->ptr_to_element(Coordinates(i)))) = static_cast<float>(result_classes_after_nms[sorted_indices[i]]); - 
*(reinterpret_cast<float *>(output_scores->ptr_to_element(Coordinates(i)))) = result_scores_after_nms[sorted_indices[i]]; + *(reinterpret_cast<float *>(output_boxes->ptr_to_element(Coordinates(0, i)))) = + *(reinterpret_cast<float *>(decoded_boxes->ptr_to_element(Coordinates(1, box_in_idx)))); + *(reinterpret_cast<float *>(output_boxes->ptr_to_element(Coordinates(1, i)))) = + *(reinterpret_cast<float *>(decoded_boxes->ptr_to_element(Coordinates(0, box_in_idx)))); + *(reinterpret_cast<float *>(output_boxes->ptr_to_element(Coordinates(2, i)))) = + *(reinterpret_cast<float *>(decoded_boxes->ptr_to_element(Coordinates(3, box_in_idx)))); + *(reinterpret_cast<float *>(output_boxes->ptr_to_element(Coordinates(3, i)))) = + *(reinterpret_cast<float *>(decoded_boxes->ptr_to_element(Coordinates(2, box_in_idx)))); + *(reinterpret_cast<float *>(output_classes->ptr_to_element(Coordinates(i)))) = + static_cast<float>(result_classes_after_nms[sorted_indices[i]]); + *(reinterpret_cast<float *>(output_scores->ptr_to_element(Coordinates(i)))) = + result_scores_after_nms[sorted_indices[i]]; } - for(; i < max_detections; ++i) + for (; i < max_detections; ++i) { *(reinterpret_cast<float *>(output_boxes->ptr_to_element(Coordinates(1, i)))) = 0.0f; *(reinterpret_cast<float *>(output_boxes->ptr_to_element(Coordinates(0, i)))) = 0.0f; *(reinterpret_cast<float *>(output_boxes->ptr_to_element(Coordinates(3, i)))) = 0.0f; *(reinterpret_cast<float *>(output_boxes->ptr_to_element(Coordinates(2, i)))) = 0.0f; - *(reinterpret_cast<float *>(output_classes->ptr_to_element(Coordinates(i)))) = 0.0f; - *(reinterpret_cast<float *>(output_scores->ptr_to_element(Coordinates(i)))) = 0.0f; + *(reinterpret_cast<float *>(output_classes->ptr_to_element(Coordinates(i)))) = 0.0f; + *(reinterpret_cast<float *>(output_scores->ptr_to_element(Coordinates(i)))) = 0.0f; } *(reinterpret_cast<float *>(num_detection->ptr_to_element(Coordinates(0)))) = num_output; } } // namespace CPPDetectionPostProcessLayer::CPPDetectionPostProcessLayer(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _nms(), _input_box_encoding(nullptr), _input_scores(nullptr), _input_anchors(nullptr), _output_boxes(nullptr), _output_classes(nullptr), - _output_scores(nullptr), _num_detection(nullptr), _info(), _num_boxes(), _num_classes_with_background(), _num_max_detected_boxes(), _dequantize_scores(false), _decoded_boxes(), _decoded_scores(), - _selected_indices(), _class_scores(), _input_scores_to_use(nullptr) + : _memory_group(std::move(memory_manager)), + _nms(), + _input_box_encoding(nullptr), + _input_scores(nullptr), + _input_anchors(nullptr), + _output_boxes(nullptr), + _output_classes(nullptr), + _output_scores(nullptr), + _num_detection(nullptr), + _info(), + _num_boxes(), + _num_classes_with_background(), + _num_max_detected_boxes(), + _dequantize_scores(false), + _decoded_boxes(), + _decoded_scores(), + _selected_indices(), + _class_scores(), + _input_scores_to_use(nullptr) { } -void CPPDetectionPostProcessLayer::configure(const ITensor *input_box_encoding, const ITensor *input_scores, const ITensor *input_anchors, - ITensor *output_boxes, ITensor *output_classes, ITensor *output_scores, ITensor *num_detection, DetectionPostProcessLayerInfo info) +void CPPDetectionPostProcessLayer::configure(const ITensor *input_box_encoding, + const ITensor *input_scores, + const ITensor *input_anchors, + ITensor *output_boxes, + ITensor *output_classes, + ITensor *output_scores, + ITensor *num_detection, + 
DetectionPostProcessLayerInfo info) { - ARM_COMPUTE_ERROR_ON_NULLPTR(input_box_encoding, input_scores, input_anchors, output_boxes, output_classes, output_scores); + ARM_COMPUTE_ERROR_ON_NULLPTR(input_box_encoding, input_scores, input_anchors, output_boxes, output_classes, + output_scores); + ARM_COMPUTE_LOG_PARAMS(input_box_encoding, input_scores, input_anchors, output_boxes, output_classes, output_scores, + num_detection, info); + _num_max_detected_boxes = info.max_detections() * info.max_classes_per_detection(); - auto_init_if_empty(*output_boxes->info(), TensorInfo(TensorShape(_kNumCoordBox, _num_max_detected_boxes, _kBatchSize), 1, DataType::F32)); - auto_init_if_empty(*output_classes->info(), TensorInfo(TensorShape(_num_max_detected_boxes, _kBatchSize), 1, DataType::F32)); - auto_init_if_empty(*output_scores->info(), TensorInfo(TensorShape(_num_max_detected_boxes, _kBatchSize), 1, DataType::F32)); + auto_init_if_empty(*output_boxes->info(), + TensorInfo(TensorShape(_kNumCoordBox, _num_max_detected_boxes, _kBatchSize), 1, DataType::F32)); + auto_init_if_empty(*output_classes->info(), + TensorInfo(TensorShape(_num_max_detected_boxes, _kBatchSize), 1, DataType::F32)); + auto_init_if_empty(*output_scores->info(), + TensorInfo(TensorShape(_num_max_detected_boxes, _kBatchSize), 1, DataType::F32)); auto_init_if_empty(*num_detection->info(), TensorInfo(TensorShape(1U), 1, DataType::F32)); // Perform validation step - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input_box_encoding->info(), input_scores->info(), input_anchors->info(), output_boxes->info(), output_classes->info(), output_scores->info(), - num_detection->info(), - info, _kBatchSize, _kNumCoordBox)); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments( + input_box_encoding->info(), input_scores->info(), input_anchors->info(), output_boxes->info(), + output_classes->info(), output_scores->info(), num_detection->info(), info, _kBatchSize, _kNumCoordBox)); _input_box_encoding = input_box_encoding; _input_scores = input_scores; @@ -238,13 +322,24 @@ void CPPDetectionPostProcessLayer::configure(const ITensor *input_box_encoding, _info = info; _num_boxes = input_box_encoding->info()->dimension(1); _num_classes_with_background = _input_scores->info()->dimension(0); - _dequantize_scores = (info.dequantize_scores() && is_data_type_quantized(input_box_encoding->info()->data_type())); - - auto_init_if_empty(*_decoded_boxes.info(), TensorInfo(TensorShape(_kNumCoordBox, _input_box_encoding->info()->dimension(1), _kBatchSize), 1, DataType::F32)); - auto_init_if_empty(*_decoded_scores.info(), TensorInfo(TensorShape(_input_scores->info()->dimension(0), _input_scores->info()->dimension(1), _kBatchSize), 1, DataType::F32)); - auto_init_if_empty(*_selected_indices.info(), TensorInfo(TensorShape(info.use_regular_nms() ? info.detection_per_class() : info.max_detections()), 1, DataType::S32)); + _dequantize_scores = (info.dequantize_scores() && is_data_type_quantized(input_box_encoding->info()->data_type())); + + auto_init_if_empty(*_decoded_boxes.info(), + TensorInfo(TensorShape(_kNumCoordBox, _input_box_encoding->info()->dimension(1), _kBatchSize), 1, + DataType::F32)); + auto_init_if_empty( + *_decoded_scores.info(), + TensorInfo(TensorShape(_input_scores->info()->dimension(0), _input_scores->info()->dimension(1), _kBatchSize), + 1, DataType::F32)); + auto_init_if_empty( + *_selected_indices.info(), + TensorInfo(TensorShape(info.use_regular_nms() ? 
info.detection_per_class() : info.max_detections()), 1, + DataType::S32)); const unsigned int num_classes_per_box = std::min(info.max_classes_per_detection(), info.num_classes()); - auto_init_if_empty(*_class_scores.info(), TensorInfo(info.use_regular_nms() ? TensorShape(_num_boxes) : TensorShape(_num_boxes * num_classes_per_box), 1, DataType::F32)); + auto_init_if_empty( + *_class_scores.info(), + TensorInfo(info.use_regular_nms() ? TensorShape(_num_boxes) : TensorShape(_num_boxes * num_classes_per_box), 1, + DataType::F32)); _input_scores_to_use = _dequantize_scores ? &_decoded_scores : _input_scores; @@ -253,7 +348,9 @@ void CPPDetectionPostProcessLayer::configure(const ITensor *input_box_encoding, _memory_group.manage(&_decoded_scores); _memory_group.manage(&_selected_indices); _memory_group.manage(&_class_scores); - _nms.configure(&_decoded_boxes, &_class_scores, &_selected_indices, info.use_regular_nms() ? info.detection_per_class() : info.max_detections(), info.nms_score_threshold(), info.iou_threshold()); + _nms.configure(&_decoded_boxes, &_class_scores, &_selected_indices, + info.use_regular_nms() ? info.detection_per_class() : info.max_detections(), + info.nms_score_threshold(), info.iou_threshold()); // Allocate and reserve intermediate tensors and vectors _decoded_boxes.allocator()->allocate(); @@ -262,18 +359,28 @@ void CPPDetectionPostProcessLayer::configure(const ITensor *input_box_encoding, _class_scores.allocator()->allocate(); } -Status CPPDetectionPostProcessLayer::validate(const ITensorInfo *input_box_encoding, const ITensorInfo *input_class_score, const ITensorInfo *input_anchors, - ITensorInfo *output_boxes, ITensorInfo *output_classes, ITensorInfo *output_scores, ITensorInfo *num_detection, DetectionPostProcessLayerInfo info) +Status CPPDetectionPostProcessLayer::validate(const ITensorInfo *input_box_encoding, + const ITensorInfo *input_class_score, + const ITensorInfo *input_anchors, + ITensorInfo *output_boxes, + ITensorInfo *output_classes, + ITensorInfo *output_scores, + ITensorInfo *num_detection, + DetectionPostProcessLayerInfo info) { - constexpr unsigned int kBatchSize = 1; - constexpr unsigned int kNumCoordBox = 4; - const TensorInfo _decoded_boxes_info = TensorInfo(TensorShape(kNumCoordBox, input_box_encoding->dimension(1)), 1, DataType::F32); - const TensorInfo _decoded_scores_info = TensorInfo(TensorShape(input_box_encoding->dimension(1)), 1, DataType::F32); - const TensorInfo _selected_indices_info = TensorInfo(TensorShape(info.max_detections()), 1, DataType::S32); - - ARM_COMPUTE_RETURN_ON_ERROR(CPPNonMaximumSuppression::validate(&_decoded_boxes_info, &_decoded_scores_info, &_selected_indices_info, info.max_detections(), info.nms_score_threshold(), - info.iou_threshold())); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input_box_encoding, input_class_score, input_anchors, output_boxes, output_classes, output_scores, num_detection, info, kBatchSize, kNumCoordBox)); + constexpr unsigned int kBatchSize = 1; + constexpr unsigned int kNumCoordBox = 4; + const TensorInfo _decoded_boxes_info = + TensorInfo(TensorShape(kNumCoordBox, input_box_encoding->dimension(1)), 1, DataType::F32); + const TensorInfo _decoded_scores_info = TensorInfo(TensorShape(input_box_encoding->dimension(1)), 1, DataType::F32); + const TensorInfo _selected_indices_info = TensorInfo(TensorShape(info.max_detections()), 1, DataType::S32); + + ARM_COMPUTE_RETURN_ON_ERROR(CPPNonMaximumSuppression::validate(&_decoded_boxes_info, &_decoded_scores_info, + &_selected_indices_info, 
info.max_detections(), + info.nms_score_threshold(), info.iou_threshold())); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input_box_encoding, input_class_score, input_anchors, output_boxes, + output_classes, output_scores, num_detection, info, kBatchSize, + kNumCoordBox)); return Status{}; } @@ -286,62 +393,69 @@ void CPPDetectionPostProcessLayer::run() DecodeCenterSizeBoxes(_input_box_encoding, _input_anchors, _info, &_decoded_boxes); // Decode scores if necessary - if(_dequantize_scores) + if (_dequantize_scores) { - if(_input_box_encoding->info()->data_type() == DataType::QASYMM8) + if (_input_box_encoding->info()->data_type() == DataType::QASYMM8) { - for(unsigned int idx_c = 0; idx_c < _num_classes_with_background; ++idx_c) + for (unsigned int idx_c = 0; idx_c < _num_classes_with_background; ++idx_c) { - for(unsigned int idx_b = 0; idx_b < _num_boxes; ++idx_b) + for (unsigned int idx_b = 0; idx_b < _num_boxes; ++idx_b) { *(reinterpret_cast<float *>(_decoded_scores.ptr_to_element(Coordinates(idx_c, idx_b)))) = - dequantize_qasymm8(*(reinterpret_cast<qasymm8_t *>(_input_scores->ptr_to_element(Coordinates(idx_c, idx_b)))), _input_scores->info()->quantization_info()); + dequantize_qasymm8( + *(reinterpret_cast<qasymm8_t *>(_input_scores->ptr_to_element(Coordinates(idx_c, idx_b)))), + _input_scores->info()->quantization_info()); } } } - else if(_input_box_encoding->info()->data_type() == DataType::QASYMM8_SIGNED) + else if (_input_box_encoding->info()->data_type() == DataType::QASYMM8_SIGNED) { - for(unsigned int idx_c = 0; idx_c < _num_classes_with_background; ++idx_c) + for (unsigned int idx_c = 0; idx_c < _num_classes_with_background; ++idx_c) { - for(unsigned int idx_b = 0; idx_b < _num_boxes; ++idx_b) + for (unsigned int idx_b = 0; idx_b < _num_boxes; ++idx_b) { *(reinterpret_cast<float *>(_decoded_scores.ptr_to_element(Coordinates(idx_c, idx_b)))) = - dequantize_qasymm8_signed(*(reinterpret_cast<qasymm8_signed_t *>(_input_scores->ptr_to_element(Coordinates(idx_c, idx_b)))), _input_scores->info()->quantization_info()); + dequantize_qasymm8_signed(*(reinterpret_cast<qasymm8_signed_t *>( + _input_scores->ptr_to_element(Coordinates(idx_c, idx_b)))), + _input_scores->info()->quantization_info()); } } } } // Regular NMS - if(_info.use_regular_nms()) + if (_info.use_regular_nms()) { std::vector<int> result_idx_boxes_after_nms; std::vector<int> result_classes_after_nms; std::vector<float> result_scores_after_nms; std::vector<unsigned int> sorted_indices; - for(unsigned int c = 0; c < num_classes; ++c) + for (unsigned int c = 0; c < num_classes; ++c) { // For each boxes get scores of the boxes for the class c - for(unsigned int i = 0; i < _num_boxes; ++i) + for (unsigned int i = 0; i < _num_boxes; ++i) { *(reinterpret_cast<float *>(_class_scores.ptr_to_element(Coordinates(i)))) = - *(reinterpret_cast<float *>(_input_scores_to_use->ptr_to_element(Coordinates(c + 1, i)))); // i * _num_classes_with_background + c + 1 + *(reinterpret_cast<float *>(_input_scores_to_use->ptr_to_element( + Coordinates(c + 1, i)))); // i * _num_classes_with_background + c + 1 } // Run Non-maxima Suppression _nms.run(); - for(unsigned int i = 0; i < _info.detection_per_class(); ++i) + for (unsigned int i = 0; i < _info.detection_per_class(); ++i) { - const auto selected_index = *(reinterpret_cast<int *>(_selected_indices.ptr_to_element(Coordinates(i)))); - if(selected_index == -1) + const auto selected_index = + *(reinterpret_cast<int *>(_selected_indices.ptr_to_element(Coordinates(i)))); + if (selected_index == 
-1) { // Nms will return -1 for all the last M-elements not valid break; } result_idx_boxes_after_nms.emplace_back(selected_index); - result_scores_after_nms.emplace_back((reinterpret_cast<float *>(_class_scores.buffer()))[selected_index]); + result_scores_after_nms.emplace_back( + (reinterpret_cast<float *>(_class_scores.buffer()))[selected_index]); result_classes_after_nms.emplace_back(c); } } @@ -353,49 +467,46 @@ void CPPDetectionPostProcessLayer::run() // Sort selected indices based on result scores sorted_indices.resize(num_selected); std::iota(sorted_indices.begin(), sorted_indices.end(), 0); - std::partial_sort(sorted_indices.data(), - sorted_indices.data() + num_output, + std::partial_sort(sorted_indices.data(), sorted_indices.data() + num_output, sorted_indices.data() + num_selected, [&](unsigned int first, unsigned int second) - { - - return result_scores_after_nms[first] > result_scores_after_nms[second]; - }); + { return result_scores_after_nms[first] > result_scores_after_nms[second]; }); - SaveOutputs(&_decoded_boxes, result_idx_boxes_after_nms, result_scores_after_nms, result_classes_after_nms, sorted_indices, - num_output, max_detections, _output_boxes, _output_classes, _output_scores, _num_detection); + SaveOutputs(&_decoded_boxes, result_idx_boxes_after_nms, result_scores_after_nms, result_classes_after_nms, + sorted_indices, num_output, max_detections, _output_boxes, _output_classes, _output_scores, + _num_detection); } // Fast NMS else { - const unsigned int num_classes_per_box = std::min<unsigned int>(_info.max_classes_per_detection(), _info.num_classes()); + const unsigned int num_classes_per_box = + std::min<unsigned int>(_info.max_classes_per_detection(), _info.num_classes()); std::vector<float> max_scores; std::vector<int> box_indices; std::vector<int> max_score_classes; - for(unsigned int b = 0; b < _num_boxes; ++b) + for (unsigned int b = 0; b < _num_boxes; ++b) { std::vector<float> box_scores; - for(unsigned int c = 0; c < num_classes; ++c) + for (unsigned int c = 0; c < num_classes; ++c) { - box_scores.emplace_back(*(reinterpret_cast<float *>(_input_scores_to_use->ptr_to_element(Coordinates(c + 1, b))))); + box_scores.emplace_back( + *(reinterpret_cast<float *>(_input_scores_to_use->ptr_to_element(Coordinates(c + 1, b))))); } std::vector<unsigned int> max_score_indices; max_score_indices.resize(_info.num_classes()); std::iota(max_score_indices.data(), max_score_indices.data() + _info.num_classes(), 0); - std::partial_sort(max_score_indices.data(), - max_score_indices.data() + num_classes_per_box, + std::partial_sort(max_score_indices.data(), max_score_indices.data() + num_classes_per_box, max_score_indices.data() + num_classes, [&](unsigned int first, unsigned int second) - { - return box_scores[first] > box_scores[second]; - }); + { return box_scores[first] > box_scores[second]; }); - for(unsigned int i = 0; i < num_classes_per_box; ++i) + for (unsigned int i = 0; i < num_classes_per_box; ++i) { - const float score_to_add = box_scores[max_score_indices[i]]; - *(reinterpret_cast<float *>(_class_scores.ptr_to_element(Coordinates(b * num_classes_per_box + i)))) = score_to_add; + const float score_to_add = box_scores[max_score_indices[i]]; + *(reinterpret_cast<float *>(_class_scores.ptr_to_element(Coordinates(b * num_classes_per_box + i)))) = + score_to_add; max_scores.emplace_back(score_to_add); box_indices.emplace_back(b); max_score_classes.emplace_back(max_score_indices[i]); @@ -405,10 +516,10 @@ void CPPDetectionPostProcessLayer::run() // Run Non-maxima 
Suppression _nms.run(); std::vector<unsigned int> selected_indices; - for(unsigned int i = 0; i < max_detections; ++i) + for (unsigned int i = 0; i < max_detections; ++i) { // NMS returns M valid indices, the not valid tail is filled with -1 - if(*(reinterpret_cast<int *>(_selected_indices.ptr_to_element(Coordinates(i)))) == -1) + if (*(reinterpret_cast<int *>(_selected_indices.ptr_to_element(Coordinates(i)))) == -1) { // Nms will return -1 for all the last M-elements not valid break; @@ -418,8 +529,8 @@ void CPPDetectionPostProcessLayer::run() // We select the max detection numbers of the highest score of all classes const auto num_output = std::min<unsigned int>(_info.max_detections(), selected_indices.size()); - SaveOutputs(&_decoded_boxes, box_indices, max_scores, max_score_classes, selected_indices, - num_output, max_detections, _output_boxes, _output_classes, _output_scores, _num_detection); + SaveOutputs(&_decoded_boxes, box_indices, max_scores, max_score_classes, selected_indices, num_output, + max_detections, _output_boxes, _output_classes, _output_scores, _num_detection); } } } // namespace arm_compute diff --git a/src/runtime/CPP/functions/CPPNonMaximumSuppression.cpp b/src/runtime/CPP/functions/CPPNonMaximumSuppression.cpp index 8856191996..3217742c6b 100644 --- a/src/runtime/CPP/functions/CPPNonMaximumSuppression.cpp +++ b/src/runtime/CPP/functions/CPPNonMaximumSuppression.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 ARM Limited. + * Copyright (c) 2019-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,23 +24,33 @@ #include "arm_compute/runtime/CPP/functions/CPPNonMaximumSuppression.h" #include "arm_compute/core/CPP/kernels/CPPNonMaximumSuppressionKernel.h" -#include "support/MemorySupport.h" + +#include "src/common/utils/Log.h" namespace arm_compute { -void CPPNonMaximumSuppression::configure( - const ITensor *bboxes, const ITensor *scores, ITensor *indices, unsigned int max_output_size, - const float score_threshold, const float nms_threshold) +void CPPNonMaximumSuppression::configure(const ITensor *bboxes, + const ITensor *scores, + ITensor *indices, + unsigned int max_output_size, + const float score_threshold, + const float nms_threshold) { - auto k = arm_compute::support::cpp14::make_unique<CPPNonMaximumSuppressionKernel>(); + ARM_COMPUTE_LOG_PARAMS(bboxes, scores, indices, max_output_size, score_threshold, nms_threshold); + + auto k = std::make_unique<CPPNonMaximumSuppressionKernel>(); k->configure(bboxes, scores, indices, max_output_size, score_threshold, nms_threshold); _kernel = std::move(k); } -Status CPPNonMaximumSuppression::validate( - const ITensorInfo *bboxes, const ITensorInfo *scores, const ITensorInfo *indices, unsigned int max_output_size, - const float score_threshold, const float nms_threshold) +Status CPPNonMaximumSuppression::validate(const ITensorInfo *bboxes, + const ITensorInfo *scores, + const ITensorInfo *indices, + unsigned int max_output_size, + const float score_threshold, + const float nms_threshold) { - return CPPNonMaximumSuppressionKernel::validate(bboxes, scores, indices, max_output_size, score_threshold, nms_threshold); + return CPPNonMaximumSuppressionKernel::validate(bboxes, scores, indices, max_output_size, score_threshold, + nms_threshold); } } // namespace arm_compute diff --git a/src/runtime/CPP/functions/CPPPermute.cpp b/src/runtime/CPP/functions/CPPPermute.cpp index 1cdfe92db2..83941f1dc1 100644 --- a/src/runtime/CPP/functions/CPPPermute.cpp +++ b/src/runtime/CPP/functions/CPPPermute.cpp @@ -1,5 +1,5 @@ /* - * 
Copyright (c) 2017-2020 ARM Limited. + * Copyright (c) 2017-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,13 +24,16 @@ #include "arm_compute/runtime/CPP/functions/CPPPermute.h" #include "arm_compute/core/CPP/kernels/CPPPermuteKernel.h" -#include "support/MemorySupport.h" + +#include "src/common/utils/Log.h" using namespace arm_compute; void CPPPermute::configure(const ITensor *input, ITensor *output, const PermutationVector &perm) { - auto k = arm_compute::support::cpp14::make_unique<CPPPermuteKernel>(); + ARM_COMPUTE_LOG_PARAMS(input, output, perm); + + auto k = std::make_unique<CPPPermuteKernel>(); k->configure(input, output, perm); _kernel = std::move(k); } diff --git a/src/runtime/CPP/functions/CPPTopKV.cpp b/src/runtime/CPP/functions/CPPTopKV.cpp index eb0d560bdf..3d64def804 100644 --- a/src/runtime/CPP/functions/CPPTopKV.cpp +++ b/src/runtime/CPP/functions/CPPTopKV.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 ARM Limited. + * Copyright (c) 2019-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,18 +24,24 @@ #include "arm_compute/runtime/CPP/functions/CPPTopKV.h" #include "arm_compute/core/CPP/kernels/CPPTopKVKernel.h" -#include "support/MemorySupport.h" + +#include "src/common/utils/Log.h" namespace arm_compute { void CPPTopKV::configure(const ITensor *predictions, const ITensor *targets, ITensor *output, const unsigned int k) { - auto kernel = arm_compute::support::cpp14::make_unique<CPPTopKVKernel>(); + ARM_COMPUTE_LOG_PARAMS(predictions, targets, output, k); + + auto kernel = std::make_unique<CPPTopKVKernel>(); kernel->configure(predictions, targets, output, k); _kernel = std::move(kernel); } -Status CPPTopKV::validate(const ITensorInfo *predictions, const ITensorInfo *targets, ITensorInfo *output, const unsigned int k) +Status CPPTopKV::validate(const ITensorInfo *predictions, + const ITensorInfo *targets, + ITensorInfo *output, + const unsigned int k) { return CPPTopKVKernel::validate(predictions, targets, output, k); } diff --git a/src/runtime/CPP/functions/CPPUpsample.cpp b/src/runtime/CPP/functions/CPPUpsample.cpp index a154b5ee66..8f72473aeb 100644 --- a/src/runtime/CPP/functions/CPPUpsample.cpp +++ b/src/runtime/CPP/functions/CPPUpsample.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 ARM Limited. + * Copyright (c) 2017-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,13 +24,16 @@ #include "arm_compute/runtime/CPP/functions/CPPUpsample.h" #include "arm_compute/core/CPP/kernels/CPPUpsampleKernel.h" -#include "support/MemorySupport.h" + +#include "src/common/utils/Log.h" using namespace arm_compute; void CPPUpsample::configure(const ITensor *input, ITensor *output, const PadStrideInfo &info) { - auto k = arm_compute::support::cpp14::make_unique<CPPUpsampleKernel>(); + ARM_COMPUTE_LOG_PARAMS(input, output, info); + + auto k = std::make_unique<CPPUpsampleKernel>(); k->configure(input, output, info); _kernel = std::move(k); } diff --git a/src/runtime/CPUUtils.cpp b/src/runtime/CPUUtils.cpp deleted file mode 100644 index d8f01a9066..0000000000 --- a/src/runtime/CPUUtils.cpp +++ /dev/null @@ -1,456 +0,0 @@ -/* - * Copyright (c) 2018-2020 ARM Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/CPUUtils.h" - -#include "arm_compute/core/CPP/CPPTypes.h" -#include "arm_compute/core/Error.h" -#include "support/StringSupport.h" - -#include <algorithm> -#include <array> -#include <cstdlib> -#include <cstring> -#include <fstream> -#include <map> - -#ifndef BARE_METAL -/* C++ std::regex takes up a lot of space in the standalone builds */ -#include <regex.h> -#include <thread> -#endif /* BARE_METAL */ - -#if !defined(BARE_METAL) && (defined(__arm__) || defined(__aarch64__)) -#include <sys/auxv.h> - -/* Get HWCAP bits from asm/hwcap.h */ -#include <asm/hwcap.h> -#endif /* !BARE_METAL */ - -/* Make sure the bits we care about are defined, just in case asm/hwcap.h is - * out of date (or for bare metal mode) */ -#ifndef HWCAP_ASIMDHP -#define HWCAP_ASIMDHP (1 << 10) // NOLINT -#endif /* HWCAP_ASIMDHP */ - -#ifndef HWCAP_CPUID -#define HWCAP_CPUID (1 << 11) // NOLINT -#endif /* HWCAP_CPUID */ - -#ifndef HWCAP_ASIMDDP -#define HWCAP_ASIMDDP (1 << 20) // NOLINT -#endif /* HWCAP_ASIMDDP */ - -namespace -{ -using namespace arm_compute; - -#if !defined(BARE_METAL) && (defined(__arm__) || defined(__aarch64__)) - -bool model_supports_dot(CPUModel model) -{ - switch(model) - { - case CPUModel::GENERIC_FP16_DOT: - case CPUModel::A55r1: - return true; - default: - return false; - } -} - -bool model_supports_fp16(CPUModel model) -{ - switch(model) - { - case CPUModel::GENERIC_FP16: - case CPUModel::GENERIC_FP16_DOT: - case CPUModel::A55r1: - return true; - default: - return false; - } -} - -/* Convert an MIDR register value to a CPUModel enum value. */ -CPUModel midr_to_model(const unsigned int midr) -{ - CPUModel model = CPUModel::GENERIC; - - // Unpack variant and CPU ID - const int implementer = (midr >> 24) & 0xFF; - const int variant = (midr >> 20) & 0xF; - const int cpunum = (midr >> 4) & 0xFFF; - - if(implementer == 0x41) // Arm CPUs - { - // Only CPUs we have code paths for are detected. 
All other CPUs can be safely classed as "GENERIC" - switch(cpunum) - { - case 0xd03: // A53 - case 0xd04: // A35 - model = CPUModel::A53; - break; - case 0xd05: // A55 - if(variant != 0) - { - model = CPUModel::A55r1; - } - else - { - model = CPUModel::A55r0; - } - break; - case 0xd0a: // A75 - if(variant != 0) - { - model = CPUModel::GENERIC_FP16_DOT; - } - else - { - model = CPUModel::GENERIC_FP16; - } - break; - case 0xd0b: // A76 - case 0xd06: - case 0xd0c: - case 0xd0d: - model = CPUModel::GENERIC_FP16_DOT; - break; - default: - model = CPUModel::GENERIC; - break; - } - } - else if(implementer == 0x48) - { - // Only CPUs we have code paths for are detected. All other CPUs can be safely classed as "GENERIC" - switch(cpunum) - { - case 0xd40: // A76 - model = CPUModel::GENERIC_FP16_DOT; - break; - default: - model = CPUModel::GENERIC; - break; - } - } - - return model; -} - -void populate_models_cpuid(std::vector<CPUModel> &cpusv) -{ - // If the CPUID capability is present, MIDR information is provided in /sys. Use that to populate the CPU model table. - uint32_t i = 0; - for(auto &c : cpusv) - { - std::stringstream str; - str << "/sys/devices/system/cpu/cpu" << i++ << "/regs/identification/midr_el1"; - std::ifstream file; - file.open(str.str(), std::ios::in); - if(file.is_open()) - { - std::string line; - if(bool(getline(file, line))) - { - const uint32_t midr = support::cpp11::stoul(line, nullptr, support::cpp11::NumericBase::BASE_16); - c = midr_to_model(midr & 0xffffffff); - } - } - } -} - -void populate_models_cpuinfo(std::vector<CPUModel> &cpusv) -{ - regex_t proc_regex; - regex_t imp_regex; - regex_t var_regex; - regex_t part_regex; - regex_t rev_regex; - - memset(&proc_regex, 0, sizeof(regex_t)); - memset(&imp_regex, 0, sizeof(regex_t)); - memset(&var_regex, 0, sizeof(regex_t)); - memset(&part_regex, 0, sizeof(regex_t)); - memset(&rev_regex, 0, sizeof(regex_t)); - - int ret_status = 0; - // If "long-form" cpuinfo is present, parse that to populate models. - ret_status |= regcomp(&proc_regex, R"(^processor.*([[:digit:]]+)$)", REG_EXTENDED); - ret_status |= regcomp(&imp_regex, R"(^CPU implementer.*0x(..)$)", REG_EXTENDED); - ret_status |= regcomp(&var_regex, R"(^CPU variant.*0x(.)$)", REG_EXTENDED); - ret_status |= regcomp(&part_regex, R"(^CPU part.*0x(...)$)", REG_EXTENDED); - ret_status |= regcomp(&rev_regex, R"(^CPU revision.*([[:digit:]]+)$)", REG_EXTENDED); - ARM_COMPUTE_UNUSED(ret_status); - ARM_COMPUTE_ERROR_ON_MSG(ret_status != 0, "Regex compilation failed."); - - std::ifstream file; - file.open("/proc/cpuinfo", std::ios::in); - - if(file.is_open()) - { - std::string line; - int midr = 0; - int curcpu = -1; - - while(bool(getline(file, line))) - { - std::array<regmatch_t, 2> match; - ret_status = regexec(&proc_regex, line.c_str(), 2, match.data(), 0); - if(ret_status == 0) - { - std::string id = line.substr(match[1].rm_so, (match[1].rm_eo - match[1].rm_so)); - int newcpu = support::cpp11::stoi(id, nullptr); - - if(curcpu >= 0 && midr == 0) - { - // Matched a new CPU ID without any description of the previous one - looks like old format. 
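// Aside: a minimal, self-contained sketch of the MIDR_EL1 bit-unpacking that
// midr_to_model() above relies on (field layout per the Arm Architecture
// Reference Manual; 0x410FD050 is an illustrative value for a Cortex-A55 r0p0):
constexpr unsigned int midr_implementer(unsigned int midr) { return (midr >> 24) & 0xFF; } // 0x41 == Arm
constexpr unsigned int midr_variant(unsigned int midr)     { return (midr >> 20) & 0xF;  } // major revision (rN)
constexpr unsigned int midr_part(unsigned int midr)        { return (midr >> 4) & 0xFFF; } // CPU part number
constexpr unsigned int midr_revision(unsigned int midr)    { return midr & 0xF;          } // minor revision (pN)
static_assert(midr_implementer(0x410FD050) == 0x41, "implementer field: Arm");
static_assert(midr_part(0x410FD050) == 0xD05, "part field: Cortex-A55");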
- return; - } - - if(curcpu >= 0) - { - cpusv[curcpu] = midr_to_model(midr); - } - - midr = 0; - curcpu = newcpu; - - continue; - } - - ret_status = regexec(&imp_regex, line.c_str(), 2, match.data(), 0); - if(ret_status == 0) - { - std::string subexp = line.substr(match[1].rm_so, (match[1].rm_eo - match[1].rm_so)); - int impv = support::cpp11::stoi(subexp, nullptr, support::cpp11::NumericBase::BASE_16); - midr |= (impv << 24); - - continue; - } - - ret_status = regexec(&var_regex, line.c_str(), 2, match.data(), 0); - if(ret_status == 0) - { - std::string subexp = line.substr(match[1].rm_so, (match[1].rm_eo - match[1].rm_so)); - int varv = support::cpp11::stoi(subexp, nullptr, support::cpp11::NumericBase::BASE_16); - midr |= (varv << 20); - - continue; - } - - ret_status = regexec(&part_regex, line.c_str(), 2, match.data(), 0); - if(ret_status == 0) - { - std::string subexp = line.substr(match[1].rm_so, (match[1].rm_eo - match[1].rm_so)); - int partv = support::cpp11::stoi(subexp, nullptr, support::cpp11::NumericBase::BASE_16); - midr |= (partv << 4); - - continue; - } - - ret_status = regexec(&rev_regex, line.c_str(), 2, match.data(), 0); - if(ret_status == 0) - { - std::string subexp = line.substr(match[1].rm_so, (match[1].rm_eo - match[1].rm_so)); - int regv = support::cpp11::stoi(subexp, nullptr); - midr |= (regv); - midr |= (0xf << 16); - - continue; - } - } - - if(curcpu >= 0) - { - cpusv[curcpu] = midr_to_model(midr); - } - } - - // Free allocated memory - regfree(&proc_regex); - regfree(&imp_regex); - regfree(&var_regex); - regfree(&part_regex); - regfree(&rev_regex); -} - -int get_max_cpus() -{ - int max_cpus = 1; - std::ifstream CPUspresent; - CPUspresent.open("/sys/devices/system/cpu/present", std::ios::in); - bool success = false; - - if(CPUspresent.is_open()) - { - std::string line; - - if(bool(getline(CPUspresent, line))) - { - /* The content of this file is a list of ranges or single values, e.g. - * 0-5, or 1-3,5,7 or similar. As we are interested in the - * max valid ID, we just need to find the last valid - * delimiter ('-' or ',') and parse the integer immediately after that. - */ - auto startfrom = line.begin(); - - for(auto i = line.begin(); i < line.end(); ++i) - { - if(*i == '-' || *i == ',') - { - startfrom = i + 1; - } - } - - line.erase(line.begin(), startfrom); - - max_cpus = support::cpp11::stoi(line, nullptr) + 1; - success = true; - } - } - - // Return std::thread::hardware_concurrency() as a fallback. 
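// Aside: the /sys/devices/system/cpu/present parsing above in sketch form
// (parse_max_cpus_sketch is a hypothetical helper, assuming <string> is
// available; not the library's support wrappers): given a list such as
// "0-3,5,7", take the integer after the last '-' or ',' and add one.
inline int parse_max_cpus_sketch(const std::string &line)
{
    const std::size_t pos = line.find_last_of("-,");
    // "0-3,5,7" -> tail "7" -> 7 + 1 == 8 CPUs
    return std::stoi(pos == std::string::npos ? line : line.substr(pos + 1)) + 1;
}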
- if(!success) - { - max_cpus = std::thread::hardware_concurrency(); - } - return max_cpus; -} -#endif /* !defined(BARE_METAL) && (defined(__arm__) || defined(__aarch64__)) */ - -} // namespace - -namespace arm_compute -{ -void get_cpu_configuration(CPUInfo &cpuinfo) -{ -#if !defined(BARE_METAL) && (defined(__arm__) || defined(__aarch64__)) - bool cpuid = false; - bool hwcaps_fp16_support = false; - bool hwcaps_dot_support = false; - - const uint32_t hwcaps = getauxval(AT_HWCAP); - - if((hwcaps & HWCAP_CPUID) != 0) - { - cpuid = true; - } - - if((hwcaps & HWCAP_ASIMDHP) != 0) - { - hwcaps_fp16_support = true; - } - -#if defined(__aarch64__) - if((hwcaps & HWCAP_ASIMDDP) != 0) - { - hwcaps_dot_support = true; - } -#endif /* defined(__aarch64__) */ - - const unsigned int max_cpus = get_max_cpus(); - cpuinfo.set_cpu_num(max_cpus); - std::vector<CPUModel> percpu(max_cpus, CPUModel::GENERIC); - if(cpuid) - { - populate_models_cpuid(percpu); - } - else - { - populate_models_cpuinfo(percpu); - } - int j(0); - // Update dot product and FP16 support if one of the CPUs support these features - // We assume that the system does not have mixed architectures - bool one_supports_dot = false; - bool one_supports_fp16 = false; - for(const auto &v : percpu) - { - one_supports_dot = one_supports_dot || model_supports_dot(v); - one_supports_fp16 = one_supports_fp16 || model_supports_fp16(v); - cpuinfo.set_cpu_model(j++, v); - } - cpuinfo.set_dotprod(one_supports_dot || hwcaps_dot_support); - cpuinfo.set_fp16(one_supports_fp16 || hwcaps_fp16_support); -#else /* !defined(BARE_METAL) && (defined(__arm__) || defined(__aarch64__)) */ - ARM_COMPUTE_UNUSED(cpuinfo); -#endif /* !defined(BARE_METAL) && (defined(__arm__) || defined(__aarch64__)) */ -} - -unsigned int get_threads_hint() -{ - unsigned int num_threads_hint = 1; - -#if !defined(BARE_METAL) - std::map<std::string, unsigned int> cpu_part_occurrence_map; - - // CPU part regex - regex_t cpu_part_rgx; - memset(&cpu_part_rgx, 0, sizeof(regex_t)); - int ret_status = regcomp(&cpu_part_rgx, R"(.*CPU part.+/?\:[[:space:]]+([[:alnum:]]+).*)", REG_EXTENDED); - ARM_COMPUTE_UNUSED(ret_status); - ARM_COMPUTE_ERROR_ON_MSG(ret_status != 0, "Regex compilation failed."); - - // Read cpuinfo and get occurrence of each core - std::ifstream cpuinfo; - cpuinfo.open("/proc/cpuinfo", std::ios::in); - if(cpuinfo.is_open()) - { - std::string line; - while(bool(getline(cpuinfo, line))) - { - std::array<regmatch_t, 2> match; - ret_status = regexec(&cpu_part_rgx, line.c_str(), 2, match.data(), 0); - if(ret_status == 0) - { - std::string cpu_part = line.substr(match[1].rm_so, (match[1].rm_eo - match[1].rm_so)); - if(cpu_part_occurrence_map.find(cpu_part) != cpu_part_occurrence_map.end()) - { - cpu_part_occurrence_map[cpu_part]++; - } - else - { - cpu_part_occurrence_map[cpu_part] = 1; - } - } - } - } - regfree(&cpu_part_rgx); - - // Get min number of threads - auto min_common_cores = std::min_element(cpu_part_occurrence_map.begin(), cpu_part_occurrence_map.end(), - [](const std::pair<std::string, unsigned int> &p1, const std::pair<std::string, unsigned int> &p2) - { - return p1.second < p2.second; - }); - - // Set thread hint - num_threads_hint = cpu_part_occurrence_map.empty() ? 
std::thread::hardware_concurrency() : min_common_cores->second; -#endif /* !defined(BARE_METAL) */ - - return num_threads_hint; -} - -} // namespace arm_compute diff --git a/src/runtime/Distribution1D.cpp b/src/runtime/Distribution1D.cpp deleted file mode 100644 index 7080220320..0000000000 --- a/src/runtime/Distribution1D.cpp +++ /dev/null @@ -1,40 +0,0 @@ -/* - * Copyright (c) 2016-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/Distribution1D.h" - -#include "arm_compute/core/Error.h" - -#include <cstdint> - -using namespace arm_compute; - -Distribution1D::Distribution1D(size_t num_bins, int32_t offset, uint32_t range) - : IDistribution1D(num_bins, offset, range), _data(num_bins) -{ -} - -uint32_t *Distribution1D::buffer() const -{ - return _data.data(); -} diff --git a/src/runtime/GLES_COMPUTE/GCBufferAllocator.cpp b/src/runtime/GLES_COMPUTE/GCBufferAllocator.cpp deleted file mode 100644 index 70a1f4f8ff..0000000000 --- a/src/runtime/GLES_COMPUTE/GCBufferAllocator.cpp +++ /dev/null @@ -1,59 +0,0 @@ -/* - * Copyright (c) 2018 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "arm_compute/runtime/GLES_COMPUTE/GCBufferAllocator.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h" -#include "arm_compute/runtime/GLES_COMPUTE/GCMemoryRegion.h" - -#include <cstddef> - -namespace arm_compute -{ -void *GCBufferAllocator::allocate(size_t size, size_t alignment) -{ - ARM_COMPUTE_UNUSED(alignment); - - auto *gl_ssbo_name = new GLuint; - ARM_COMPUTE_GL_CHECK(glBindBuffer(GL_SHADER_STORAGE_BUFFER, *gl_ssbo_name)); - ARM_COMPUTE_GL_CHECK(glBufferData(GL_SHADER_STORAGE_BUFFER, static_cast<GLsizeiptr>(size), nullptr, GL_STATIC_DRAW)); - ARM_COMPUTE_GL_CHECK(glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0)); - - return reinterpret_cast<void *>(gl_ssbo_name); -} - -void GCBufferAllocator::free(void *ptr) -{ - ARM_COMPUTE_ERROR_ON(ptr == nullptr); - auto *gl_ssbo_name = reinterpret_cast<GLuint *>(ptr); - ARM_COMPUTE_GL_CHECK(glDeleteBuffers(1, gl_ssbo_name)); - delete gl_ssbo_name; -} - -std::unique_ptr<IMemoryRegion> GCBufferAllocator::make_region(size_t size, size_t alignment) -{ - ARM_COMPUTE_UNUSED(alignment); - return arm_compute::support::cpp14::make_unique<GCBufferMemoryRegion>(size); -} -} // namespace arm_compute diff --git a/src/runtime/GLES_COMPUTE/GCHelpers.cpp b/src/runtime/GLES_COMPUTE/GCHelpers.cpp deleted file mode 100644 index df2f4f5e6e..0000000000 --- a/src/runtime/GLES_COMPUTE/GCHelpers.cpp +++ /dev/null @@ -1,85 +0,0 @@ -/* - * Copyright (c) 2019 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#include "arm_compute/runtime/GLES_COMPUTE/GCHelpers.h" - -#include "arm_compute/core/Error.h" - -namespace arm_compute -{ -std::tuple<EGLDisplay, EGLContext, EGLBoolean> create_opengl_display_and_context() -{ - EGLBoolean res; - EGLDisplay display = eglGetDisplay(EGL_DEFAULT_DISPLAY); - - ARM_COMPUTE_ERROR_ON_MSG_VAR(display == EGL_NO_DISPLAY, "Failed to get display: 0x%x.", eglGetError()); - - res = eglInitialize(display, nullptr, nullptr); - - ARM_COMPUTE_ERROR_ON_MSG_VAR(res == EGL_FALSE, "Failed to initialize egl: 0x%x.", eglGetError()); - ARM_COMPUTE_UNUSED(res); - - const char *egl_extension_st = eglQueryString(display, EGL_EXTENSIONS); - ARM_COMPUTE_ERROR_ON_MSG((strstr(egl_extension_st, "EGL_KHR_create_context") == nullptr), "Failed to query EGL_KHR_create_context"); - ARM_COMPUTE_ERROR_ON_MSG((strstr(egl_extension_st, "EGL_KHR_surfaceless_context") == nullptr), "Failed to query EGL_KHR_surfaceless_context"); - ARM_COMPUTE_UNUSED(egl_extension_st); - - const std::array<EGLint, 3> config_attribs = - { - EGL_RENDERABLE_TYPE, EGL_OPENGL_ES3_BIT_KHR, - EGL_NONE - }; - EGLConfig cfg; - EGLint count; - - res = eglChooseConfig(display, config_attribs.data(), &cfg, 1, &count); - - ARM_COMPUTE_ERROR_ON_MSG_VAR(res == EGL_FALSE, "Failed to choose config: 0x%x.", eglGetError()); - ARM_COMPUTE_UNUSED(res); - - res = eglBindAPI(EGL_OPENGL_ES_API); - - ARM_COMPUTE_ERROR_ON_MSG_VAR(res == EGL_FALSE, "Failed to bind api: 0x%x.", eglGetError()); - - const std::array<EGLint, 3> attribs = - { - EGL_CONTEXT_CLIENT_VERSION, 3, - EGL_NONE - }; - EGLContext context = eglCreateContext(display, - cfg, - EGL_NO_CONTEXT, - attribs.data()); - - ARM_COMPUTE_ERROR_ON_MSG_VAR(context == EGL_NO_CONTEXT, "Failed to create context: 0x%x.", eglGetError()); - ARM_COMPUTE_UNUSED(res); - - res = eglMakeCurrent(display, EGL_NO_SURFACE, EGL_NO_SURFACE, context); - - ARM_COMPUTE_ERROR_ON_MSG_VAR(res == EGL_FALSE, "Failed to make current: 0x%x.", eglGetError()); - ARM_COMPUTE_UNUSED(res); - - return std::make_tuple(display, context, res); -} -} // namespace arm_compute diff --git a/src/runtime/GLES_COMPUTE/GCMemory.cpp b/src/runtime/GLES_COMPUTE/GCMemory.cpp deleted file mode 100644 index f1457c4d6e..0000000000 --- a/src/runtime/GLES_COMPUTE/GCMemory.cpp +++ /dev/null @@ -1,81 +0,0 @@ -/* - * Copyright (c) 2018-2019 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "arm_compute/runtime/GLES_COMPUTE/GCMemory.h" - -#include "arm_compute/core/utils/misc/Cast.h" -#include "arm_compute/runtime/GLES_COMPUTE/GCMemoryRegion.h" - -namespace arm_compute -{ -GCMemory::GCMemory() - : _region(nullptr), _region_owned(nullptr) -{ -} - -GCMemory::GCMemory(const std::shared_ptr<IGCMemoryRegion> &memory) - : _region(nullptr), _region_owned(memory) -{ - _region_owned = memory; - _region = _region_owned.get(); -} - -GCMemory::GCMemory(IGCMemoryRegion *memory) - : _region(memory), _region_owned(nullptr) -{ - _region = memory; -} - -IGCMemoryRegion *GCMemory::gc_region() -{ - return _region; -} - -IGCMemoryRegion *GCMemory::gc_region() const -{ - return _region; -} - -IMemoryRegion *GCMemory::region() -{ - return _region; -} - -IMemoryRegion *GCMemory::region() const -{ - return _region; -} - -void GCMemory::set_region(IMemoryRegion *region) -{ - auto gc_region = utils::cast::polymorphic_downcast<IGCMemoryRegion *>(region); - _region_owned = nullptr; - _region = gc_region; -} - -void GCMemory::set_owned_region(std::unique_ptr<IMemoryRegion> region) -{ - _region_owned = utils::cast::polymorphic_downcast_unique_ptr<IGCMemoryRegion>(std::move(region)); - _region = _region_owned.get(); -} -} // namespace arm_compute diff --git a/src/runtime/GLES_COMPUTE/GCMemoryRegion.cpp b/src/runtime/GLES_COMPUTE/GCMemoryRegion.cpp deleted file mode 100644 index 2ffd9f2ffc..0000000000 --- a/src/runtime/GLES_COMPUTE/GCMemoryRegion.cpp +++ /dev/null @@ -1,96 +0,0 @@ -/* - * Copyright (c) 2018-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "arm_compute/runtime/GLES_COMPUTE/GCMemoryRegion.h" - -#include "arm_compute/core/Error.h" - -namespace arm_compute -{ -IGCMemoryRegion::IGCMemoryRegion(size_t size) - : IMemoryRegion(size), _mapping(nullptr), _ssbo_name(0) -{ -} - -const GLuint &IGCMemoryRegion::gc_ssbo_name() const -{ - return _ssbo_name; -} - -void *IGCMemoryRegion::buffer() -{ - return _mapping; -} - -const void *IGCMemoryRegion::buffer() const -{ - return _mapping; -} - -GCBufferMemoryRegion::GCBufferMemoryRegion(size_t size) - : IGCMemoryRegion(size) -{ - ARM_COMPUTE_GL_CHECK(glGenBuffers(1, &_ssbo_name)); - ARM_COMPUTE_GL_CHECK(glBindBuffer(GL_SHADER_STORAGE_BUFFER, _ssbo_name)); - ARM_COMPUTE_GL_CHECK(glBufferData(GL_SHADER_STORAGE_BUFFER, static_cast<GLsizeiptr>(size), nullptr, GL_STATIC_DRAW)); - ARM_COMPUTE_GL_CHECK(glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0)); -} - -GCBufferMemoryRegion::~GCBufferMemoryRegion() -{ - ARM_COMPUTE_GL_CHECK(glDeleteBuffers(1, &_ssbo_name)); -} - -void *GCBufferMemoryRegion::ptr() -{ - return nullptr; -} - -void *GCBufferMemoryRegion::map(bool blocking) -{ - ARM_COMPUTE_ERROR_ON(_mapping != nullptr); - ARM_COMPUTE_UNUSED(blocking); - - ARM_COMPUTE_GL_CHECK(glBindBuffer(GL_SHADER_STORAGE_BUFFER, _ssbo_name)); - void *p = ARM_COMPUTE_GL_CHECK(glMapBufferRange(GL_SHADER_STORAGE_BUFFER, 0, static_cast<GLsizeiptr>(size()), GL_MAP_READ_BIT | GL_MAP_WRITE_BIT)); - _mapping = reinterpret_cast<uint8_t *>(p); - - return _mapping; -} - -void GCBufferMemoryRegion::unmap() -{ - ARM_COMPUTE_ERROR_ON(_mapping == nullptr); - - ARM_COMPUTE_GL_CHECK(glBindBuffer(GL_SHADER_STORAGE_BUFFER, _ssbo_name)); - ARM_COMPUTE_GL_CHECK(glUnmapBuffer(GL_SHADER_STORAGE_BUFFER)); - ARM_COMPUTE_GL_CHECK(glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0)); - _mapping = nullptr; -} - -std::unique_ptr<IMemoryRegion> GCBufferMemoryRegion::extract_subregion(size_t offset, size_t size) -{ - ARM_COMPUTE_UNUSED(offset, size); - return nullptr; -} -} // namespace arm_compute
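For reference, the essence of the removed GCBufferMemoryRegion in raw GLES 3.1 calls — a sketch assuming a current EGL context and the <GLES3/gl31.h> header, with the library's error checking omitted:

#include <GLES3/gl31.h>

void ssbo_map_unmap_sketch()
{
    const GLsizeiptr size = 1024; // example allocation size
    GLuint ssbo = 0;
    glGenBuffers(1, &ssbo);
    glBindBuffer(GL_SHADER_STORAGE_BUFFER, ssbo);
    glBufferData(GL_SHADER_STORAGE_BUFFER, size, nullptr, GL_STATIC_DRAW); // mirrors the constructor
    void *p = glMapBufferRange(GL_SHADER_STORAGE_BUFFER, 0, size,
                               GL_MAP_READ_BIT | GL_MAP_WRITE_BIT);        // mirrors map()
    (void)p;                                                               // host-visible pointer, valid until unmapped
    glUnmapBuffer(GL_SHADER_STORAGE_BUFFER);                               // mirrors unmap()
    glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0);
    glDeleteBuffers(1, &ssbo);                                             // mirrors the destructor
}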
\ No newline at end of file diff --git a/src/runtime/GLES_COMPUTE/GCScheduler.cpp b/src/runtime/GLES_COMPUTE/GCScheduler.cpp deleted file mode 100644 index a45d7931be..0000000000 --- a/src/runtime/GLES_COMPUTE/GCScheduler.cpp +++ /dev/null @@ -1,142 +0,0 @@ -/* - * Copyright (c) 2017-2019 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h" - -#include "arm_compute/core/GLES_COMPUTE/GCHelpers.h" -#include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h" - -using namespace arm_compute; - -std::once_flag GCScheduler::_initialize_symbols; - -GCScheduler::GCScheduler() - : _display(EGL_NO_DISPLAY), _context(EGL_NO_CONTEXT), _target(GPUTarget::MIDGARD) -{ -} - -GCScheduler::~GCScheduler() -{ - eglDestroyContext(_display, _context); - eglTerminate(_display); - - _context = EGL_NO_CONTEXT; - _display = EGL_NO_DISPLAY; -} - -void GCScheduler::default_init() -{ - setup_context(); - - init(_display, _context); -} - -void GCScheduler::default_init_with_context(EGLDisplay display, EGLContext ctx) -{ - _context = ctx; - _display = display; - - _target = get_target_from_device(); -} - -void GCScheduler::init(EGLDisplay dpy, EGLContext ctx) -{ - _target = get_target_from_device(); - - GCKernelLibrary::get().init("./cs_shaders/", dpy, ctx); -} - -GCScheduler &GCScheduler::get() -{ - std::call_once(_initialize_symbols, opengles31_is_available); - static GCScheduler scheduler; - return scheduler; -} - -void GCScheduler::dispatch(IGCKernel &kernel, bool flush) -{ - kernel.run(kernel.window()); - if(flush) - { - ARM_COMPUTE_GL_CHECK(glFlush()); - } -} - -void GCScheduler::memory_barrier() -{ - ARM_COMPUTE_GL_CHECK(glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT)); -} - -void GCScheduler::setup_context() -{ - EGLBoolean res; - _display = eglGetDisplay(EGL_DEFAULT_DISPLAY); - - ARM_COMPUTE_ERROR_ON_MSG_VAR(_display == EGL_NO_DISPLAY, "Failed to get display: 0x%x.", eglGetError()); - - res = eglInitialize(_display, nullptr, nullptr); - - ARM_COMPUTE_ERROR_ON_MSG_VAR(res == EGL_FALSE, "Failed to initialize egl: 0x%x.", eglGetError()); - ARM_COMPUTE_UNUSED(res); - - const char *egl_extension_st = eglQueryString(_display, EGL_EXTENSIONS); - ARM_COMPUTE_ERROR_ON_MSG((strstr(egl_extension_st, "EGL_KHR_create_context") == nullptr), "Failed to query EGL_KHR_create_context"); - ARM_COMPUTE_ERROR_ON_MSG((strstr(egl_extension_st, "EGL_KHR_surfaceless_context") == nullptr), "Failed to query 
EGL_KHR_surfaceless_context"); - ARM_COMPUTE_UNUSED(egl_extension_st); - - const std::array<EGLint, 3> config_attribs = - { - EGL_RENDERABLE_TYPE, EGL_OPENGL_ES3_BIT_KHR, - EGL_NONE - }; - EGLConfig cfg; - EGLint count; - - res = eglChooseConfig(_display, config_attribs.data(), &cfg, 1, &count); - - ARM_COMPUTE_ERROR_ON_MSG_VAR(res == EGL_FALSE, "Failed to choose config: 0x%x.", eglGetError()); - ARM_COMPUTE_UNUSED(res); - - res = eglBindAPI(EGL_OPENGL_ES_API); - - ARM_COMPUTE_ERROR_ON_MSG_VAR(res == EGL_FALSE, "Failed to bind api: 0x%x.", eglGetError()); - - const std::array<EGLint, 3> attribs = - { - EGL_CONTEXT_CLIENT_VERSION, 3, - EGL_NONE - }; - _context = eglCreateContext(_display, - cfg, - EGL_NO_CONTEXT, - attribs.data()); - - ARM_COMPUTE_ERROR_ON_MSG_VAR(_context == EGL_NO_CONTEXT, "Failed to create context: 0x%x.", eglGetError()); - ARM_COMPUTE_UNUSED(res); - - res = eglMakeCurrent(_display, EGL_NO_SURFACE, EGL_NO_SURFACE, _context); - - ARM_COMPUTE_ERROR_ON_MSG_VAR(res == EGL_FALSE, "Failed to make current: 0x%x.", eglGetError()); - ARM_COMPUTE_UNUSED(res); -} diff --git a/src/runtime/GLES_COMPUTE/GCTensorAllocator.cpp b/src/runtime/GLES_COMPUTE/GCTensorAllocator.cpp deleted file mode 100644 index 61523bcc31..0000000000 --- a/src/runtime/GLES_COMPUTE/GCTensorAllocator.cpp +++ /dev/null @@ -1,105 +0,0 @@ -/* - * Copyright (c) 2017-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#include "arm_compute/runtime/GLES_COMPUTE/GCTensorAllocator.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/runtime/GLES_COMPUTE/GCMemoryRegion.h" -#include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h" -#include "support/MemorySupport.h" - -using namespace arm_compute; - -GCTensorAllocator::GCTensorAllocator(IMemoryManageable *owner) - : _owner(owner), _associated_memory_group(nullptr), _memory(), _mapping(nullptr) -{ -} - -uint8_t *GCTensorAllocator::data() -{ - return _mapping; -} - -void GCTensorAllocator::allocate() -{ - if(_associated_memory_group == nullptr) - { - _memory.set_owned_region(support::cpp14::make_unique<GCBufferMemoryRegion>(info().total_size())); - } - else - { - _associated_memory_group->finalize_memory(_owner, _memory, info().total_size(), alignment()); - } - info().set_is_resizable(false); -} - -void GCTensorAllocator::free() -{ - _mapping = nullptr; - _memory.set_region(nullptr); - info().set_is_resizable(true); -} - -void GCTensorAllocator::set_associated_memory_group(IMemoryGroup *associated_memory_group) -{ - ARM_COMPUTE_ERROR_ON(associated_memory_group == nullptr); - ARM_COMPUTE_ERROR_ON(_associated_memory_group != nullptr && _associated_memory_group != associated_memory_group); - ARM_COMPUTE_ERROR_ON(_memory.region() != nullptr && _memory.gc_region()->gc_ssbo_name() != 0); - - _associated_memory_group = associated_memory_group; -} - -uint8_t *GCTensorAllocator::lock() -{ - return map(true); -} - -void GCTensorAllocator::unlock() -{ - unmap(); -} - -GLuint GCTensorAllocator::get_gl_ssbo_name() const -{ - return (_memory.region() == nullptr) ? static_cast<GLuint>(0) : _memory.gc_region()->gc_ssbo_name(); -} - -uint8_t *GCTensorAllocator::map(bool blocking) -{ - ARM_COMPUTE_ERROR_ON(_mapping != nullptr); - ARM_COMPUTE_ERROR_ON(_memory.region() == nullptr); - - _mapping = reinterpret_cast<uint8_t *>(_memory.gc_region()->map(blocking)); - return _mapping; -} - -void GCTensorAllocator::unmap() -{ - ARM_COMPUTE_ERROR_ON(_mapping == nullptr); - ARM_COMPUTE_ERROR_ON(_memory.region() == nullptr); - - _memory.gc_region()->unmap(); - _mapping = nullptr; -} diff --git a/src/runtime/GLES_COMPUTE/IGCSimpleFunction.cpp b/src/runtime/GLES_COMPUTE/IGCSimpleFunction.cpp deleted file mode 100644 index bb9239eabe..0000000000 --- a/src/runtime/GLES_COMPUTE/IGCSimpleFunction.cpp +++ /dev/null @@ -1,47 +0,0 @@ -/* - * Copyright (c) 2017-2019 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/GLES_COMPUTE/IGCSimpleFunction.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h" - -using namespace arm_compute; - -IGCSimpleFunction::IGCSimpleFunction(GCRuntimeContext *ctx) //NOLINT - : _kernel(), - _border_handler(), - _ctx(ctx) -{ -} - -void IGCSimpleFunction::run() -{ - ARM_COMPUTE_ERROR_ON_MSG(!_kernel, "The child class didn't set the GLES kernel or function isn't configured"); - GCScheduler *scheduler = (_ctx != nullptr) ? _ctx->gpu_scheduler() : &GCScheduler::get().get(); - ARM_COMPUTE_ERROR_ON(scheduler == nullptr); - - scheduler->dispatch(_border_handler, false); - scheduler->memory_barrier(); - scheduler->dispatch(*_kernel); -} diff --git a/src/runtime/GLES_COMPUTE/functions/GCAbsoluteDifference.cpp b/src/runtime/GLES_COMPUTE/functions/GCAbsoluteDifference.cpp deleted file mode 100644 index 5098dd786d..0000000000 --- a/src/runtime/GLES_COMPUTE/functions/GCAbsoluteDifference.cpp +++ /dev/null @@ -1,40 +0,0 @@ -/* - * Copyright (c) 2017-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "arm_compute/runtime/GLES_COMPUTE/functions/GCAbsoluteDifference.h" - -#include "arm_compute/core/GLES_COMPUTE/kernels/GCAbsoluteDifferenceKernel.h" -#include "arm_compute/core/Helpers.h" -#include "support/MemorySupport.h" - -#include <utility> - -using namespace arm_compute; - -void GCAbsoluteDifference::configure(const IGCTensor *input1, const IGCTensor *input2, IGCTensor *output) -{ - auto k = arm_compute::support::cpp14::make_unique<GCAbsoluteDifferenceKernel>(); - k->configure(input1, input2, output); - _kernel = std::move(k); -} diff --git a/src/runtime/GLES_COMPUTE/functions/GCArithmeticAddition.cpp b/src/runtime/GLES_COMPUTE/functions/GCArithmeticAddition.cpp deleted file mode 100755 index b0d8a3cf9f..0000000000 --- a/src/runtime/GLES_COMPUTE/functions/GCArithmeticAddition.cpp +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Copyright (c) 2016-2020 ARM Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/GLES_COMPUTE/functions/GCArithmeticAddition.h" - -#include "arm_compute/core/GLES_COMPUTE/kernels/GCArithmeticAdditionKernel.h" -#include "support/MemorySupport.h" - -#include <utility> - -using namespace arm_compute; - -void GCArithmeticAddition::configure(const IGCTensor *input1, const IGCTensor *input2, IGCTensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info) -{ - ARM_COMPUTE_UNUSED(act_info); - auto k = arm_compute::support::cpp14::make_unique<GCArithmeticAdditionKernel>(); - k->configure(input1, input2, output, policy); - _kernel = std::move(k); -} - -Status GCArithmeticAddition::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info) -{ - ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled()); - return GCArithmeticAdditionKernel::validate(input1, input2, output, policy); -} diff --git a/src/runtime/GLES_COMPUTE/functions/GCBatchNormalizationLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCBatchNormalizationLayer.cpp deleted file mode 100755 index cc5e8f49f2..0000000000 --- a/src/runtime/GLES_COMPUTE/functions/GCBatchNormalizationLayer.cpp +++ /dev/null @@ -1,49 +0,0 @@ -/* - * Copyright (c) 2017-2018 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#include "arm_compute/runtime/GLES_COMPUTE/functions/GCBatchNormalizationLayer.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h" - -using namespace arm_compute; - -GCBatchNormalizationLayer::GCBatchNormalizationLayer() - : _norm_kernel() -{ -} - -void GCBatchNormalizationLayer::configure(const IGCTensor *input, IGCTensor *output, const IGCTensor *mean, const IGCTensor *var, const IGCTensor *beta, const IGCTensor *gamma, float epsilon, - ActivationLayerInfo act_info) -{ - _norm_kernel.configure(input, output, mean, var, beta, gamma, epsilon, act_info); -} - -void GCBatchNormalizationLayer::run() -{ - GCScheduler::get().dispatch(_norm_kernel, true); -} diff --git a/src/runtime/GLES_COMPUTE/functions/GCConcatenateLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCConcatenateLayer.cpp deleted file mode 100644 index 81e98f1ba2..0000000000 --- a/src/runtime/GLES_COMPUTE/functions/GCConcatenateLayer.cpp +++ /dev/null @@ -1,81 +0,0 @@ -/* - * Copyright (c) 2019-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "arm_compute/runtime/GLES_COMPUTE/functions/GCConcatenateLayer.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h" -#include "arm_compute/core/PixelValue.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h" -#include "support/MemorySupport.h" - -namespace arm_compute -{ -GCConcatenateLayer::GCConcatenateLayer() - : _concat_kernels(), - _num_inputs(0), - _axis(Window::DimZ) -{ -} - -void GCConcatenateLayer::configure(std::vector<IGCTensor *> inputs_vector, IGCTensor *output, size_t axis) -{ - ARM_COMPUTE_ERROR_ON(inputs_vector.size() < 2); - - _num_inputs = inputs_vector.size(); - _axis = axis; - - TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, axis); - - // Output auto inizialitation if not yet initialized - auto_init_if_empty(*output->info(), output_shape, 1, inputs_vector[0]->info()->data_type()); - - unsigned int offset = 0; - switch(axis) - { - case Window::DimZ: - { - for(unsigned int i = 0; i < _num_inputs; ++i) - { - auto kernel = support::cpp14::make_unique<GCDepthConcatenateLayerKernel>(); - kernel->configure(inputs_vector.at(i), offset, output); - offset += inputs_vector.at(i)->info()->dimension(axis); - _concat_kernels.emplace_back(std::move(kernel)); - } - break; - } - default: - ARM_COMPUTE_ERROR("Axis not supported"); - } -} - -void GCConcatenateLayer::run() -{ - for(auto &kernel : _concat_kernels) - { - GCScheduler::get().dispatch(*kernel, true); - } -} -} // namespace arm_compute diff --git a/src/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.cpp deleted file mode 100644 index 61c0740937..0000000000 --- a/src/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.cpp +++ /dev/null @@ -1,241 +0,0 @@ -/* - * Copyright (c) 2017-2019 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */
-
-#include "arm_compute/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.h"
-
-#include "arm_compute/core/PixelValue.h"
-#include "arm_compute/core/Size2D.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h"
-
-#include <cmath>
-#include <memory>
-#include <tuple>
-
-using namespace arm_compute;
-
-GCConvolutionLayerReshapeWeights::GCConvolutionLayerReshapeWeights()
-    : _weights_reshape_kernel()
-{
-}
-
-void GCConvolutionLayerReshapeWeights::configure(const IGCTensor *weights, const IGCTensor *biases, IGCTensor *output)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(weights, output);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::F16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() > 4);
-
-    if(biases != nullptr)
-    {
-        ARM_COMPUTE_ERROR_ON(is_data_type_quantized_asymmetric(weights->info()->data_type()));
-        ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases);
-        ARM_COMPUTE_ERROR_ON(biases->info()->dimension(0) != weights->info()->dimension(3));
-        ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() > 1);
-    }
-
-    const bool       append_biases = (biases != nullptr) && !is_data_type_quantized_asymmetric(weights->info()->data_type());
-    const IGCTensor *biases_to_use = (append_biases) ? biases : nullptr;
-
-    _weights_reshape_kernel.configure(weights, biases_to_use, output);
-}
-
-void GCConvolutionLayerReshapeWeights::run()
-{
-    GCScheduler::get().dispatch(_weights_reshape_kernel);
-}
-
-GCConvolutionLayer::GCConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _reshape_weights(), _input_im2col_kernel(), _mm_gemm(), _output_col2im_kernel(), _fill_border(), _activationlayer_function(), _original_weights(nullptr),
-      _input_im2col_reshaped(), _input_interleaved_reshaped(), _weights_reshaped(), _weights_transposed(), _gemm_output(), _tmp_output(), _is_activationlayer_enabled(false), _is_prepared(false)
-{
-}
-
-void GCConvolutionLayer::configure_mm(const IGCTensor *input, const IGCTensor *weights, IGCTensor *output)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_mm(input->info(), weights->info(), output->info()));
-
-    _mm_gemm.configure(input, weights, nullptr, output, 1.f, 0.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run */));
-}
-
-Status GCConvolutionLayer::validate_mm(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output)
-{
-    // Perform validation step on Matrix multiply function
-    GCGEMM::validate(input, weights, nullptr, output, 1.0f, 0.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run */));
-    return Status{};
-}
-
-void GCConvolutionLayer::configure(const IGCTensor *input, const IGCTensor *weights, const IGCTensor *biases, IGCTensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info,
-                                   const Size2D &dilation, const ActivationLayerInfo &act_info, unsigned int num_groups)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
-    ARM_COMPUTE_ERROR_ON_MSG(weights_info.are_reshaped(), "Weights already reshaped are not supported!");
-    ARM_COMPUTE_ERROR_ON(weights->info()->dimension(2) != input->info()->dimension(2));
-    ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() > 4);
-    ARM_COMPUTE_ERROR_ON(num_groups > 1);
-    ARM_COMPUTE_UNUSED(num_groups);
-
-    _is_prepared      = false;
-    _original_weights = weights;
-
-    if(biases != nullptr)
-    {
-        ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
-        ARM_COMPUTE_ERROR_ON(biases->info()->dimension(0) != weights->info()->dimension(3));
-        ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() > 1);
-    }
-
-    const DataType dt = input->info()->data_type();
-
-    // Set the GPU target for im2col and col2im
-    _input_im2col_kernel.set_target(GCScheduler::get().get_target());
-    _output_col2im_kernel.set_target(GCScheduler::get().get_target());
-
-    const bool       append_bias   = (biases != nullptr);
-    const unsigned   bias_element  = (append_bias) ? 1 : 0;
-    const IGCTensor *biases_to_use = (append_bias) ? biases : nullptr;
-
-    // Get parameters from conv_info
-    unsigned int stride_x = 0;
-    unsigned int stride_y = 0;
-    std::tie(stride_x, stride_y) = conv_info.stride();
-
-    // Get convolved dimensions
-    unsigned int conv_w = 0;
-    unsigned int conv_h = 0;
-
-    const unsigned int kernel_width  = weights->info()->dimension(0);
-    const unsigned int kernel_height = weights->info()->dimension(1);
-    std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), kernel_width, kernel_height,
-                                                 conv_info, dilation);
-
-    unsigned int mat_weights_cols = weights->info()->dimension(3);
-    unsigned int mat_weights_rows = weights->info()->dimension(0) * weights->info()->dimension(1) * weights->info()->dimension(2) + bias_element;
-
-    // _weights_reshaped will be auto configured in the kernel.
-    // Just append biases and do not transpose 1xW as it will be reshaped in GCGEMM
-    _reshape_weights.configure(weights, biases_to_use, &_weights_reshaped);
-
-    weights = &_weights_reshaped;
-
-    // Create tensor to store im2col reshaped inputs
-    const unsigned int mat_input_cols = mat_weights_rows;
-    const unsigned int mat_input_rows = conv_w * conv_h;
-    TensorShape        shape_im2col   = input->info()->tensor_shape();
-    shape_im2col.set(0, mat_input_cols);
-    shape_im2col.set(1, mat_input_rows);
-    shape_im2col.set(2, 1);
-
-    // FIXME: input->clone() doesn't work with subtensors for grouped convolutions.
-    TensorInfo im2col_reshaped_info(shape_im2col, 1, dt);
-    _input_im2col_reshaped.allocator()->init(im2col_reshaped_info);
-    _memory_group.manage(&_input_im2col_reshaped);
-
-    // Create GEMM output tensor
-    TensorShape shape_gemm = _input_im2col_reshaped.info()->tensor_shape();
-    shape_gemm.set(0, mat_weights_cols);
-    shape_gemm.set(1, mat_input_rows);
-    const DataType gemm_data_type = dt;
-
-    // FIXME: input->clone() doesn't work with subtensors for grouped convolutions.
-    TensorInfo info_gemm(shape_gemm, 1, gemm_data_type);
-    _gemm_output.allocator()->init(info_gemm);
-    _memory_group.manage(&_gemm_output);
-
-    if(dt == DataType::F16)
-    {
-        BorderSize border_size = BorderSize(conv_info.pad_top(), conv_info.pad_right(), conv_info.pad_bottom(), conv_info.pad_left());
-        input->info()->extend_padding(border_size);
-        _fill_border.configure(input, border_size, BorderMode::CONSTANT, PixelValue()); // for PAD of im2col fp16: consider it as border
-    }
-
-    // Configure im2col
-    _input_im2col_kernel.configure(input, &_input_im2col_reshaped, Size2D(kernel_width, kernel_height), conv_info, append_bias, dilation);
-
-    // Configure GEMM
-    configure_mm(&_input_im2col_reshaped, weights, &_gemm_output);
-
-    _input_im2col_reshaped.allocator()->allocate();
-
-    // Configure Col2Im
-    _output_col2im_kernel.configure(&_gemm_output, output, std::make_pair(conv_w, conv_h));
-    _gemm_output.allocator()->allocate();
-
-    ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(0) != conv_w) || (output->info()->dimension(1) != conv_h), "Output shape does not match the expected one");
-
-    // Configure Activation Layer
-    _is_activationlayer_enabled = act_info.enabled();
-
-    if(_is_activationlayer_enabled)
-    {
-        _activationlayer_function.configure(output, nullptr, act_info);
-    }
-
-    ARM_COMPUTE_UNUSED(weights_info);
-}
-
-void GCConvolutionLayer::run()
-{
-    prepare();
-
-    MemoryGroupResourceScope scope_mg(_memory_group);
-
-    // Run im2col
-    GCScheduler::get().dispatch(_fill_border);
-    GCScheduler::get().memory_barrier();
-    GCScheduler::get().dispatch(_input_im2col_kernel);
-
-    // Run gemm on reshaped matrices
-    _mm_gemm.run();
-    GCScheduler::get().memory_barrier();
-
-    // Reshape output matrix
-    GCScheduler::get().dispatch(_output_col2im_kernel, false);
-    GCScheduler::get().memory_barrier();
-
-    // Run Activation Layer
-    if(_is_activationlayer_enabled)
-    {
-        _activationlayer_function.run();
-    }
-}
-
-void GCConvolutionLayer::prepare()
-{
-    if(!_is_prepared)
-    {
-        ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
-
-        // Run weights reshaping and mark as unused
-        _weights_reshaped.allocator()->allocate();
-        _reshape_weights.run();
-
-        // Mark original weights tensor as unused
-        _original_weights->mark_as_unused();
-
-        _is_prepared = true;
-    }
-}
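The deleted GCConvolutionLayer lowers convolution to a matrix multiply: im2col linearizes each receptive field into one row, the reshaped weights form the other operand, and col2im folds the GEMM result back into a feature map. The shape bookkeeping used in configure() can be checked with a short sketch; `conv_out_dim` mirrors what the library's `scaled_dimensions` computes for the floor-rounding case, and the concrete numbers are only an example:

#include <cstdio>

// Output extent of a convolution along one axis, floor rounding.
unsigned conv_out_dim(unsigned in, unsigned kernel, unsigned stride, unsigned pad, unsigned dilation = 1)
{
    const unsigned eff_kernel = (kernel - 1) * dilation + 1; // dilated kernel extent
    return (in + 2 * pad - eff_kernel) / stride + 1;
}

int main()
{
    // Example: 224x224x3 input, 64 filters of 7x7x3, stride 2, pad 3, with bias.
    const unsigned conv_w = conv_out_dim(224, 7, 2, 3); // 112
    const unsigned conv_h = conv_out_dim(224, 7, 2, 3); // 112

    // The same quantities as in GCConvolutionLayer::configure():
    const unsigned mat_weights_rows = 7 * 7 * 3 + 1; // one extra row appends the bias
    const unsigned mat_weights_cols = 64;            // one column per output channel
    const unsigned mat_input_cols   = mat_weights_rows;
    const unsigned mat_input_rows   = conv_w * conv_h;

    // GEMM: [mat_input_rows x mat_input_cols] * [mat_input_cols x mat_weights_cols]
    std::printf("im2col: %ux%u, gemm out: %ux%u\n", mat_input_rows, mat_input_cols, mat_input_rows, mat_weights_cols);
    return 0;
}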
diff --git a/src/runtime/GLES_COMPUTE/functions/GCDepthwiseConvolutionLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCDepthwiseConvolutionLayer.cpp
deleted file mode 100644
index 9a9f30d4be..0000000000
--- a/src/runtime/GLES_COMPUTE/functions/GCDepthwiseConvolutionLayer.cpp
+++ /dev/null
@@ -1,74 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/GLES_COMPUTE/functions/GCDepthwiseConvolutionLayer.h"
-
-#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
-#include "arm_compute/core/PixelValue.h"
-#include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h"
-#include "support/MemorySupport.h"
-
-using namespace arm_compute;
-
-GCDepthwiseConvolutionLayer3x3::GCDepthwiseConvolutionLayer3x3()
-    : _kernel(nullptr), _border_handler(), _shift_handler(), _activationlayer_function(), _is_activationlayer_enabled(false)
-{
-}
-
-void GCDepthwiseConvolutionLayer3x3::configure(IGCTensor *input, const IGCTensor *weights, const IGCTensor *biases, IGCTensor *output, const PadStrideInfo &conv_info,
-                                               unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation)
-{
-    ARM_COMPUTE_ERROR_ON(dilation.x() != 1 || dilation.y() != 1);
-    ARM_COMPUTE_UNUSED(dilation);
-    auto k = arm_compute::support::cpp14::make_unique<GCDepthwiseConvolutionLayer3x3Kernel>();
-    k->configure(input, weights, biases, output, conv_info, depth_multiplier);
-    _kernel = std::move(k);
-
-    // Configure border handler
-    _border_handler.configure(input, _kernel->border_size(), BorderMode::CONSTANT, PixelValue());
-
-    _shift_handler.configure(input);
-
-    // Configure Activation Layer
-    _is_activationlayer_enabled = act_info.enabled();
-
-    if(_is_activationlayer_enabled)
-    {
-        _activationlayer_function.configure(output, nullptr, act_info);
-    }
-}
-
-void GCDepthwiseConvolutionLayer3x3::run()
-{
-    GCScheduler::get().dispatch(_shift_handler, false);
-    GCScheduler::get().memory_barrier();
-    GCScheduler::get().dispatch(_border_handler, false);
-    GCScheduler::get().memory_barrier();
-    GCScheduler::get().dispatch(*_kernel);
-
-    // Run Activation Layer
-    if(_is_activationlayer_enabled)
-    {
-        _activationlayer_function.run();
-    }
-}
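Depthwise convolution, as implemented by the deleted 3x3 function, applies one filter per input channel (times the depth multiplier) instead of summing across channels. A scalar reference of the 3x3, stride-1, depth-multiplier-1 case, assuming one pixel of zero padding (a sketch with illustrative names, not library code):

#include <vector>

// in/out: one dense w*h plane per channel; weights: 9 taps per channel.
// One pixel of zero padding keeps the output the same size as the input.
void depthwise3x3(const std::vector<std::vector<float>> &in, std::vector<std::vector<float>> &out,
                  const std::vector<std::vector<float>> &weights, int w, int h)
{
    out.assign(in.size(), std::vector<float>(static_cast<size_t>(w) * h, 0.f));
    for(size_t c = 0; c < in.size(); ++c) // each channel is filtered independently
    {
        for(int y = 0; y < h; ++y)
        {
            for(int x = 0; x < w; ++x)
            {
                float acc = 0.f;
                for(int ky = -1; ky <= 1; ++ky)
                {
                    for(int kx = -1; kx <= 1; ++kx)
                    {
                        const int sx = x + kx, sy = y + ky;
                        if(sx >= 0 && sx < w && sy >= 0 && sy < h) // zero border
                        {
                            acc += in[c][sy * w + sx] * weights[c][(ky + 1) * 3 + (kx + 1)];
                        }
                    }
                }
                out[c][y * w + x] = acc;
            }
        }
    }
}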
diff --git a/src/runtime/GLES_COMPUTE/functions/GCDirectConvolutionLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCDirectConvolutionLayer.cpp
deleted file mode 100644
index 3bc3398cb5..0000000000
--- a/src/runtime/GLES_COMPUTE/functions/GCDirectConvolutionLayer.cpp
+++ /dev/null
@@ -1,85 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/GLES_COMPUTE/functions/GCDirectConvolutionLayer.h"
-
-#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/PixelValue.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h"
-#include "support/MemorySupport.h"
-
-using namespace arm_compute;
-
-GCDirectConvolutionLayer::GCDirectConvolutionLayer()
-    : _kernel(nullptr), _border_handler(), _shift_handler()
-{
-}
-
-void GCDirectConvolutionLayer::configure(IGCTensor *input, const IGCTensor *weights, const IGCTensor *biases, IGCTensor *output, const PadStrideInfo &conv_info,
-                                         const ActivationLayerInfo &act_info)
-{
-    int kernel_size = weights->info()->dimension(0);
-
-    if(kernel_size == 1)
-    {
-        auto k = arm_compute::support::cpp14::make_unique<GCDirectConvolutionLayer1x1Kernel>();
-        k->configure(input, weights, biases, output, conv_info, act_info);
-        _kernel = std::move(k);
-    }
-    else if(kernel_size == 3)
-    {
-        auto k = arm_compute::support::cpp14::make_unique<GCDirectConvolutionLayer3x3Kernel>();
-        k->configure(input, weights, biases, output, conv_info, act_info);
-        _kernel = std::move(k);
-    }
-    else if(kernel_size == 5)
-    {
-        auto k = arm_compute::support::cpp14::make_unique<GCDirectConvolutionLayer5x5Kernel>();
-        k->configure(input, weights, biases, output, conv_info, act_info);
-        _kernel = std::move(k);
-    }
-    else
-    {
-        ARM_COMPUTE_ERROR("kernel size unsupported!");
-        return;
-    }
-
-    _border_handler.configure(input, _kernel->border_size(), BorderMode::CONSTANT, PixelValue());
-
-    _shift_handler.configure(input);
-}
-
-void GCDirectConvolutionLayer::run()
-{
-    GCScheduler::get().dispatch(_shift_handler, false);
-    GCScheduler::get().memory_barrier();
-    GCScheduler::get().dispatch(_border_handler, false);
-    GCScheduler::get().memory_barrier();
-    GCScheduler::get().dispatch(*_kernel);
-    GCScheduler::get().memory_barrier();
-    GCScheduler::get().dispatch(_shift_handler);
-}
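GCDirectConvolutionLayer selects a specialized kernel per filter size rather than using one generic kernel, trading flexibility for shaders tuned to a fixed tap count. The same selection can be expressed as a factory table, which keeps the supported sizes in one place; this is a sketch of the pattern only, with stand-in kernel types:

#include <functional>
#include <map>
#include <memory>
#include <stdexcept>

struct IKernel { virtual ~IKernel() = default; };
struct Conv1x1 : IKernel {};
struct Conv3x3 : IKernel {};
struct Conv5x5 : IKernel {};

std::unique_ptr<IKernel> make_direct_conv_kernel(int kernel_size)
{
    // One factory per supported tap count, mirroring the if/else chain above.
    static const std::map<int, std::function<std::unique_ptr<IKernel>()>> factories = {
        {1, [] { return std::make_unique<Conv1x1>(); }},
        {3, [] { return std::make_unique<Conv3x3>(); }},
        {5, [] { return std::make_unique<Conv5x5>(); }},
    };
    const auto it = factories.find(kernel_size);
    if(it == factories.end())
    {
        throw std::runtime_error("kernel size unsupported!"); // same contract as ARM_COMPUTE_ERROR
    }
    return it->second();
}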
diff --git a/src/runtime/GLES_COMPUTE/functions/GCDropoutLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCDropoutLayer.cpp
deleted file mode 100644
index 6407464e48..0000000000
--- a/src/runtime/GLES_COMPUTE/functions/GCDropoutLayer.cpp
+++ /dev/null
@@ -1,50 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/runtime/GLES_COMPUTE/functions/GCDropoutLayer.h"
-
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h"
-#include "arm_compute/runtime/GLES_COMPUTE/GCTensor.h"
-
-using namespace arm_compute;
-
-GCDropoutLayer::GCDropoutLayer()
-    : _dropout_kernel()
-{
-}
-
-void GCDropoutLayer::configure(const IGCTensor *input, IGCTensor *mask, IGCTensor *output, float ratio, bool forward)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, mask, output);
-
-    // Configure kernel
-    _dropout_kernel.configure(input, mask, output, ratio, forward);
-}
-
-void GCDropoutLayer::run()
-{
-    GCScheduler::get().dispatch(_dropout_kernel);
-}
diff --git a/src/runtime/GLES_COMPUTE/functions/GCFillBorder.cpp b/src/runtime/GLES_COMPUTE/functions/GCFillBorder.cpp
deleted file mode 100644
index d1d9874449..0000000000
--- a/src/runtime/GLES_COMPUTE/functions/GCFillBorder.cpp
+++ /dev/null
@@ -1,40 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/runtime/GLES_COMPUTE/functions/GCFillBorder.h"
-
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCFillBorderKernel.h"
-#include "arm_compute/core/Helpers.h"
-#include "support/MemorySupport.h"
-
-#include <utility>
-
-using namespace arm_compute;
-
-void GCFillBorder::configure(IGCTensor *tensor, unsigned int border_width, BorderMode border_mode, const PixelValue &constant_border_value)
-{
-    auto k = arm_compute::support::cpp14::make_unique<GCFillBorderKernel>();
-    k->configure(tensor, BorderSize(border_width), border_mode, constant_border_value);
-    _kernel = std::move(k);
-}
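GCFillBorder shows the pattern shared by most of these thin wrappers (and by GCScale, GCTranspose and others below): configure() instantiates a concrete kernel, configures it, and moves it into a `_kernel` pointer owned by a common simple-function base whose run() just dispatches it. The shape of that pattern, reduced to its essentials (a sketch; the interface names here are illustrative, not the library's):

#include <memory>
#include <utility>

struct IGCKernel
{
    virtual ~IGCKernel() = default;
    virtual void run() = 0;
};

// Base class owning a single type-erased kernel; subclasses differ only
// in which kernel configure() creates.
class SimpleFunction
{
public:
    void run() { _kernel->run(); } // dispatch the configured kernel
protected:
    std::unique_ptr<IGCKernel> _kernel;
};

struct FillBorderKernel : IGCKernel
{
    void configure(int border_width) { _width = border_width; }
    void run() override { /* fill the border region */ }
    int _width = 0;
};

class FillBorder : public SimpleFunction
{
public:
    void configure(int border_width)
    {
        auto k = std::make_unique<FillBorderKernel>(); // create the concrete kernel
        k->configure(border_width);                    // configure it
        _kernel = std::move(k);                        // hand ownership to the base
    }
};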
diff --git a/src/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.cpp
deleted file mode 100644
index d391eddf84..0000000000
--- a/src/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.cpp
+++ /dev/null
@@ -1,194 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.h"
-
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h"
-#include "support/MemorySupport.h"
-
-#include <algorithm>
-
-using namespace arm_compute;
-
-void GCFullyConnectedLayerReshapeWeights::configure(const IGCTensor *input, IGCTensor *output)
-{
-    auto k = arm_compute::support::cpp14::make_unique<GCTransposeKernel>();
-    k->configure(input, output);
-    _kernel = std::move(k);
-}
-
-GCFullyConnectedLayer::GCFullyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager *weights_manager)
-    : _memory_group(std::move(memory_manager)), _weights_manager(std::move(weights_manager)), _im2col_kernel(), _reshape_weights_kernel(), _mm_kernel(), _accumulate_biases_kernel(), _im2col_output(),
-      _reshape_weights_output(), _original_weights(nullptr), _are_weights_reshaped(true), _is_fc_after_conv(true), _accumulate_biases(false)
-{
-}
-
-void GCFullyConnectedLayer::configure_conv_fc(const IGCTensor *input, const IGCTensor *weights, IGCTensor *output)
-{
-    ARM_COMPUTE_ERROR_ON((weights->info()->dimension(1) != (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2))));
-
-    const DataType dt = input->info()->data_type();
-
-    // If the fully connected layer is called after a convolution layer, the input tensor must be linearized
-
-    // Initialize output tensor for im2col
-    TensorShape shape_im2col;
-    shape_im2col.set(0, input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2));
-    shape_im2col.set(1, input->info()->dimension(3));
-    shape_im2col.set(2, input->info()->dimension(4));
-    shape_im2col.set(3, input->info()->dimension(5));
-    _im2col_output.allocator()->init(TensorInfo(shape_im2col, 1, dt));
-
-    // Configure im2col kernel
-    _memory_group.manage(&_im2col_output);
-    _im2col_kernel.configure(input, &_im2col_output, Size2D(1, 1), PadStrideInfo(1, 1, 0, 0), false);
-
-    // Configure matrix multiply kernel
-    _mm_kernel.configure(&_im2col_output, weights, output, 1.0f, false);
-
-    // Allocate the output tensor for im2col once all the configure methods have been called
-    _im2col_output.allocator()->allocate();
-}
-
-void GCFullyConnectedLayer::configure_fc_fc(const IGCTensor *input, const IGCTensor *weights, IGCTensor *output)
-{
-    ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != weights->info()->dimension(1));
-
-    // Configure matrix multiply kernel
-    _mm_kernel.configure(input, weights, output, 1.0f, false);
-}
-
-void GCFullyConnectedLayer::configure(const IGCTensor *input, const IGCTensor *weights, const IGCTensor *biases, IGCTensor *output,
-                                      FullyConnectedLayerInfo fc_info)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
-    ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() > 2);
-
-    _original_weights     = weights;
-    _are_weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true;
-    _is_fc_after_conv     = true;
-    _accumulate_biases    = false;
-
-    if(biases != nullptr)
-    {
-        ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
-
-        _accumulate_biases = true;
-
-        // Configure accumulate biases kernel
-        _accumulate_biases_kernel.configure(output, biases);
-    }
-
-    // With the Fully Connected layer we can have 4 different cases:
-    //  1) Convolution layer -> Fully Connected layer without batches
-    //  2) Fully Connected layer -> Fully Connected layer without batches
-    //  3) Convolution layer -> Fully Connected layer with batches
-    //  4) Fully Connected layer -> Fully Connected layer with batches
-
-    const IGCTensor *weights_to_use = weights;
-
-    if(!_are_weights_reshaped)
-    {
-        weights_to_use = &_reshape_weights_output;
-
-        // Reshape the weights
-        _reshape_weights_kernel.configure(weights, &_reshape_weights_output);
-    }
-
-    // Check if we have a fully connected layer with batches
-    const bool is_batched_fc_layer = output->info()->dimension(1) > 1;
-
-    if(is_batched_fc_layer)
-    {
-        _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && (std::equal(input->info()->tensor_shape().cbegin() + 3,
-                                                                                  input->info()->tensor_shape().cend(),
-                                                                                  output->info()->tensor_shape().cbegin() + 1));
-    }
-    else
-    {
-        _is_fc_after_conv = input->info()->num_dimensions() > 1;
-    }
-
-    if(_is_fc_after_conv)
-    {
-        // Fully Connected layer after a Convolution Layer without batches
-        configure_conv_fc(input, weights_to_use, output);
-    }
-    else
-    {
-        // Fully Connected layer after a Fully Connected Layer without batches
-        configure_fc_fc(input, weights_to_use, output);
-    }
-
-    ARM_COMPUTE_ERROR_ON(fc_info.retain_internal_weights && _reshape_weights_output.gc_buffer() == 0);
-    _are_weights_reshaped = _are_weights_reshaped || fc_info.retain_internal_weights;
-}
-
-void GCFullyConnectedLayer::run()
-{
-    prepare();
-
-    MemoryGroupResourceScope scope_mg(_memory_group);
-
-    // Linearize input if it comes from a convolutional layer
-    if(_is_fc_after_conv)
-    {
-        GCScheduler::get().dispatch(_im2col_kernel, false);
-    }
-
-    if(!_are_weights_reshaped || _is_fc_after_conv)
-    {
-        GCScheduler::get().memory_barrier();
-    }
-
-    // Run matrix multiply
-    GCScheduler::get().dispatch(_mm_kernel, !_accumulate_biases);
-
-    // Accumulate biases if provided
-    if(_accumulate_biases)
-    {
-        GCScheduler::get().memory_barrier();
-
-        GCScheduler::get().dispatch(_accumulate_biases_kernel);
-    }
-}
-
-void GCFullyConnectedLayer::prepare()
-{
-    // Reshape of the weights (happens only once)
-    if(!_are_weights_reshaped)
-    {
-        ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
-
-        // Run reshape weights kernel and mark weights as unused
-        _reshape_weights_output.allocator()->allocate();
-        _reshape_weights_kernel.run();
-
-        // Mark original weights tensor as unused
-        _original_weights->mark_as_unused();
-
-        _are_weights_reshaped = true;
-    }
-}
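The four cases enumerated in GCFullyConnectedLayer::configure() reduce to one question: do the trailing dimensions of the input line up with the batch dimensions of the output? When they do not, the input is still a spatial feature map and must be flattened (im2col) before the matrix multiply. The decision, isolated over plain shape vectors (a sketch; shapes are lowest-dimension-first as in arm_compute::TensorShape):

#include <algorithm>
#include <cstddef>
#include <vector>

bool is_fc_after_conv(const std::vector<std::size_t> &in_shape, const std::vector<std::size_t> &out_shape)
{
    const bool is_batched = out_shape.size() > 1 && out_shape[1] > 1;
    if(is_batched)
    {
        // Batched: input dims from index 3 on must match output dims from index 1 on.
        if(in_shape.size() < 4 || in_shape.size() - 3 > out_shape.size() - 1)
        {
            return false;
        }
        return std::equal(in_shape.cbegin() + 3, in_shape.cend(), out_shape.cbegin() + 1);
    }
    // Unbatched: any multi-dimensional input must have come from a convolution.
    return in_shape.size() > 1;
}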
diff --git a/src/runtime/GLES_COMPUTE/functions/GCGEMM.cpp b/src/runtime/GLES_COMPUTE/functions/GCGEMM.cpp
deleted file mode 100644
index ddfe590ee1..0000000000
--- a/src/runtime/GLES_COMPUTE/functions/GCGEMM.cpp
+++ /dev/null
@@ -1,211 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/GLES_COMPUTE/functions/GCGEMM.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/GLES_COMPUTE/GCHelpers.h"
-#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCGEMMInterleave4x4Kernel.h"
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCGEMMMatrixAdditionKernel.h"
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCGEMMMatrixMultiplyKernel.h"
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCGEMMTranspose1xWKernel.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h"
-#include "arm_compute/runtime/ITensorAllocator.h"
-
-using namespace arm_compute;
-
-namespace
-{
-Status validate_arguments(const ITensorInfo *a, const ITensorInfo *b, const IGCTensor *c, const ITensorInfo *output, const float alpha, const float beta, const GEMMInfo &gemm_info = GEMMInfo())
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
-
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::F16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(a, b, output);
-    ARM_COMPUTE_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
-    ARM_COMPUTE_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");
-
-    if(c != nullptr)
-    {
-        ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(a, c->info());
-        ARM_COMPUTE_ERROR_ON_MSG(a->dimension(1) != c->info()->dimension(1), "The C matrix must have the same number of rows as the matrix A");
-        ARM_COMPUTE_ERROR_ON_MSG(b->dimension(0) != c->info()->dimension(0), "The C matrix must have the same number of columns as the matrix B");
-    }
-
-    if(output->total_size() != 0)
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(b->dimension(0) != output->dimension(0), "The output matrix must have the same number of columns as the matrix B");
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->dimension(1) != output->dimension(1), "The output matrix must have the same number of rows as the matrix A");
-    }
-
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->dimension(0) != b->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
-
-    ARM_COMPUTE_UNUSED(alpha);
-    ARM_COMPUTE_UNUSED(beta);
-    ARM_COMPUTE_UNUSED(gemm_info);
-    return Status{};
-}
-} // namespace
-
-GCGEMM::GCGEMM(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _ma_kernel(), _tmp_a(), _tmp_b(), _original_b(nullptr), _is_interleaved_transposed(false),
-      _run_addition(false), _reshape_b_only_on_first_run(false), _is_prepared(false)
-{
-}
-
-void GCGEMM::configure(const IGCTensor *a, const IGCTensor *b, const IGCTensor *c, IGCTensor *output, float alpha, float beta, const GEMMInfo &gemm_info)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
-
-    // Perform validation step
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(a->info(), b->info(), c, output->info(), alpha, beta, gemm_info));
-
-    // Check if we need to reshape the matrix B only on the first run
-    _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
-    _is_prepared                 = false;
-    _original_b                  = b;
-
-    const IGCTensor *matrix_a = a;
-    const IGCTensor *matrix_b = b;
-
-    // Get the GPU target
-    const GPUTarget gpu_target = GCScheduler::get().get_target();
-
-    // Set the target for the kernels
-    _interleave_kernel.set_target(gpu_target);
-    _mm_kernel.set_target(gpu_target);
-
-    // Arguments used by GEMMReshapeInfo
-    // If we pass the matrix A and matrix B reshaped to GCGEMMMatrixMultiplyKernel, we need to pass m, n, k, mult_transpose1xW_width and mult_interleave4x4_height to GEMMReshapeInfo
-    // in order to know how the matrices have been reshaped
-    const int m                         = a->info()->dimension(1);
-    const int n                         = b->info()->dimension(0);
-    const int k                         = a->info()->dimension(0);
-    int       mult_transpose1xW_width   = 1;
-    int       mult_interleave4x4_height = 1;
-
-    // If the input tensor has less than 16 rows, we run a special version of GEMM without reshaping the input tensors
-    _is_interleaved_transposed = a->info()->dimension(1) > 16;
-
-    if(_is_interleaved_transposed)
-    {
-        matrix_a = &_tmp_a;
-        matrix_b = &_tmp_b;
-
-        // Manage intermediate buffers
-        _memory_group.manage(&_tmp_a);
-        if(!_reshape_b_only_on_first_run)
-        {
-            _memory_group.manage(&_tmp_b);
-        }
-        // _tmp_a and _tmp_b will be auto configured in _interleave_kernel and in _transpose_kernel
-
-        // Configure interleave kernel
-        _interleave_kernel.configure(a, &_tmp_a);
-
-        // Configure transpose kernel
-        _transpose_kernel.configure(b, &_tmp_b);
-    }
-
-    _mm_kernel.configure(matrix_a, matrix_b, output, alpha, _is_interleaved_transposed, GEMMReshapeInfo(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height));
-
-    if(_is_interleaved_transposed)
-    {
-        // Allocate intermediate tensors
-        _tmp_a.allocator()->allocate();
-        if(!_reshape_b_only_on_first_run)
-        {
-            _tmp_b.allocator()->allocate();
-        }
-    }
-
-    // Configure matrix addition kernel
-    if(beta != 0 && c != nullptr)
-    {
-        _ma_kernel.configure(c, output, beta);
-        _run_addition = true;
-    }
-}
-
-Status GCGEMM::validate(const ITensorInfo *a, const ITensorInfo *b, const IGCTensor *c, const ITensorInfo *output, const float alpha, const float beta, const GEMMInfo &gemm_info)
-{
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(a, b, c, output, alpha, beta, gemm_info));
-    return Status{};
-}
-
-void GCGEMM::run()
-{
-    prepare();
-
-    MemoryGroupResourceScope scope_mg(_memory_group);
-
-    if(_is_interleaved_transposed)
-    {
-        // Run interleave kernel
-        GCScheduler::get().dispatch(_interleave_kernel, false);
-
-        if(!_reshape_b_only_on_first_run)
-        {
-            // Run transpose kernel
-            GCScheduler::get().dispatch(_transpose_kernel, false);
-        }
-
-        GCScheduler::get().memory_barrier();
-    }
-
-    // Run matrix multiply kernel
-    GCScheduler::get().dispatch(_mm_kernel, !_run_addition);
-
-    // Run matrix addition kernel
-    if(_run_addition)
-    {
-        GCScheduler::get().memory_barrier();
-        GCScheduler::get().dispatch(_ma_kernel);
-    }
-}
-
-void GCGEMM::prepare()
-{
-    if(!_is_prepared)
-    {
-        if(_is_interleaved_transposed && _reshape_b_only_on_first_run)
-        {
-            ARM_COMPUTE_ERROR_ON(!_original_b->is_used());
-
-            // Run transpose kernel
-            _tmp_b.allocator()->allocate();
-            GCScheduler::get().dispatch(_transpose_kernel, false);
-            GCScheduler::get().memory_barrier();
-
-            // Mark original weights tensor as unused
-            _original_b->mark_as_unused();
-        }
-
-        _is_prepared = true;
-    }
-}
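GCGEMM only reshapes its operands when A has more than 16 rows: A is interleaved in 4x4 blocks and B is transposed in 1xW blocks so the multiply kernel can read both with unit stride. As an illustration of the A-side transform, here is one way to lay out a 4x4-interleaved matrix on the CPU; this sketches the common Compute Library layout, and the exact GLES kernel layout may differ in vector width or padding:

#include <vector>

// Interleave rows of an MxN row-major matrix in groups of 4:
// output row g holds [a(4g,0) a(4g+1,0) a(4g+2,0) a(4g+3,0) a(4g,1) ...],
// so the multiply kernel streams 4 accumulator rows with one linear read.
// M is assumed to be a multiple of 4 to keep the sketch short.
std::vector<float> interleave_4x4(const std::vector<float> &a, int m, int n)
{
    std::vector<float> out(static_cast<size_t>(m) * n);
    size_t dst = 0;
    for(int g = 0; g < m / 4; ++g)       // each group of 4 source rows...
    {
        for(int col = 0; col < n; ++col) // ...walks the columns left to right
        {
            for(int r = 0; r < 4; ++r)   // ...emitting 4 vertically adjacent values
            {
                out[dst++] = a[static_cast<size_t>(4 * g + r) * n + col];
            }
        }
    }
    return out;
}

The same prepare()-once pattern as the convolution and fully connected layers applies to B: when reshape_b_only_on_first_run is set, the transposed copy is produced a single time and the original B is marked unused.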
diff --git a/src/runtime/GLES_COMPUTE/functions/GCGEMMInterleave4x4.cpp b/src/runtime/GLES_COMPUTE/functions/GCGEMMInterleave4x4.cpp
deleted file mode 100644
index cc37bf4b4d..0000000000
--- a/src/runtime/GLES_COMPUTE/functions/GCGEMMInterleave4x4.cpp
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/GLES_COMPUTE/functions/GCGEMMInterleave4x4.h"
-
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCGEMMInterleave4x4Kernel.h"
-#include "support/MemorySupport.h"
-
-using namespace arm_compute;
-
-void GCGEMMInterleave4x4::configure(const IGCTensor *input, IGCTensor *output)
-{
-    auto k = arm_compute::support::cpp14::make_unique<GCGEMMInterleave4x4Kernel>();
-    k->configure(input, output);
-    _kernel = std::move(k);
-}
diff --git a/src/runtime/GLES_COMPUTE/functions/GCGEMMTranspose1xW.cpp b/src/runtime/GLES_COMPUTE/functions/GCGEMMTranspose1xW.cpp
deleted file mode 100644
index af933fa578..0000000000
--- a/src/runtime/GLES_COMPUTE/functions/GCGEMMTranspose1xW.cpp
+++ /dev/null
@@ -1,38 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/GLES_COMPUTE/functions/GCGEMMTranspose1xW.h"
-
-#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCGEMMTranspose1xWKernel.h"
-#include "arm_compute/core/Types.h"
-#include "support/MemorySupport.h"
-
-using namespace arm_compute;
-
-void GCGEMMTranspose1xW::configure(const IGCTensor *input, IGCTensor *output)
-{
-    auto k = arm_compute::support::cpp14::make_unique<GCGEMMTranspose1xWKernel>();
-    k->configure(input, output);
-    _kernel = std::move(k);
-}
diff --git a/src/runtime/GLES_COMPUTE/functions/GCNormalizationLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCNormalizationLayer.cpp
deleted file mode 100644
index 8f602792a8..0000000000
--- a/src/runtime/GLES_COMPUTE/functions/GCNormalizationLayer.cpp
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/runtime/GLES_COMPUTE/functions/GCNormalizationLayer.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/PixelValue.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h"
-
-using namespace arm_compute;
-
-GCNormalizationLayer::GCNormalizationLayer(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _squared_input(), _norm_kernel(), _multiply_kernel(), _border_handler()
-{
-}
-
-void GCNormalizationLayer::configure(const IGCTensor *input, IGCTensor *output, const NormalizationLayerInfo &norm_info)
-{
-    ARM_COMPUTE_ERROR_ON(input == nullptr);
-
-    _squared_input.allocator()->init(TensorInfo(input->info()->tensor_shape(), 1, input->info()->data_type()));
-    _memory_group.manage(&_squared_input);
-
-    _norm_kernel.configure(input, &_squared_input, output, norm_info);
-    _multiply_kernel.configure(input, input, &_squared_input, 1.0f);
-    // Fill the border by 3 elements since we need vload4 in the IN_MAP normalization kernel
-    _border_handler.configure(&_squared_input, _norm_kernel.border_size(), BorderMode::CONSTANT, PixelValue());
-
-    // Allocate intermediate buffers
-    _squared_input.allocator()->allocate();
-}
-
-void GCNormalizationLayer::run()
-{
-    MemoryGroupResourceScope scope_mg(_memory_group);
-
-    GCScheduler::get().dispatch(_multiply_kernel, false);
-    GCScheduler::get().memory_barrier();
-    GCScheduler::get().dispatch(_border_handler, false);
-    GCScheduler::get().memory_barrier();
-    GCScheduler::get().dispatch(_norm_kernel, true);
-}
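GCNormalizationLayer squares the input with the pixel-wise multiply kernel first because local response normalization divides each activation by a power of a local sum of squares. In the usual formulation (NormalizationLayerInfo carries the kappa, alpha, beta and window-size parameters; the division of alpha by the window size shown here is the conventional scaling and may be folded differently inside the shader):

$$ y_i \;=\; \frac{x_i}{\left(\kappa + \frac{\alpha}{n}\sum_{j \in N(i)} x_j^{2}\right)^{\beta}} $$

where $N(i)$ is the $n$-element window around position $i$: a spatial neighbourhood for IN_MAP normalization (the variant the border comment above refers to) or the neighbouring feature maps for CROSS_MAP.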
diff --git a/src/runtime/GLES_COMPUTE/functions/GCNormalizePlanarYUVLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCNormalizePlanarYUVLayer.cpp
deleted file mode 100755
index 19fdc3d7c0..0000000000
--- a/src/runtime/GLES_COMPUTE/functions/GCNormalizePlanarYUVLayer.cpp
+++ /dev/null
@@ -1,54 +0,0 @@
-/*
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/runtime/GLES_COMPUTE/functions/GCNormalizePlanarYUVLayer.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h"
-
-using namespace arm_compute;
-
-GCNormalizePlanarYUVLayer::GCNormalizePlanarYUVLayer()
-    : _norm_kernel()
-{
-}
-
-void GCNormalizePlanarYUVLayer::configure(const IGCTensor *input, IGCTensor *output, const IGCTensor *mean, const IGCTensor *std)
-{
-    _norm_kernel.configure(input, output, mean, std);
-}
-
-Status GCNormalizePlanarYUVLayer::validate(const ITensorInfo *input, const ITensorInfo *output,
-                                           const ITensorInfo *mean, const ITensorInfo *std)
-{
-    return GCNormalizePlanarYUVLayerKernel::validate(input, output, mean, std);
-}
-
-void GCNormalizePlanarYUVLayer::run()
-{
-    GCScheduler::get().dispatch(_norm_kernel, true);
-}
diff --git a/src/runtime/GLES_COMPUTE/functions/GCPixelWiseMultiplication.cpp b/src/runtime/GLES_COMPUTE/functions/GCPixelWiseMultiplication.cpp
deleted file mode 100755
index 1075f0b5be..0000000000
--- a/src/runtime/GLES_COMPUTE/functions/GCPixelWiseMultiplication.cpp
+++ /dev/null
@@ -1,39 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/GLES_COMPUTE/functions/GCPixelWiseMultiplication.h"
-
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCPixelWiseMultiplicationKernel.h"
-#include "support/MemorySupport.h"
-
-#include <utility>
-
-using namespace arm_compute;
-
-void GCPixelWiseMultiplication::configure(const IGCTensor *input1, const IGCTensor *input2, IGCTensor *output, float scale, const ActivationLayerInfo &act_info)
-{
-    ARM_COMPUTE_UNUSED(act_info);
-    auto k = arm_compute::support::cpp14::make_unique<GCPixelWiseMultiplicationKernel>();
-    k->configure(input1, input2, output, scale);
-    _kernel = std::move(k);
-}
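For reference, the GCNormalizePlanarYUVLayer deleted just above is a per-channel affine normalization: each value is shifted by its channel mean and divided by its channel standard deviation. A scalar sketch of that semantic:

#include <cstddef>
#include <vector>

// One mean/std pair per channel; planes hold w*h values each.
void normalize_planar_yuv(std::vector<std::vector<float>> &planes,
                          const std::vector<float> &mean, const std::vector<float> &stddev)
{
    for(std::size_t c = 0; c < planes.size(); ++c)
    {
        for(float &v : planes[c])
        {
            v = (v - mean[c]) / stddev[c]; // out = (in - mean) / std, channel-wise
        }
    }
}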
diff --git a/src/runtime/GLES_COMPUTE/functions/GCPoolingLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCPoolingLayer.cpp
deleted file mode 100644
index accf60e204..0000000000
--- a/src/runtime/GLES_COMPUTE/functions/GCPoolingLayer.cpp
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/GLES_COMPUTE/functions/GCPoolingLayer.h"
-
-#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCPoolingLayerKernel.h"
-#include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h"
-
-#include "support/MemorySupport.h"
-
-namespace arm_compute
-{
-GCPoolingLayer::GCPoolingLayer()
-    : _kernel(nullptr), _border_handler(), _shift_handler()
-{
-}
-
-void GCPoolingLayer::configure(IGCTensor *input, IGCTensor *output, const PoolingLayerInfo &pool_info, IGCTensor *indices)
-{
-    // Configure pooling kernel
-    auto k = arm_compute::support::cpp14::make_unique<GCPoolingLayerKernel>();
-    k->configure(input, output, pool_info, indices);
-    _kernel = std::move(k);
-
-    // Configure border depending on operation required
-    BorderMode border_mode = (PoolingType::MAX == pool_info.pool_type) ? BorderMode::REPLICATE : BorderMode::CONSTANT;
-    _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(0.0f));
-
-    _shift_handler.configure(input);
-}
-
-Status GCPoolingLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, const ITensorInfo *indices)
-{
-    return GCPoolingLayerKernel::validate(input, output, pool_info, indices);
-}
-
-void GCPoolingLayer::run()
-{
-    GCScheduler::get().dispatch(_shift_handler, false);
-    GCScheduler::get().memory_barrier();
-    GCScheduler::get().dispatch(_border_handler, false);
-    GCScheduler::get().memory_barrier();
-    GCScheduler::get().dispatch(*_kernel);
-}
-} // namespace arm_compute
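The border mode chosen in GCPoolingLayer::configure() matters for correctness, not just padding: max pooling with a constant border of 0 would let the padding win whenever every real value under the window is negative, so the edge values are replicated instead, while average pooling tolerates a constant border. A tiny sketch of the difference on one window hanging over an edge:

#include <algorithm>
#include <cstdio>

int main()
{
    // A 1-D window of 3 hanging over the left edge of {-5, -2, ...}.
    const float pad_constant = 0.f;

    // CONSTANT border: the fabricated 0 beats every real (negative) value.
    const float max_constant = std::max({pad_constant, -5.f, -2.f}); // 0, wrong for max pooling

    // REPLICATE border: the edge value -5 is repeated, so the max stays real.
    const float max_replicate = std::max({-5.f, -5.f, -2.f}); // -2

    std::printf("constant: %.0f, replicate: %.0f\n", max_constant, max_replicate);
    return 0;
}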
diff --git a/src/runtime/GLES_COMPUTE/functions/GCScale.cpp b/src/runtime/GLES_COMPUTE/functions/GCScale.cpp
deleted file mode 100644
index f245c3ecd0..0000000000
--- a/src/runtime/GLES_COMPUTE/functions/GCScale.cpp
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/GLES_COMPUTE/functions/GCScale.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCScaleKernel.h"
-#include "arm_compute/core/Validate.h"
-#include "support/MemorySupport.h"
-
-using namespace arm_compute;
-
-void GCScale::configure(IGCTensor *input, IGCTensor *output, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, SamplingPolicy sampling_policy, bool use_padding,
-                        bool align_corners)
-{
-    ARM_COMPUTE_UNUSED(use_padding, align_corners);
-    auto k = arm_compute::support::cpp14::make_unique<GCScaleKernel>();
-    k->configure(input, output, policy, border_mode == BorderMode::UNDEFINED, sampling_policy);
-    _kernel = std::move(k);
-    _border_handler.configure(input, _kernel->border_size(), border_mode, constant_border_value);
-}
diff --git a/src/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.cpp
deleted file mode 100644
index 0645ae7f8f..0000000000
--- a/src/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.cpp
+++ /dev/null
@@ -1,79 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.h"
-
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCSoftmaxLayerKernel.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h"
-
-using namespace arm_compute;
-
-GCSoftmaxLayer::GCSoftmaxLayer(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _max_kernel(), _shift_exp_sum_kernel(), _norm_kernel(), _max(), _sum(), _tmp()
-{
-}
-
-void GCSoftmaxLayer::configure(const IGCTensor *input, IGCTensor *output, float beta, size_t axis)
-{
-    ARM_COMPUTE_UNUSED(beta, axis);
-
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON(beta != 1.0f);
-    ARM_COMPUTE_ERROR_ON_MSG(axis != 1, "Axis must be 1 for GLES");
-
-    // Create intermediate tensors shapes
-    _tmp.allocator()->init(TensorInfo(input->info()->tensor_shape(), input->info()->num_channels(), input->info()->data_type()));
-
-    TensorShape shape = input->info()->tensor_shape();
-    shape.set(0, 1);
-    TensorInfo tensor_info_max_sum(shape, input->info()->num_channels(), input->info()->data_type());
-    _max.allocator()->init(tensor_info_max_sum);
-    _sum.allocator()->init(tensor_info_max_sum);
-
-    // Manage intermediate buffers
-    _memory_group.manage(&_tmp);
-    _memory_group.manage(&_max);
-    _memory_group.manage(&_sum);
-
-    // Configure Kernels
-    _max_kernel.configure(input, &_max);
-    _shift_exp_sum_kernel.configure(input, &_max, &_tmp, &_sum);
-    _norm_kernel.configure(&_tmp, &_sum, output);
-
-    // Allocate intermediate buffers
-    _tmp.allocator()->allocate();
-    _max.allocator()->allocate();
-    _sum.allocator()->allocate();
-}
-
-void GCSoftmaxLayer::run()
-{
-    MemoryGroupResourceScope scope_mg(_memory_group);
-
-    GCScheduler::get().dispatch(_max_kernel, false);
-    GCScheduler::get().memory_barrier();
-    GCScheduler::get().dispatch(_shift_exp_sum_kernel, false);
-    GCScheduler::get().memory_barrier();
-    GCScheduler::get().dispatch(_norm_kernel);
-}
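The three kernels GCSoftmaxLayer dispatches implement the standard numerically stable softmax: a row maximum is computed first so it can be subtracted before exponentiation (keeping exp() from overflowing), the shifted exponentials are summed, and a final pass normalizes. The same three passes on the CPU (a reference sketch of the algorithm, not the shader code):

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

std::vector<float> softmax(const std::vector<float> &x)
{
    // Pass 1: row maximum (the _max_kernel pass above).
    const float m = *std::max_element(x.begin(), x.end());

    // Pass 2: shift, exponentiate, accumulate the sum (_shift_exp_sum_kernel).
    std::vector<float> y(x.size());
    float sum = 0.f;
    for(std::size_t i = 0; i < x.size(); ++i)
    {
        y[i] = std::exp(x[i] - m); // subtracting m keeps exp() in range
        sum += y[i];
    }

    // Pass 3: normalize (_norm_kernel).
    for(float &v : y)
    {
        v /= sum;
    }
    return y;
}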
- */ -#include "arm_compute/runtime/GLES_COMPUTE/functions/GCTensorShift.h" - -#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h" -#include "arm_compute/core/GLES_COMPUTE/kernels/GCTensorShiftKernel.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/PixelValue.h" -#include "arm_compute/core/Utils.h" -#include "support/MemorySupport.h" - -using namespace arm_compute; - -void GCTensorShift::configure(IGCTensor *input) -{ - auto k = arm_compute::support::cpp14::make_unique<GCTensorShiftKernel>(); - k->configure(input); - _kernel = std::move(k); -} diff --git a/src/runtime/GLES_COMPUTE/functions/GCTranspose.cpp b/src/runtime/GLES_COMPUTE/functions/GCTranspose.cpp deleted file mode 100644 index 530f52abd6..0000000000 --- a/src/runtime/GLES_COMPUTE/functions/GCTranspose.cpp +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Copyright (c) 2017-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/GLES_COMPUTE/functions/GCTranspose.h" - -#include "arm_compute/core/GLES_COMPUTE/kernels/GCTransposeKernel.h" -#include "support/MemorySupport.h" - -#include <utility> - -using namespace arm_compute; - -void GCTranspose::configure(const IGCTensor *input, IGCTensor *output) -{ - auto k = arm_compute::support::cpp14::make_unique<GCTransposeKernel>(); - k->configure(input, output); - _kernel = std::move(k); -} diff --git a/src/runtime/HOG.cpp b/src/runtime/HOG.cpp deleted file mode 100644 index d312967699..0000000000 --- a/src/runtime/HOG.cpp +++ /dev/null @@ -1,49 +0,0 @@ -/* - * Copyright (c) 2016-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/HOG.h" - -#include "arm_compute/core/Error.h" - -using namespace arm_compute; - -HOG::HOG() - : IHOG(), _info(), _descriptor() -{ -} - -void HOG::init(const HOGInfo &input) -{ - _info = input; - _descriptor.resize(_info.descriptor_size()); -} - -float *HOG::descriptor() const -{ - return _descriptor.data(); -} - -const HOGInfo *HOG::info() const -{ - return &_info; -} diff --git a/src/runtime/ILutAllocator.cpp b/src/runtime/ILutAllocator.cpp deleted file mode 100644 index fb961638f1..0000000000 --- a/src/runtime/ILutAllocator.cpp +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Copyright (c) 2016, 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/ILutAllocator.h" - -#include "arm_compute/core/Utils.h" - -using namespace arm_compute; - -ILutAllocator::ILutAllocator() - : _num_elements(0), _data_type(DataType::U8) -{ -} - -void ILutAllocator::init(size_t num_elements, DataType data_type) -{ - // Init internal metadata - _num_elements = num_elements; - _data_type = data_type; - - // Allocate the image's memory - allocate(); -} - -size_t ILutAllocator::num_elements() const -{ - return _num_elements; -} - -DataType ILutAllocator::type() const -{ - return _data_type; -} - -size_t ILutAllocator::size() const -{ - return data_size_from_type(_data_type) * num_elements(); -} diff --git a/src/runtime/IScheduler.cpp b/src/runtime/IScheduler.cpp index b2edad0ca5..ecf84abd2c 100644 --- a/src/runtime/IScheduler.cpp +++ b/src/runtime/IScheduler.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2018 ARM Limited. + * Copyright (c) 2016-2023 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -23,32 +23,198 @@ */ #include "arm_compute/runtime/IScheduler.h" +#include "arm_compute/core/CPP/ICPPKernel.h" #include "arm_compute/core/Error.h" -#include "arm_compute/runtime/CPUUtils.h" +#include "arm_compute/core/Log.h" +#include "arm_compute/core/Window.h" + +#include "src/common/cpuinfo/CpuInfo.h" +#include "src/runtime/SchedulerUtils.h" namespace arm_compute { IScheduler::IScheduler() - : _cpu_info() { - get_cpu_configuration(_cpu_info); // Work out the best possible number of execution threads - _num_threads_hint = get_threads_hint(); + _num_threads_hint = cpuinfo::num_threads_hint(); } CPUInfo &IScheduler::cpu_info() { - return _cpu_info; + return CPUInfo::get(); +} + +void IScheduler::set_num_threads_with_affinity(unsigned int num_threads, BindFunc func) +{ + ARM_COMPUTE_UNUSED(num_threads, func); + ARM_COMPUTE_ERROR("Feature for affinity setting is not implemented"); } unsigned int IScheduler::num_threads_hint() const { return _num_threads_hint; } + +void IScheduler::schedule_common(ICPPKernel *kernel, const Hints &hints, const Window &window, ITensorPack &tensors) +{ + ARM_COMPUTE_ERROR_ON_MSG(!kernel, "The child class didn't set the kernel"); +#ifndef BARE_METAL + const Window &max_window = window; + if (hints.split_dimension() == IScheduler::split_dimensions_all) + { + /* + * if the split dim is size_t max then this signals we should parallelise over + * all dimensions + */ + const std::size_t m = max_window.num_iterations(Window::DimX); + const std::size_t n = max_window.num_iterations(Window::DimY); + + //in c++17 this can be swapped for auto [ m_threads, n_threads ] = split_2d(... + unsigned m_threads, n_threads; + std::tie(m_threads, n_threads) = scheduler_utils::split_2d(this->num_threads(), m, n); + + std::vector<IScheduler::Workload> workloads; + for (unsigned int ni = 0; ni != n_threads; ++ni) + { + for (unsigned int mi = 0; mi != m_threads; ++mi) + { + workloads.push_back( + [ni, mi, m_threads, n_threads, &max_window, &kernel](const ThreadInfo &info) + { + //narrow the window to our mi-ni workload + Window win = max_window.split_window(Window::DimX, mi, m_threads) + .split_window(Window::DimY, ni, n_threads); + + win.validate(); + + Window thread_locator; + thread_locator.set(Window::DimX, Window::Dimension(mi, m_threads)); + thread_locator.set(Window::DimY, Window::Dimension(ni, n_threads)); + + thread_locator.validate(); + + kernel->run_nd(win, info, thread_locator); + }); + } + } + run_workloads(workloads); + } + else + { + const unsigned int num_iterations = max_window.num_iterations(hints.split_dimension()); + const unsigned int num_threads = std::min(num_iterations, this->num_threads()); + + if (num_iterations == 0) + { + return; + } + + if (!kernel->is_parallelisable() || num_threads == 1) + { + ThreadInfo info; + info.cpu_info = &cpu_info(); + if (tensors.empty()) + { + kernel->run(max_window, info); + } + else + { + kernel->run_op(tensors, max_window, info); + } + } + else + { + unsigned int num_windows = 0; + switch (hints.strategy()) + { + case StrategyHint::STATIC: + num_windows = num_threads; + break; + case StrategyHint::DYNAMIC: + { + const unsigned int granule_threshold = + (hints.threshold() <= 0) ? num_threads : static_cast<unsigned int>(hints.threshold()); + // Make sure we don't use some windows which are too small as this might create some contention on the ThreadFeeder + num_windows = num_iterations > granule_threshold ? 
granule_threshold : num_iterations; + break; + } + default: + ARM_COMPUTE_ERROR("Unknown strategy"); + } + // Make sure the smallest window is larger than minimum workload size + num_windows = adjust_num_of_windows(max_window, hints.split_dimension(), num_windows, *kernel, cpu_info()); + + std::vector<IScheduler::Workload> workloads(num_windows); + for (unsigned int t = 0; t < num_windows; ++t) + { + //Capture 't' by copy, all the other variables by reference: + workloads[t] = [t, &hints, &max_window, &num_windows, &kernel, &tensors](const ThreadInfo &info) + { + Window win = max_window.split_window(hints.split_dimension(), t, num_windows); + win.validate(); + + if (tensors.empty()) + { + kernel->run(win, info); + } + else + { + kernel->run_op(tensors, win, info); + } + }; + } + run_workloads(workloads); + } + } +#else /* !BARE_METAL */ + ARM_COMPUTE_UNUSED(kernel, hints, window, tensors); +#endif /* !BARE_METAL */ +} + void IScheduler::run_tagged_workloads(std::vector<Workload> &workloads, const char *tag) { ARM_COMPUTE_UNUSED(tag); run_workloads(workloads); } +std::size_t IScheduler::adjust_num_of_windows(const Window &window, + std::size_t split_dimension, + std::size_t init_num_windows, + const ICPPKernel &kernel, + const CPUInfo &cpu_info) +{ + // Mitigation of the narrow split issue, which occurs when the split dimension is too small to split (hence "narrow"). + if (window.num_iterations(split_dimension) < init_num_windows) + { + auto recommended_split_dim = Window::DimX; + for (std::size_t dims = Window::DimY; dims <= Window::DimW; ++dims) + { + if (window.num_iterations(recommended_split_dim) < window.num_iterations(dims)) + { + recommended_split_dim = dims; + } + } + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE( + "%zu dimension is not a suitable dimension to split the workload. Recommended: %zu recommended_split_dim", + split_dimension, recommended_split_dim); + } + + for (auto t = init_num_windows; t > 0; --t) // Trying the highest number of windows ,init_num_windows, first + { + // Try splitting the workload into t, subject to each subworkload size <= mws. + if ((window.num_iterations(split_dimension) / kernel.get_mws(cpu_info, t)) >= t) + { + if (t != init_num_windows) + { + ARM_COMPUTE_LOG_INFO_MSG_CORE( + "The scheduler is using a different thread count than the one assigned by the user."); + } + return t; + } + } + ARM_COMPUTE_LOG_INFO_MSG_CORE( + "The scheduler is using single thread instead of the thread count assigned by the user."); + return 1; // If the workload is so small that it can't be split, we should run a single thread +} + } // namespace arm_compute diff --git a/src/runtime/ISimpleLifetimeManager.cpp b/src/runtime/ISimpleLifetimeManager.cpp index d0c9919e26..8e5b62ae7d 100644 --- a/src/runtime/ISimpleLifetimeManager.cpp +++ b/src/runtime/ISimpleLifetimeManager.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 ARM Limited. + * Copyright (c) 2017-2020 Arm Limited. 
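The rewritten IScheduler::schedule_common above splits the iteration space one of two ways: over all dimensions, where split_2d picks an m_threads x n_threads grid and each workload narrows the window to its (mi, ni) tile, or along one hinted dimension into num_windows slices. The per-workload 1-D partitioning is a balanced split; roughly (a sketch of the arithmetic, not the actual Window::split_window implementation, whose remainder handling may differ):

#include <algorithm>
#include <cstddef>

struct Range
{
    std::size_t start; // first iteration owned by this window
    std::size_t end;   // one past the last iteration
};

// Split `total` iterations into `num_windows` near-equal [start, end) slices;
// the first `total % num_windows` slices get one extra iteration.
Range split_range(std::size_t total, std::size_t id, std::size_t num_windows)
{
    const std::size_t base  = total / num_windows;
    const std::size_t rem   = total % num_windows;
    const std::size_t start = id * base + std::min(id, rem);
    const std::size_t len   = base + (id < rem ? 1 : 0);
    return {start, start + len};
}

adjust_num_of_windows then shrinks the window count until each slice meets the kernel's minimum workload size (get_mws), falling back to a single thread when the workload is too small to split.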
* * SPDX-License-Identifier: MIT * @@ -43,7 +43,7 @@ ISimpleLifetimeManager::ISimpleLifetimeManager() void ISimpleLifetimeManager::register_group(IMemoryGroup *group) { - if(_active_group == nullptr) + if (_active_group == nullptr) { ARM_COMPUTE_ERROR_ON(group == nullptr); _active_group = group; @@ -52,12 +52,12 @@ void ISimpleLifetimeManager::register_group(IMemoryGroup *group) bool ISimpleLifetimeManager::release_group(IMemoryGroup *group) { - if(group == nullptr) + if (group == nullptr) { return false; } const bool status = bool(_finalized_groups.erase(group)); - if(status) + if (status) { group->mappings().clear(); } @@ -67,12 +67,13 @@ bool ISimpleLifetimeManager::release_group(IMemoryGroup *group) void ISimpleLifetimeManager::start_lifetime(void *obj) { ARM_COMPUTE_ERROR_ON(obj == nullptr); - ARM_COMPUTE_ERROR_ON_MSG(_active_elements.find(obj) != std::end(_active_elements), "Memory object is already registered!"); + ARM_COMPUTE_ERROR_ON_MSG(_active_elements.find(obj) != std::end(_active_elements), + "Memory object is already registered!"); // Check if there is a free blob - if(_free_blobs.empty()) + if (_free_blobs.empty()) { - _occupied_blobs.emplace_front(Blob{ obj, 0, 0, { obj } }); + _occupied_blobs.emplace_front(Blob{obj, 0, 0, {obj}}); } else { @@ -100,10 +101,8 @@ void ISimpleLifetimeManager::end_lifetime(void *obj, IMemory &obj_memory, size_t el.status = true; // Find object in the occupied lists - auto occupied_blob_it = std::find_if(std::begin(_occupied_blobs), std::end(_occupied_blobs), [&obj](const Blob & b) - { - return obj == b.id; - }); + auto occupied_blob_it = std::find_if(std::begin(_occupied_blobs), std::end(_occupied_blobs), + [&obj](const Blob &b) { return obj == b.id; }); ARM_COMPUTE_ERROR_ON(occupied_blob_it == std::end(_occupied_blobs)); // Update occupied blob and return as free @@ -114,7 +113,7 @@ void ISimpleLifetimeManager::end_lifetime(void *obj, IMemory &obj_memory, size_t _free_blobs.splice(std::begin(_free_blobs), _occupied_blobs, occupied_blob_it); // Check if all objects are finalized and reset active group - if(are_all_finalized()) + if (are_all_finalized()) { ARM_COMPUTE_ERROR_ON(!_occupied_blobs.empty()); @@ -133,9 +132,7 @@ void ISimpleLifetimeManager::end_lifetime(void *obj, IMemory &obj_memory, size_t bool ISimpleLifetimeManager::are_all_finalized() const { - return !std::any_of(std::begin(_active_elements), std::end(_active_elements), [](const std::pair<void *, Element> &e) - { - return !e.second.status; - }); + return !std::any_of(std::begin(_active_elements), std::end(_active_elements), + [](const std::pair<void *, Element> &e) { return !e.second.status; }); } } // namespace arm_compute diff --git a/src/runtime/ITensorAllocator.cpp b/src/runtime/ITensorAllocator.cpp index 087f324922..fe3d2804cb 100644 --- a/src/runtime/ITensorAllocator.cpp +++ b/src/runtime/ITensorAllocator.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2018 ARM Limited. + * Copyright (c) 2016-2021 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -30,25 +30,27 @@ using namespace arm_compute; -ITensorAllocator::ITensorAllocator() - : _info(), _alignment(0) +void ITensorAllocator::init(const TensorInfo &input, size_t alignment) { + _info_owned = input; + _info_external = nullptr; + _alignment = alignment; } -void ITensorAllocator::init(const TensorInfo &input, size_t alignment) +void ITensorAllocator::soft_init(TensorInfo &input, size_t alignment) { - _info = input; - _alignment = alignment; + _info_external = &input; + _alignment = alignment; } TensorInfo &ITensorAllocator::info() { - return _info; + return (_info_external != nullptr) ? *_info_external : _info_owned; } const TensorInfo &ITensorAllocator::info() const { - return _info; + return (_info_external != nullptr) ? *_info_external : _info_owned; } size_t ITensorAllocator::alignment() const diff --git a/src/runtime/IWeightsManager.cpp b/src/runtime/IWeightsManager.cpp index b367b5f70b..96287dcc49 100644 --- a/src/runtime/IWeightsManager.cpp +++ b/src/runtime/IWeightsManager.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 ARM Limited. + * Copyright (c) 2019, 2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -25,23 +25,27 @@ namespace arm_compute { -IWeightsManager::IWeightsManager() - : _managed_weights(), _managed_weights_parents() +IWeightsManager::IWeightsManager() : _managed_weights(), _managed_counter(), _managed_weights_parents() { } void IWeightsManager::manage(const ITensor *weights, ITransformWeights *parent) { - if(!are_weights_managed(weights)) + if (!are_weights_managed(weights)) { _managed_weights[weights]; + _managed_counter[weights]; + } + else + { + _managed_counter[weights].counter++; } // In case the weights are an output of a previous reshape function // store the parent's link - if(parent != nullptr) + if (parent != nullptr) { - if(_managed_weights_parents.find(weights) == _managed_weights_parents.end()) + if (_managed_weights_parents.find(weights) == _managed_weights_parents.end()) { _managed_weights_parents[weights] = parent; } @@ -54,13 +58,13 @@ ITensor *IWeightsManager::run(const ITensor *weights, ITransformWeights *weights // Find if I have the same weights with weights transform. If I do, don't run the reshape auto item = _managed_weights.find(weights); - bool perform_run{ true }; - ITensor *weights_tensor{ nullptr }; + bool perform_run{true}; + ITensor *weights_tensor{nullptr}; // Check if I already have the requested transform and I have run the reshape function - for(auto it : item->second) + for (auto it : item->second) { - if(it->is_reshape_run() && (it->uid() == weights_transform->uid())) + if (it->is_reshape_run() && (it->uid() == weights_transform->uid())) { weights_tensor = it->get_weights(); perform_run = false; @@ -68,7 +72,7 @@ ITensor *IWeightsManager::run(const ITensor *weights, ITransformWeights *weights } } - if(perform_run) + if (perform_run) { weights_transform->run(); weights_tensor = weights_transform->get_weights(); @@ -76,10 +80,10 @@ ITensor *IWeightsManager::run(const ITensor *weights, ITransformWeights *weights // Check if we can release memory from parent auto parent_item = _managed_weights_parents.find(weights); - if(parent_item != _managed_weights_parents.end()) + if (parent_item != _managed_weights_parents.end()) { int32_t refcount = parent_item->second->decrease_refcount(); - if(refcount == 0) + if (refcount == 0) { parent_item->second->release(); } @@ -87,20 +91,20 @@ ITensor *IWeightsManager::run(const ITensor *weights, ITransformWeights *weights // Check top level weights. 
If all the transformations are done // mark the weights as unused - if(_managed_weights_parents.find(weights) == _managed_weights_parents.end()) + if (_managed_weights_parents.find(weights) == _managed_weights_parents.end()) { auto item = _managed_weights.find(weights); bool mark_as_unused = true; - for(auto it : item->second) + for (auto it : item->second) { - if(!it->is_reshape_run()) + if (!it->is_reshape_run()) { mark_as_unused = false; break; } } - if(mark_as_unused) + if (mark_as_unused) { weights->mark_as_unused(); } @@ -118,15 +122,15 @@ ITensor *IWeightsManager::acquire(const ITensor *weights, ITransformWeights *wei { ARM_COMPUTE_ERROR_ON_MSG(!are_weights_managed(weights), "Cannot acquire weights. Weights are not managed"); - ITensor *transformed_weights{ nullptr }; + ITensor *transformed_weights{nullptr}; auto item = _managed_weights.find(weights); // Check if I already have the requested transform. If I do, // increase the refcount of the transformed weights object and // reuse the tensor - for(auto it : item->second) + for (auto it : item->second) { - if(it->uid() == weights_transform->uid()) + if (it->uid() == weights_transform->uid()) { transformed_weights = it->get_weights(); it->increase_refcount(); @@ -134,7 +138,7 @@ ITensor *IWeightsManager::acquire(const ITensor *weights, ITransformWeights *wei } } - if(transformed_weights == nullptr) + if (transformed_weights == nullptr) { transformed_weights = weights_transform->get_weights(); weights_transform->increase_refcount(); @@ -146,4 +150,28 @@ ITensor *IWeightsManager::acquire(const ITensor *weights, ITransformWeights *wei return transformed_weights; } + +void IWeightsManager::release(const ITensor *weights) +{ + if (weights == nullptr || !are_weights_managed(weights)) + { + return; + } + + _managed_counter[weights].counter--; + if (_managed_counter[weights].counter == 0 && _managed_counter[weights].is_unused) + { + weights->mark_as_unused(); + } +} + +void IWeightsManager::pre_mark_as_unused(const ITensor *weights) +{ + if (weights == nullptr || !are_weights_managed(weights)) + { + return; + } + + _managed_counter[weights].is_unused = true; +} } // namespace arm_compute diff --git a/src/runtime/Lut.cpp b/src/runtime/Lut.cpp deleted file mode 100644 index 1b3daf1f60..0000000000 --- a/src/runtime/Lut.cpp +++ /dev/null @@ -1,75 +0,0 @@ -/* - * Copyright (c) 2016, 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
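The new reference counting in IWeightsManager lets several functions share one managed weights tensor: manage() counts users, pre_mark_as_unused() records the intent, and release() performs the deferred mark_as_unused() once the count drains. A hypothetical call sequence, with the counter values inferred from the diff above rather than from documentation:

void share_weights(IWeightsManager &wm, const ITensor *weights)
{
    wm.manage(weights);             // first user: counter starts at 0
    wm.manage(weights);             // second user: counter -> 1
    wm.pre_mark_as_unused(weights); // mark once every extra user has released
    wm.release(weights);            // counter -> 0 and is_unused is set, so
                                    //   weights->mark_as_unused() fires here
}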
- */ -#include "arm_compute/runtime/Lut.h" - -#include <cstring> - -using namespace arm_compute; - -Lut::Lut() - : _allocator() -{ -} - -Lut::Lut(size_t num_elements, DataType data_type) - : _allocator() -{ - _allocator.init(num_elements, data_type); -} - -size_t Lut::num_elements() const -{ - return _allocator.num_elements(); -} - -uint32_t Lut::index_offset() const -{ - return (DataType::S16 == _allocator.type()) ? num_elements() / 2 : 0; -} - -size_t Lut::size_in_bytes() const -{ - return _allocator.size(); -} - -DataType Lut::type() const -{ - return _allocator.type(); -} - -uint8_t *Lut::buffer() const -{ - return _allocator.data(); -} - -void Lut::clear() -{ - ARM_COMPUTE_ERROR_ON(this->buffer() == nullptr); - std::memset(this->buffer(), 0, this->size_in_bytes()); -} - -ILutAllocator *Lut::allocator() -{ - return &_allocator; -} diff --git a/src/runtime/LutAllocator.cpp b/src/runtime/LutAllocator.cpp deleted file mode 100644 index 4b77a4533f..0000000000 --- a/src/runtime/LutAllocator.cpp +++ /dev/null @@ -1,50 +0,0 @@ -/* - * Copyright (c) 2016-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/LutAllocator.h" - -using namespace arm_compute; - -LutAllocator::LutAllocator() - : _buffer() -{ -} - -uint8_t *LutAllocator::data() const -{ - return _buffer.data(); -} - -void LutAllocator::allocate() -{ - _buffer.resize(size()); -} - -uint8_t *LutAllocator::lock() -{ - return _buffer.data(); -} - -void LutAllocator::unlock() -{ -} diff --git a/src/runtime/MEMUtils.cpp b/src/runtime/MEMUtils.cpp deleted file mode 100644 index 054169ac46..0000000000 --- a/src/runtime/MEMUtils.cpp +++ /dev/null @@ -1,103 +0,0 @@ -/* - * Copyright (c) 2018-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/core/CPP/CPPTypes.h" -#include "arm_compute/core/Error.h" -#include "support/StringSupport.h" - -#ifndef BARE_METAL -#include <fstream> -#include <iterator> -#include <sstream> -#endif // ifndef BARE_METAL - -namespace -{ -void parse_mem_info(size_t &total, size_t &free, size_t &buffer) -{ - free = 0; - total = 0; - buffer = 0; -#ifndef BARE_METAL - size_t memcache = 0; - size_t memfree = 0; - std::ifstream meminfo_f; - meminfo_f.open("/proc/meminfo", std::ios::in); - - if(meminfo_f.is_open()) - { - std::string line; - while(bool(getline(meminfo_f, line))) - { - std::istringstream iss(line); - std::vector<std::string> tokens((std::istream_iterator<std::string>(iss)), - std::istream_iterator<std::string>()); - if(tokens[0] == "MemTotal:") - { - total = arm_compute::support::cpp11::stoul(tokens[1], nullptr); - } - else if(tokens[0] == "MemFree:") - { - memfree = arm_compute::support::cpp11::stoul(tokens[1], nullptr); - } - else if(tokens[0] == "Buffers:") - { - buffer = arm_compute::support::cpp11::stoul(tokens[1], nullptr); - } - else if(tokens[0] == "Cached:") - { - memcache = arm_compute::support::cpp11::stoul(tokens[1], nullptr); - } - } - free = memfree + (buffer + memcache); - } -#endif // ifndef BARE_METAL -} - -} // namespace - -namespace arm_compute -{ -void MEMInfo::set_policy(MemoryPolicy policy) -{ - _policy = policy; -} - -MemoryPolicy MEMInfo::get_policy() -{ - return _policy; -} -MemoryPolicy MEMInfo::_policy = { MemoryPolicy::NORMAL }; - -MEMInfo::MEMInfo() - : _total(0), _free(0), _buffer(0) -{ - parse_mem_info(_total, _free, _buffer); -} - -size_t MEMInfo::get_total_in_kb() const -{ - return _total; -} - -} // namespace arm_compute diff --git a/src/runtime/Memory.cpp b/src/runtime/Memory.cpp index c6b956d929..90fd025eb7 100644 --- a/src/runtime/Memory.cpp +++ b/src/runtime/Memory.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2017-2019 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -27,20 +27,17 @@ namespace arm_compute { -Memory::Memory() - : _region(nullptr), _region_owned(nullptr) +Memory::Memory() : _region(nullptr), _region_owned(nullptr) { } -Memory::Memory(const std::shared_ptr<IMemoryRegion> &memory) - : _region(nullptr), _region_owned(memory) +Memory::Memory(const std::shared_ptr<IMemoryRegion> &memory) : _region(nullptr), _region_owned(memory) { _region_owned = memory; _region = _region_owned.get(); } -Memory::Memory(IMemoryRegion *memory) - : _region(memory), _region_owned(nullptr) +Memory::Memory(IMemoryRegion *memory) : _region(memory), _region_owned(nullptr) { _region = memory; } diff --git a/src/runtime/MemoryManagerOnDemand.cpp b/src/runtime/MemoryManagerOnDemand.cpp index d9803a8caf..5fa9ea47e9 100644 --- a/src/runtime/MemoryManagerOnDemand.cpp +++ b/src/runtime/MemoryManagerOnDemand.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2018 ARM Limited. + * Copyright (c) 2016-2018 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -31,7 +31,8 @@ namespace arm_compute { -MemoryManagerOnDemand::MemoryManagerOnDemand(std::shared_ptr<ILifetimeManager> lifetime_manager, std::shared_ptr<IPoolManager> pool_manager) +MemoryManagerOnDemand::MemoryManagerOnDemand(std::shared_ptr<ILifetimeManager> lifetime_manager, + std::shared_ptr<IPoolManager> pool_manager) : _lifetime_mgr(std::move(lifetime_manager)), _pool_mgr(std::move(pool_manager)) { ARM_COMPUTE_ERROR_ON_MSG(!_lifetime_mgr, "Lifetime manager not specified correctly!"); @@ -57,7 +58,7 @@ void MemoryManagerOnDemand::populate(arm_compute::IAllocator &allocator, size_t // Create pools auto pool_template = _lifetime_mgr->create_pool(&allocator); - for(int i = num_pools; i > 1; --i) + for (int i = num_pools; i > 1; --i) { auto pool = pool_template->duplicate(); _pool_mgr->register_pool(std::move(pool)); diff --git a/src/runtime/MultiHOG.cpp b/src/runtime/MultiHOG.cpp deleted file mode 100644 index 1584e3ab8e..0000000000 --- a/src/runtime/MultiHOG.cpp +++ /dev/null @@ -1,52 +0,0 @@ -/* - * Copyright (c) 2016-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/MultiHOG.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/IMultiHOG.h" - -using namespace arm_compute; - -MultiHOG::MultiHOG(size_t num_models) - : _num_models(num_models), _model() -{ - _model.resize(_num_models); -} - -size_t MultiHOG::num_models() const -{ - return _num_models; -} - -IHOG *MultiHOG::model(size_t index) -{ - ARM_COMPUTE_ERROR_ON(index >= _num_models); - return (&_model[index]); -} - -const IHOG *MultiHOG::model(size_t index) const -{ - ARM_COMPUTE_ERROR_ON(index >= _num_models); - return (&_model[index]); -} diff --git a/src/runtime/MultiImage.cpp b/src/runtime/MultiImage.cpp deleted file mode 100644 index eec58d3ef4..0000000000 --- a/src/runtime/MultiImage.cpp +++ /dev/null @@ -1,224 +0,0 @@ -/* - * Copyright (c) 2016-2019 ARM Limited. 
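MemoryManagerOnDemand keeps the same wiring as before: a lifetime manager paired with a pool manager, shared across functions, then backed with pools before the first run. A setup sketch assuming the usual BlobLifetimeManager/PoolManager/Allocator trio (the populate() signature matches the diff above):

#include "arm_compute/runtime/Allocator.h"
#include "arm_compute/runtime/BlobLifetimeManager.h"
#include "arm_compute/runtime/MemoryManagerOnDemand.h"
#include "arm_compute/runtime/PoolManager.h"

#include <memory>

using namespace arm_compute;

void setup_memory_manager()
{
    auto lifetime_mgr = std::make_shared<BlobLifetimeManager>();
    auto pool_mgr     = std::make_shared<PoolManager>();
    auto mm = std::make_shared<MemoryManagerOnDemand>(lifetime_mgr, pool_mgr);

    // Functions constructed with `mm` share its pools; one pool suffices
    // when they run sequentially.
    Allocator allocator;
    mm->populate(allocator, 1);
}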
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/MultiImage.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/runtime/TensorAllocator.h" - -using namespace arm_compute; - -MultiImage::MultiImage() - : _info(), _plane() -{ -} - -const MultiImageInfo *MultiImage::info() const -{ - return &_info; -} - -void MultiImage::init(unsigned int width, unsigned int height, Format format) -{ - internal_init(width, height, format, false); -} - -void MultiImage::init_auto_padding(unsigned int width, unsigned int height, Format format) -{ - internal_init(width, height, format, true); -} - -void MultiImage::internal_init(unsigned int width, unsigned int height, Format format, bool auto_padding) -{ - TensorShape shape = adjust_odd_shape(TensorShape{ width, height }, format); - TensorInfo info(shape, Format::U8); - - if(auto_padding) - { - info.auto_padding(); - } - - switch(format) - { - case Format::U8: - case Format::S16: - case Format::U16: - case Format::S32: - case Format::F16: - case Format::F32: - case Format::U32: - case Format::RGB888: - case Format::RGBA8888: - case Format::YUYV422: - case Format::UYVY422: - { - TensorInfo info_full(shape, format); - - if(auto_padding) - { - info_full.auto_padding(); - } - - std::get<0>(_plane).allocator()->init(info_full); - break; - } - case Format::NV12: - case Format::NV21: - { - const TensorShape shape_uv88 = calculate_subsampled_shape(shape, Format::UV88); - TensorInfo info_uv88(shape_uv88, Format::UV88); - - if(auto_padding) - { - info_uv88.auto_padding(); - } - - std::get<0>(_plane).allocator()->init(info); - std::get<1>(_plane).allocator()->init(info_uv88); - break; - } - case Format::IYUV: - { - const TensorShape shape_sub2 = calculate_subsampled_shape(shape, Format::IYUV); - TensorInfo info_sub2(shape_sub2, Format::U8); - - if(auto_padding) - { - info_sub2.auto_padding(); - } - - std::get<0>(_plane).allocator()->init(info); - std::get<1>(_plane).allocator()->init(info_sub2); - std::get<2>(_plane).allocator()->init(info_sub2); - break; - } - case Format::YUV444: - std::get<0>(_plane).allocator()->init(info); - std::get<1>(_plane).allocator()->init(info); - std::get<2>(_plane).allocator()->init(info); - break; - default: - ARM_COMPUTE_ERROR("Not supported"); - break; - } - - _info.init(shape.x(), shape.y(), format); -} - -void MultiImage::allocate() -{ - 
switch(_info.format()) - { - case Format::U8: - case Format::S16: - case Format::U16: - case Format::S32: - case Format::F16: - case Format::F32: - case Format::U32: - case Format::RGB888: - case Format::RGBA8888: - case Format::YUYV422: - case Format::UYVY422: - std::get<0>(_plane).allocator()->allocate(); - break; - case Format::NV12: - case Format::NV21: - std::get<0>(_plane).allocator()->allocate(); - std::get<1>(_plane).allocator()->allocate(); - break; - case Format::IYUV: - case Format::YUV444: - std::get<0>(_plane).allocator()->allocate(); - std::get<1>(_plane).allocator()->allocate(); - std::get<2>(_plane).allocator()->allocate(); - break; - default: - ARM_COMPUTE_ERROR("Not supported"); - break; - } -} - -void MultiImage::create_subimage(MultiImage *image, const Coordinates &coords, unsigned int width, unsigned int height) -{ - arm_compute::Format format = image->info()->format(); - TensorInfo info(width, height, Format::U8); - - switch(format) - { - case Format::U8: - case Format::S16: - case Format::U16: - case Format::S32: - case Format::F32: - case Format::F16: - case Format::U32: - case Format::RGB888: - case Format::RGBA8888: - case Format::YUYV422: - case Format::UYVY422: - { - TensorInfo info_full(width, height, format); - std::get<0>(_plane).allocator()->init(*dynamic_cast<Image *>(image->plane(0))->allocator(), coords, info_full); - break; - } - case Format::NV12: - case Format::NV21: - { - TensorInfo info_uv88(width / 2, height / 2, Format::UV88); - std::get<0>(_plane).allocator()->init(*dynamic_cast<Image *>(image->plane(0))->allocator(), coords, info); - std::get<1>(_plane).allocator()->init(*dynamic_cast<Image *>(image->plane(1))->allocator(), coords, info_uv88); - break; - } - case Format::IYUV: - { - TensorInfo info_sub2(width / 2, height / 2, Format::U8); - std::get<0>(_plane).allocator()->init(*dynamic_cast<Image *>(image->plane(0))->allocator(), coords, info); - std::get<1>(_plane).allocator()->init(*dynamic_cast<Image *>(image->plane(1))->allocator(), coords, info_sub2); - std::get<2>(_plane).allocator()->init(*dynamic_cast<Image *>(image->plane(2))->allocator(), coords, info_sub2); - break; - } - case Format::YUV444: - std::get<0>(_plane).allocator()->init(*dynamic_cast<Image *>(image->plane(0))->allocator(), coords, info); - std::get<1>(_plane).allocator()->init(*dynamic_cast<Image *>(image->plane(0))->allocator(), coords, info); - std::get<2>(_plane).allocator()->init(*dynamic_cast<Image *>(image->plane(0))->allocator(), coords, info); - break; - default: - ARM_COMPUTE_ERROR("Not supported"); - break; - } - - _info.init(width, height, format); -} - -Image *MultiImage::plane(unsigned int index) -{ - return &_plane[index]; -} - -const Image *MultiImage::plane(unsigned int index) const -{ - return &_plane[index]; -} diff --git a/src/runtime/NEON/functions/NEUpsampleLayer.cpp b/src/runtime/NEON/INEOperator.cpp index 9be96af66a..fcfd3251ff 100644 --- a/src/runtime/NEON/functions/NEUpsampleLayer.cpp +++ b/src/runtime/NEON/INEOperator.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018 ARM Limited. + * Copyright (c) 2020-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,32 +21,46 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#include "arm_compute/runtime/NEON/functions/NEUpsampleLayer.h" +#include "arm_compute/runtime/NEON/INEOperator.h" -#include "arm_compute/core/NEON/kernels/NEUpsampleLayerKernel.h" +#include "arm_compute/core/Window.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + +#include "src/core/NEON/INEKernel.h" namespace arm_compute { -NEUpsampleLayer::NEUpsampleLayer() - : _kernel(), _data_layout() +namespace experimental +{ +INEOperator::~INEOperator() = default; + +INEOperator::INEOperator(IRuntimeContext *ctx) : _kernel(), _ctx(ctx), _workspace() { } -Status NEUpsampleLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &info, - const InterpolationPolicy &policy) +void INEOperator::run(ITensorPack &tensors) +{ + if (tensors.empty()) + { + ARM_COMPUTE_ERROR("No inputs provided"); + } + + run(tensors, _kernel->window()); +} + +void INEOperator::run(ITensorPack &tensors, const Window &window) { - return NEUpsampleLayerKernel::validate(input, output, info, policy); + NEScheduler::get().schedule_op(_kernel.get(), Window::DimY, window, tensors); } -void NEUpsampleLayer::configure(const ITensor *input, ITensor *output, const Size2D &info, const InterpolationPolicy &policy) +void INEOperator::prepare(ITensorPack &constants) { - _data_layout = input->info()->data_layout(); - _kernel.configure(input, output, info, policy); + ARM_COMPUTE_UNUSED(constants); } -void NEUpsampleLayer::run() +MemoryRequirements INEOperator::workspace() const { - const auto win = (_data_layout == DataLayout::NCHW) ? Window::DimZ : Window::DimX; - NEScheduler::get().schedule(&_kernel, win); + return _workspace; } +} // namespace experimental } // namespace arm_compute diff --git a/src/runtime/NEON/INESimpleFunction.cpp b/src/runtime/NEON/INESimpleFunction.cpp index 23d9872294..b6977221b9 100644 --- a/src/runtime/NEON/INESimpleFunction.cpp +++ b/src/runtime/NEON/INESimpleFunction.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, 2017 ARM Limited. + * Copyright (c) 2016-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,18 +23,24 @@ */ #include "arm_compute/runtime/NEON/INESimpleFunction.h" +#include "arm_compute/core/CPP/ICPPKernel.h" +#include "arm_compute/core/Window.h" #include "arm_compute/runtime/NEON/NEScheduler.h" -using namespace arm_compute; +#include "src/core/NEON/kernels/NEFillBorderKernel.h" + +namespace arm_compute +{ +INESimpleFunction::~INESimpleFunction() = default; INESimpleFunction::INESimpleFunction() // NOLINT - : _kernel(), - _border_handler() + : _kernel(), _border_handler() { } void INESimpleFunction::run() { - NEScheduler::get().schedule(&_border_handler, Window::DimZ); + NEScheduler::get().schedule(_border_handler.get(), Window::DimZ); NEScheduler::get().schedule(_kernel.get(), Window::DimY); } +} //namespace arm_compute diff --git a/src/runtime/NEON/INESimpleFunctionNoBorder.cpp b/src/runtime/NEON/INESimpleFunctionNoBorder.cpp index 2cabee4c46..04bff9fa4b 100644 --- a/src/runtime/NEON/INESimpleFunctionNoBorder.cpp +++ b/src/runtime/NEON/INESimpleFunctionNoBorder.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 ARM Limited. + * Copyright (c) 2018-2020 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -23,19 +23,22 @@ */ #include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h" +#include "arm_compute/core/Window.h" #include "arm_compute/runtime/NEON/NEScheduler.h" -#include "arm_compute/runtime/Utils.h" + +#include "src/core/NEON/INEKernel.h" +#include "src/runtime/Utils.h" namespace arm_compute { -INESimpleFunctionNoBorder::INESimpleFunctionNoBorder(IRuntimeContext *ctx) - : _kernel(), - _ctx(ctx) +INESimpleFunctionNoBorder::~INESimpleFunctionNoBorder() = default; + +INESimpleFunctionNoBorder::INESimpleFunctionNoBorder(IRuntimeContext *ctx) : _kernel(), _ctx(ctx) { } void INESimpleFunctionNoBorder::run() { - schedule_kernel_on_ctx(_ctx, _kernel.get(), Window::DimY); + utils::schedule_kernel_on_ctx(_ctx, _kernel.get(), Window::DimY); } } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEAbsoluteDifference.cpp b/src/runtime/NEON/functions/NEAbsoluteDifference.cpp deleted file mode 100644 index 06b38a9b9e..0000000000 --- a/src/runtime/NEON/functions/NEAbsoluteDifference.cpp +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Copyright (c) 2017-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/NEON/functions/NEAbsoluteDifference.h" - -#include "arm_compute/core/NEON/kernels/NEAbsoluteDifferenceKernel.h" -#include "support/MemorySupport.h" - -#include <utility> - -using namespace arm_compute; - -void NEAbsoluteDifference::configure(const ITensor *input1, const ITensor *input2, ITensor *output) -{ - auto k = arm_compute::support::cpp14::make_unique<NEAbsoluteDifferenceKernel>(); - k->configure(input1, input2, output); - _kernel = std::move(k); -} diff --git a/src/runtime/NEON/functions/NEAccumulate.cpp b/src/runtime/NEON/functions/NEAccumulate.cpp deleted file mode 100644 index 47ea83de93..0000000000 --- a/src/runtime/NEON/functions/NEAccumulate.cpp +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Copyright (c) 2017-2020 ARM Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/NEON/functions/NEAccumulate.h" - -#include "arm_compute/core/NEON/kernels/NEAccumulateKernel.h" -#include "support/MemorySupport.h" - -#include <utility> - -using namespace arm_compute; - -void NEAccumulate::configure(const ITensor *input, ITensor *output) -{ - auto k = arm_compute::support::cpp14::make_unique<NEAccumulateKernel>(); - k->configure(input, output); - _kernel = std::move(k); -} - -void NEAccumulateWeighted::configure(const ITensor *input, float alpha, ITensor *output, bool use_fp16) -{ - if(use_fp16) - { - auto k = arm_compute::support::cpp14::make_unique<NEAccumulateWeightedFP16Kernel>(); - k->configure(input, alpha, output); - _kernel = std::move(k); - } - else - { - auto k = arm_compute::support::cpp14::make_unique<NEAccumulateWeightedKernel>(); - k->configure(input, alpha, output); - _kernel = std::move(k); - } -} - -void NEAccumulateSquared::configure(const ITensor *input, uint32_t shift, ITensor *output) -{ - auto k = arm_compute::support::cpp14::make_unique<NEAccumulateSquaredKernel>(); - k->configure(input, shift, output); - _kernel = std::move(k); -} diff --git a/src/runtime/NEON/functions/NEActivationLayer.cpp b/src/runtime/NEON/functions/NEActivationLayer.cpp index e4d1125c79..59199452ce 100644 --- a/src/runtime/NEON/functions/NEActivationLayer.cpp +++ b/src/runtime/NEON/functions/NEActivationLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 ARM Limited. + * Copyright (c) 2017-2021 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -23,25 +23,50 @@ */ #include "arm_compute/runtime/NEON/functions/NEActivationLayer.h" -#include "arm_compute/core/NEON/kernels/NEActivationLayerKernel.h" -#include "arm_compute/runtime/IRuntimeContext.h" -#include "support/MemorySupport.h" +#include "arm_compute/core/Validate.h" + +#include "src/cpu/operators/CpuActivation.h" namespace arm_compute { -NEActivationLayer::NEActivationLayer(IRuntimeContext *ctx) // NOLINT - : INESimpleFunctionNoBorder(ctx) +struct NEActivationLayer::Impl +{ + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + IRuntimeContext *ctx{nullptr}; + std::unique_ptr<cpu::CpuActivation> op{nullptr}; +}; + +NEActivationLayer::NEActivationLayer(IRuntimeContext *ctx) : _impl(std::make_unique<Impl>()) { + _impl->ctx = ctx; } +NEActivationLayer::NEActivationLayer(NEActivationLayer &&) = default; +NEActivationLayer &NEActivationLayer::operator=(NEActivationLayer &&) = default; +NEActivationLayer::~NEActivationLayer() = default; + void NEActivationLayer::configure(ITensor *input, ITensor *output, ActivationLayerInfo activation_info) { - auto k = arm_compute::support::cpp14::make_unique<NEActivationLayerKernel>(); - k->configure(input, output, activation_info); - _kernel = std::move(k); + _impl->src = input; + _impl->dst = output == nullptr ? input : output; + + ARM_COMPUTE_ERROR_ON_NULLPTR(_impl->src, _impl->dst); + + _impl->op = std::make_unique<cpu::CpuActivation>(); + _impl->op->configure(_impl->src->info(), _impl->dst->info(), activation_info); +} + +Status +NEActivationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info) +{ + return cpu::CpuActivation::validate(input, output, act_info); } -Status NEActivationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info) +void NEActivationLayer::run() { - return NEActivationLayerKernel::validate(input, output, act_info); + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC, _impl->src); + pack.add_tensor(TensorType::ACL_DST, _impl->dst); + _impl->op->run(pack); } } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEAddMulAdd.cpp b/src/runtime/NEON/functions/NEAddMulAdd.cpp new file mode 100644 index 0000000000..a72364791c --- /dev/null +++ b/src/runtime/NEON/functions/NEAddMulAdd.cpp @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
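The user-facing NEActivationLayer contract is unchanged by the move to cpu::CpuActivation; only the internals now route through the pimpl and an ITensorPack. A standard configure/run sketch of typical usage:

#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void relu_example()
{
    Tensor src, dst;
    src.allocator()->init(TensorInfo(TensorShape(16U, 16U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(16U, 16U), 1, DataType::F32));
    src.allocator()->allocate();
    dst.allocator()->allocate();

    NEActivationLayer act;
    act.configure(&src, &dst,
                  ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));
    act.run();
}

Per the configure() shown above, passing a null output keeps the operation in place on the input tensor.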
+ */ + +#include "arm_compute/runtime/NEON/functions/NEAddMulAdd.h" + +#include "arm_compute/runtime/Tensor.h" + +#include "src/common/utils/Log.h" +#include "src/core/helpers/MemoryHelpers.h" +#include "src/cpu/operators/CpuAddMulAdd.h" + +namespace arm_compute +{ +struct NEAddMulAdd::Impl +{ + std::unique_ptr<cpu::CpuAddMulAdd> op{nullptr}; + WorkspaceData<Tensor> workspace_tensors{}; + ITensorPack run_pack{}; + MemoryGroup memory_group{}; +}; + +NEAddMulAdd::NEAddMulAdd(std::shared_ptr<IMemoryManager> memory_manager) : _impl(std::make_unique<Impl>()) +{ + _impl->memory_group = MemoryGroup(std::move(memory_manager)); +} + +NEAddMulAdd::~NEAddMulAdd() = default; + +void NEAddMulAdd::configure(ITensor *input1, + ITensor *input2, + ITensor *bn_mul, + ITensor *bn_add, + ITensor *add_output, + ITensor *final_output, + const ConvertPolicy policy, + const ActivationLayerInfo &act_info) +{ + ARM_COMPUTE_LOG_PARAMS(input1, input2, bn_mul, bn_add, add_output, final_output, policy, act_info); + + _impl->op = std::make_unique<cpu::CpuAddMulAdd>(); + _impl->op->configure(input1->info(), input2->info(), bn_mul->info(), bn_add->info(), + add_output != nullptr ? add_output->info() : nullptr, final_output->info(), policy, act_info); + + _impl->run_pack = { + {TensorType::ACL_SRC_0, input1}, {TensorType::ACL_SRC_1, input2}, {TensorType::ACL_SRC_2, bn_mul}, + {TensorType::ACL_SRC_3, bn_add}, {TensorType::ACL_DST_0, add_output}, {TensorType::ACL_DST_1, final_output}, + }; + + _impl->workspace_tensors = manage_workspace<Tensor>(_impl->op->workspace(), _impl->memory_group, _impl->run_pack); +} + +Status NEAddMulAdd::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *bn_mul, + const ITensorInfo *bn_add, + const ITensorInfo *add_output, + const ITensorInfo *final_output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info) +{ + return cpu::CpuAddMulAdd::validate(input1, input2, bn_mul, bn_add, add_output, final_output, policy, act_info); +} + +void NEAddMulAdd::run() +{ + _impl->op->run(_impl->run_pack); +} +} // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEArgMinMaxLayer.cpp b/src/runtime/NEON/functions/NEArgMinMaxLayer.cpp index a23061e296..fbaf1a96e7 100644 --- a/src/runtime/NEON/functions/NEArgMinMaxLayer.cpp +++ b/src/runtime/NEON/functions/NEArgMinMaxLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 ARM Limited. + * Copyright (c) 2018-2021, 2023 Arm Limited. 
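Reading the new NEAddMulAdd interface, the fused computation appears to be an element-wise addition feeding a batch-norm style multiply-add, with an optional intermediate output. Both the formula and the per-channel broadcast of bn_mul/bn_add are inferred from the parameter names here, not quoted from documentation:

#include <cstddef>

// final_output = act((input1 + input2) * bn_mul + bn_add)
// add_output   = input1 + input2   (only written if requested)
void add_mul_add_ref(const float *in1, const float *in2,
                     const float *bn_mul, const float *bn_add,
                     float *add_out, float *final_out,
                     std::size_t n, std::size_t channels,
                     float (*act)(float))
{
    for (std::size_t i = 0; i < n; ++i)
    {
        const float sum = in1[i] + in2[i];
        if (add_out != nullptr)
        {
            add_out[i] = sum; // optional intermediate, as in the interface
        }
        final_out[i] = act(sum * bn_mul[i % channels] + bn_add[i % channels]);
    }
}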
* * SPDX-License-Identifier: MIT * @@ -29,28 +29,68 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/NEON/functions/NECast.h" +#include "arm_compute/runtime/NEON/functions/NEReductionOperation.h" +#include "arm_compute/runtime/Tensor.h" + +#include "src/common/utils/Log.h" +#include "src/core/NEON/kernels/NEReductionOperationKernel.h" namespace arm_compute { -NEArgMinMaxLayer::NEArgMinMaxLayer(std::shared_ptr<IMemoryManager> memory_manager) - : _reduction_function(support::cpp14::make_unique<NEReductionOperation>()) +struct NEArgMinMaxLayer::Impl { - ARM_COMPUTE_UNUSED(memory_manager); + MemoryGroup memory_group{}; + std::shared_ptr<IMemoryManager> memory_manager{}; + std::unique_ptr<NEReductionOperation> reduction_function{}; + std::unique_ptr<NECast> cast_function{}; + std::unique_ptr<Tensor> tmp_reduction_result{}; +}; + +NEArgMinMaxLayer::~NEArgMinMaxLayer() = default; + +NEArgMinMaxLayer::NEArgMinMaxLayer(std::shared_ptr<IMemoryManager> memory_manager) : _impl(std::make_unique<Impl>()) +{ + _impl->memory_manager = std::move(memory_manager); } + void NEArgMinMaxLayer::configure(ITensor *input, int axis, ITensor *output, const ReductionOperation &op) { - _reduction_function->configure(input, output, axis, op, false); + ARM_COMPUTE_LOG_PARAMS(input, axis, output, op); + _impl->reduction_function = std::make_unique<NEReductionOperation>(); + if (output->info() && + (output->info()->data_type() == DataType::S64 || output->info()->data_type() == DataType::U64)) + { + _impl->memory_group = MemoryGroup(std::move(_impl->memory_manager)); + _impl->cast_function = std::make_unique<NECast>(); + _impl->tmp_reduction_result = std::make_unique<Tensor>(); + _impl->reduction_function->configure(input, _impl->tmp_reduction_result.get(), axis, op, false); + _impl->cast_function->configure(_impl->tmp_reduction_result.get(), output, ConvertPolicy::SATURATE); + _impl->memory_group.manage(_impl->tmp_reduction_result.get()); + _impl->tmp_reduction_result->allocator()->allocate(); + } + else + { + _impl->reduction_function->configure(input, output, axis, op, false); + } } -Status NEArgMinMaxLayer::validate(const ITensorInfo *input, int axis, const ITensorInfo *output, const ReductionOperation &op) +Status +NEArgMinMaxLayer::validate(const ITensorInfo *input, int axis, const ITensorInfo *output, const ReductionOperation &op) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(op != ReductionOperation::ARG_IDX_MAX && op != ReductionOperation::ARG_IDX_MIN, "Invalid operation"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(op != ReductionOperation::ARG_IDX_MAX && op != ReductionOperation::ARG_IDX_MIN, + "Invalid operation"); return NEReductionOperation::validate(input, output, axis, op, false); } void NEArgMinMaxLayer::run() { - _reduction_function->run(); + MemoryGroupResourceScope scope_mg(_impl->memory_group); + _impl->reduction_function->run(); + if (_impl->tmp_reduction_result != nullptr) + { + _impl->cast_function->run(); + } } -} // namespace arm_compute
\ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEArithmeticAddition.cpp b/src/runtime/NEON/functions/NEArithmeticAddition.cpp index 06c71db1bd..aff16ae9d1 100644 --- a/src/runtime/NEON/functions/NEArithmeticAddition.cpp +++ b/src/runtime/NEON/functions/NEArithmeticAddition.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 ARM Limited. + * Copyright (c) 2017-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,24 +23,57 @@ */ #include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h" -#include "support/MemorySupport.h" +#include "arm_compute/core/Validate.h" + +#include "src/cpu/operators/CpuAdd.h" #include <utility> namespace arm_compute { -void NEArithmeticAddition::configure(ITensor *input1, ITensor *input2, ITensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info) +struct NEArithmeticAddition::Impl +{ + const ITensor *src_0{nullptr}; + const ITensor *src_1{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr<cpu::CpuAdd> op{nullptr}; +}; + +NEArithmeticAddition::NEArithmeticAddition() : _impl(std::make_unique<Impl>()) +{ +} +NEArithmeticAddition::NEArithmeticAddition(NEArithmeticAddition &&) = default; +NEArithmeticAddition &NEArithmeticAddition::operator=(NEArithmeticAddition &&) = default; +NEArithmeticAddition::~NEArithmeticAddition() = default; + +Status NEArithmeticAddition::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info) +{ + return cpu::CpuAdd::validate(input1, input2, output, policy, act_info); +} + +void NEArithmeticAddition::configure(const ITensor *input1, + const ITensor *input2, + ITensor *output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info) { - ARM_COMPUTE_UNUSED(act_info); - auto k = arm_compute::support::cpp14::make_unique<NEArithmeticAdditionKernel>(); - k->configure(input1, input2, output, policy); - _kernel = std::move(k); + _impl->src_0 = input1; + _impl->src_1 = input2; + _impl->dst = output; + _impl->op = std::make_unique<cpu::CpuAdd>(); + _impl->op->configure(_impl->src_0->info(), _impl->src_1->info(), _impl->dst->info(), policy, act_info); } -Status NEArithmeticAddition::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info) + +void NEArithmeticAddition::run() { - ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled()); - return NEArithmeticAdditionKernel::validate(input1, input2, output, policy); + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC_0, _impl->src_0); + pack.add_tensor(TensorType::ACL_SRC_1, _impl->src_1); + pack.add_tensor(TensorType::ACL_DST, _impl->dst); + _impl->op->run(pack); } } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEArithmeticSubtraction.cpp b/src/runtime/NEON/functions/NEArithmeticSubtraction.cpp index 20f930a286..097525c1a8 100644 --- a/src/runtime/NEON/functions/NEArithmeticSubtraction.cpp +++ b/src/runtime/NEON/functions/NEArithmeticSubtraction.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 ARM Limited. + * Copyright (c) 2017-2021 Arm Limited. 
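// NEArithmeticAddition above now forwards to the cpu::CpuAdd operator through
// an ITensorPack built in run(), but the public configure()/run() contract is
// unchanged. A minimal sketch, assuming two F32 tensors of matching shape:
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void add_sketch()
{
    Tensor a, b, sum;
    const TensorInfo info(TensorShape(32U, 32U), 1, DataType::F32);
    a.allocator()->init(info);
    b.allocator()->init(info);
    sum.allocator()->init(info);

    NEArithmeticAddition add;
    add.configure(&a, &b, &sum, ConvertPolicy::SATURATE, ActivationLayerInfo());

    a.allocator()->allocate();
    b.allocator()->allocate();
    sum.allocator()->allocate();
    // ... fill a and b ...
    add.run(); // packs ACL_SRC_0/ACL_SRC_1/ACL_DST and runs CpuAdd
}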
* * SPDX-License-Identifier: MIT * @@ -24,24 +24,56 @@ #include "arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h" #include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h" -#include "support/MemorySupport.h" + +#include "src/cpu/operators/CpuSub.h" #include <utility> namespace arm_compute { -void NEArithmeticSubtraction::configure(ITensor *input1, ITensor *input2, ITensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info) +struct NEArithmeticSubtraction::Impl +{ + const ITensor *src_0{nullptr}; + const ITensor *src_1{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr<cpu::CpuSub> op{nullptr}; +}; + +NEArithmeticSubtraction::NEArithmeticSubtraction() : _impl(std::make_unique<Impl>()) +{ +} +NEArithmeticSubtraction::NEArithmeticSubtraction(NEArithmeticSubtraction &&) = default; +NEArithmeticSubtraction &NEArithmeticSubtraction::operator=(NEArithmeticSubtraction &&) = default; +NEArithmeticSubtraction::~NEArithmeticSubtraction() = default; + +Status NEArithmeticSubtraction::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info) +{ + return cpu::CpuSub::validate(input1, input2, output, policy, act_info); +} + +void NEArithmeticSubtraction::configure(const ITensor *input1, + const ITensor *input2, + ITensor *output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info) { - ARM_COMPUTE_UNUSED(act_info); - auto k = arm_compute::support::cpp14::make_unique<NEArithmeticSubtractionKernel>(); - k->configure(input1, input2, output, policy); - _kernel = std::move(k); + _impl->src_0 = input1; + _impl->src_1 = input2; + _impl->dst = output; + _impl->op = std::make_unique<cpu::CpuSub>(); + _impl->op->configure(input1->info(), input2->info(), output->info(), policy, act_info); } -Status NEArithmeticSubtraction::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info) +void NEArithmeticSubtraction::run() { - ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled()); - return NEArithmeticSubtractionKernel::validate(input1, input2, output, policy); + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC_0, _impl->src_0); + pack.add_tensor(TensorType::ACL_SRC_1, _impl->src_1); + pack.add_tensor(TensorType::ACL_DST, _impl->dst); + _impl->op->run(pack); } } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEBatchNormalizationLayer.cpp b/src/runtime/NEON/functions/NEBatchNormalizationLayer.cpp index bb224db163..d491f0aafc 100644 --- a/src/runtime/NEON/functions/NEBatchNormalizationLayer.cpp +++ b/src/runtime/NEON/functions/NEBatchNormalizationLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018 ARM Limited. + * Copyright (c) 2017-2021 Arm Limited. 
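// Note that the old NEArithmeticSubtraction::validate() rejected any enabled
// activation, while the rewritten version forwards act_info to cpu::CpuSub,
// so fused activations can now be validated up front. A sketch of the
// validate-before-configure idiom (the helper name is illustrative):
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h"

using namespace arm_compute;

bool can_subtract_with_relu(const TensorShape &shape)
{
    const TensorInfo a(shape, 1, DataType::F32);
    const TensorInfo b(shape, 1, DataType::F32);
    const TensorInfo dst(shape, 1, DataType::F32);
    const Status st = NEArithmeticSubtraction::validate(
        &a, &b, &dst, ConvertPolicy::SATURATE,
        ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));
    return static_cast<bool>(st); // Status converts to true when validation passes
}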
* * SPDX-License-Identifier: MIT * @@ -30,28 +30,48 @@ #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/NEON/NEScheduler.h" -using namespace arm_compute; +#include "src/common/utils/Log.h" +#include "src/core/NEON/kernels/NEBatchNormalizationLayerKernel.h" -NEBatchNormalizationLayer::NEBatchNormalizationLayer() - : _norm_kernel() +namespace arm_compute +{ +NEBatchNormalizationLayer::~NEBatchNormalizationLayer() = default; + +NEBatchNormalizationLayer::NEBatchNormalizationLayer() : _norm_kernel() { } -void NEBatchNormalizationLayer::configure(ITensor *input, ITensor *output, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, float epsilon, +void NEBatchNormalizationLayer::configure(ITensor *input, + ITensor *output, + const ITensor *mean, + const ITensor *var, + const ITensor *beta, + const ITensor *gamma, + float epsilon, ActivationLayerInfo act_info) { + ARM_COMPUTE_LOG_PARAMS(input, output, mean, var, beta, gamma, epsilon, act_info); // Configure kernel - _norm_kernel.configure(input, output, mean, var, beta, gamma, epsilon, act_info); + _norm_kernel = std::make_unique<NEBatchNormalizationLayerKernel>(); + _norm_kernel->configure(input, output, mean, var, beta, gamma, epsilon, act_info); } -Status NEBatchNormalizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *mean, const ITensorInfo *var, const ITensorInfo *beta, const ITensorInfo *gamma, - float epsilon, ActivationLayerInfo act_info) +Status NEBatchNormalizationLayer::validate(const ITensorInfo *input, + const ITensorInfo *output, + const ITensorInfo *mean, + const ITensorInfo *var, + const ITensorInfo *beta, + const ITensorInfo *gamma, + float epsilon, + ActivationLayerInfo act_info) { - ARM_COMPUTE_RETURN_ON_ERROR(NEBatchNormalizationLayerKernel::validate(input, output, mean, var, beta, gamma, epsilon, act_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEBatchNormalizationLayerKernel::validate(input, output, mean, var, beta, gamma, epsilon, act_info)); return Status{}; } void NEBatchNormalizationLayer::run() { - NEScheduler::get().schedule(&_norm_kernel, Window::DimY); + NEScheduler::get().schedule(_norm_kernel.get(), Window::DimY); } +} // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEBatchToSpaceLayer.cpp b/src/runtime/NEON/functions/NEBatchToSpaceLayer.cpp index a4db1fdda3..5d711c5ddf 100644 --- a/src/runtime/NEON/functions/NEBatchToSpaceLayer.cpp +++ b/src/runtime/NEON/functions/NEBatchToSpaceLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 ARM Limited. + * Copyright (c) 2019-2021, 2023 Arm Limited. 
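// The batch-normalization kernel above is now heap-allocated and scheduled
// through its unique_ptr, while the user-facing API is unchanged. A minimal
// sketch, assuming a default-layout (NCHW) F32 input; shapes are illustrative
// and the per-channel statistics vectors match the channel dimension:
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/NEON/functions/NEBatchNormalizationLayer.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void batch_norm_sketch()
{
    Tensor src, dst, mean, var, beta, gamma;
    const TensorShape data_shape(14U, 14U, 16U, 1U); // W, H, C, N
    src.allocator()->init(TensorInfo(data_shape, 1, DataType::F32));
    dst.allocator()->init(TensorInfo(data_shape, 1, DataType::F32));
    for (Tensor *t : {&mean, &var, &beta, &gamma})
    {
        t->allocator()->init(TensorInfo(TensorShape(16U), 1, DataType::F32));
    }

    NEBatchNormalizationLayer bn;
    bn.configure(&src, &dst, &mean, &var, &beta, &gamma, 1e-5f, ActivationLayerInfo());

    for (Tensor *t : {&src, &dst, &mean, &var, &beta, &gamma})
    {
        t->allocator()->allocate();
    }
    bn.run();
}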
* * SPDX-License-Identifier: MIT * @@ -29,29 +29,39 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" +#include "src/common/utils/Log.h" +#include "src/core/NEON/kernels/NEBatchToSpaceLayerKernel.h" + namespace arm_compute { void NEBatchToSpaceLayer::configure(const ITensor *input, const ITensor *block_shape, ITensor *output) { - auto k = arm_compute::support::cpp14::make_unique<NEBatchToSpaceLayerKernel>(); + ARM_COMPUTE_LOG_PARAMS(input, block_shape, output); + auto k = std::make_unique<NEBatchToSpaceLayerKernel>(); k->configure(input, block_shape, output); _kernel = std::move(k); } -void NEBatchToSpaceLayer::configure(const ITensor *input, int32_t block_shape_x, int32_t block_shape_y, ITensor *output) +void NEBatchToSpaceLayer::configure( + const ITensor *input, int32_t block_shape_x, int32_t block_shape_y, ITensor *output, const CropInfo &crop_info) { - auto k = arm_compute::support::cpp14::make_unique<NEBatchToSpaceLayerKernel>(); - k->configure(input, block_shape_x, block_shape_y, output); + auto k = std::make_unique<NEBatchToSpaceLayerKernel>(); + k->configure(input, block_shape_x, block_shape_y, output, crop_info); _kernel = std::move(k); } -Status NEBatchToSpaceLayer::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *output) +Status +NEBatchToSpaceLayer::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *output) { return NEBatchToSpaceLayerKernel::validate(input, block_shape, output); } -Status NEBatchToSpaceLayer::validate(const ITensorInfo *input, int32_t block_shape_x, int32_t block_shape_y, const ITensorInfo *output) +Status NEBatchToSpaceLayer::validate(const ITensorInfo *input, + int32_t block_shape_x, + int32_t block_shape_y, + const ITensorInfo *output, + const CropInfo &crop_info) { - return NEBatchToSpaceLayerKernel::validate(input, block_shape_x, block_shape_y, output); + return NEBatchToSpaceLayerKernel::validate(input, block_shape_x, block_shape_y, output, crop_info); } } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEBitwiseAnd.cpp b/src/runtime/NEON/functions/NEBitwiseAnd.cpp index 98f4745179..89ce2087be 100644 --- a/src/runtime/NEON/functions/NEBitwiseAnd.cpp +++ b/src/runtime/NEON/functions/NEBitwiseAnd.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 ARM Limited. + * Copyright (c) 2017-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,8 +23,8 @@ */ #include "arm_compute/runtime/NEON/functions/NEBitwiseAnd.h" -#include "arm_compute/core/NEON/kernels/NEBitwiseAndKernel.h" -#include "support/MemorySupport.h" +#include "src/common/utils/Log.h" +#include "src/core/NEON/kernels/NEBitwiseAndKernel.h" #include <utility> @@ -32,7 +32,8 @@ using namespace arm_compute; void NEBitwiseAnd::configure(const ITensor *input1, const ITensor *input2, ITensor *output) { - auto k = arm_compute::support::cpp14::make_unique<NEBitwiseAndKernel>(); + ARM_COMPUTE_LOG_PARAMS(input1, input2, output); + auto k = std::make_unique<NEBitwiseAndKernel>(); k->configure(input1, input2, output); _kernel = std::move(k); } diff --git a/src/runtime/NEON/functions/NEBitwiseNot.cpp b/src/runtime/NEON/functions/NEBitwiseNot.cpp index 173b7d5f74..eda59cd3e9 100644 --- a/src/runtime/NEON/functions/NEBitwiseNot.cpp +++ b/src/runtime/NEON/functions/NEBitwiseNot.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 ARM Limited. + * Copyright (c) 2017-2021 Arm Limited. 
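// The scalar-block-shape overload of NEBatchToSpaceLayer above gains a
// CropInfo parameter. A minimal sketch, assuming NCHW F32 data and a
// default-constructed CropInfo (no cropping); with a 2x2 block the four
// input batches fold back into a single 4x4 plane:
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/NEON/functions/NEBatchToSpaceLayer.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void batch_to_space_sketch()
{
    Tensor src, dst;
    src.allocator()->init(TensorInfo(TensorShape(2U, 2U, 1U, 4U), 1, DataType::F32)); // W, H, C, N
    dst.allocator()->init(TensorInfo(TensorShape(4U, 4U, 1U, 1U), 1, DataType::F32));

    NEBatchToSpaceLayer b2s;
    b2s.configure(&src, 2, 2, &dst, CropInfo{});

    src.allocator()->allocate();
    dst.allocator()->allocate();
    b2s.run();
}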
* * SPDX-License-Identifier: MIT * @@ -23,8 +23,8 @@ */ #include "arm_compute/runtime/NEON/functions/NEBitwiseNot.h" -#include "arm_compute/core/NEON/kernels/NEBitwiseNotKernel.h" -#include "support/MemorySupport.h" +#include "src/common/utils/Log.h" +#include "src/core/NEON/kernels/NEBitwiseNotKernel.h" #include <utility> @@ -32,7 +32,8 @@ using namespace arm_compute; void NEBitwiseNot::configure(const ITensor *input, ITensor *output) { - auto k = arm_compute::support::cpp14::make_unique<NEBitwiseNotKernel>(); + ARM_COMPUTE_LOG_PARAMS(input, output); + auto k = std::make_unique<NEBitwiseNotKernel>(); k->configure(input, output); _kernel = std::move(k); } diff --git a/src/runtime/NEON/functions/NEBitwiseOr.cpp b/src/runtime/NEON/functions/NEBitwiseOr.cpp index 64f1d82350..3d6f30b0fe 100644 --- a/src/runtime/NEON/functions/NEBitwiseOr.cpp +++ b/src/runtime/NEON/functions/NEBitwiseOr.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 ARM Limited. + * Copyright (c) 2017-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,8 +23,8 @@ */ #include "arm_compute/runtime/NEON/functions/NEBitwiseOr.h" -#include "arm_compute/core/NEON/kernels/NEBitwiseOrKernel.h" -#include "support/MemorySupport.h" +#include "src/common/utils/Log.h" +#include "src/core/NEON/kernels/NEBitwiseOrKernel.h" #include <utility> @@ -32,7 +32,8 @@ using namespace arm_compute; void NEBitwiseOr::configure(const ITensor *input1, const ITensor *input2, ITensor *output) { - auto k = arm_compute::support::cpp14::make_unique<NEBitwiseOrKernel>(); + ARM_COMPUTE_LOG_PARAMS(input1, input2, output); + auto k = std::make_unique<NEBitwiseOrKernel>(); k->configure(input1, input2, output); _kernel = std::move(k); } diff --git a/src/runtime/NEON/functions/NEBitwiseXor.cpp b/src/runtime/NEON/functions/NEBitwiseXor.cpp index 28c1036112..f0cf3d3e5c 100644 --- a/src/runtime/NEON/functions/NEBitwiseXor.cpp +++ b/src/runtime/NEON/functions/NEBitwiseXor.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 ARM Limited. + * Copyright (c) 2017-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,8 +23,8 @@ */ #include "arm_compute/runtime/NEON/functions/NEBitwiseXor.h" -#include "arm_compute/core/NEON/kernels/NEBitwiseXorKernel.h" -#include "support/MemorySupport.h" +#include "src/common/utils/Log.h" +#include "src/core/NEON/kernels/NEBitwiseXorKernel.h" #include <utility> @@ -32,7 +32,8 @@ using namespace arm_compute; void NEBitwiseXor::configure(const ITensor *input1, const ITensor *input2, ITensor *output) { - auto k = arm_compute::support::cpp14::make_unique<NEBitwiseXorKernel>(); + ARM_COMPUTE_LOG_PARAMS(input1, input2, output); + auto k = std::make_unique<NEBitwiseXorKernel>(); k->configure(input1, input2, output); _kernel = std::move(k); } diff --git a/src/runtime/NEON/functions/NEBoundingBoxTransform.cpp b/src/runtime/NEON/functions/NEBoundingBoxTransform.cpp index 6f767c7d31..adf891e417 100644 --- a/src/runtime/NEON/functions/NEBoundingBoxTransform.cpp +++ b/src/runtime/NEON/functions/NEBoundingBoxTransform.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 ARM Limited. + * Copyright (c) 2019-2021 Arm Limited. 
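// The bitwise functions above remain thin simple-function wrappers; only the
// kernel header location and the switch to std::make_unique changed. A
// minimal sketch for NEBitwiseAnd, assuming U8 tensors (the type these
// kernels operate on):
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/NEON/functions/NEBitwiseAnd.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void bitwise_and_sketch()
{
    Tensor a, b, dst;
    const TensorInfo info(TensorShape(64U), 1, DataType::U8);
    a.allocator()->init(info);
    b.allocator()->init(info);
    dst.allocator()->init(info);

    NEBitwiseAnd band;
    band.configure(&a, &b, &dst);

    a.allocator()->allocate();
    b.allocator()->allocate();
    dst.allocator()->allocate();
    band.run();
}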
* * SPDX-License-Identifier: MIT * @@ -23,19 +23,27 @@ */ #include "arm_compute/runtime/NEON/functions/NEBoundingBoxTransform.h" -#include "support/MemorySupport.h" +#include "src/common/utils/Log.h" +#include "src/core/NEON/kernels/NEBoundingBoxTransformKernel.h" namespace arm_compute { -void NEBoundingBoxTransform::configure(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, const BoundingBoxTransformInfo &info) +void NEBoundingBoxTransform::configure(const ITensor *boxes, + ITensor *pred_boxes, + const ITensor *deltas, + const BoundingBoxTransformInfo &info) { + ARM_COMPUTE_LOG_PARAMS(boxes, pred_boxes, deltas, info); // Configure Bounding Box kernel - auto k = arm_compute::support::cpp14::make_unique<NEBoundingBoxTransformKernel>(); + auto k = std::make_unique<NEBoundingBoxTransformKernel>(); k->configure(boxes, pred_boxes, deltas, info); _kernel = std::move(k); } -Status NEBoundingBoxTransform::validate(const ITensorInfo *boxes, const ITensorInfo *pred_boxes, const ITensorInfo *deltas, const BoundingBoxTransformInfo &info) +Status NEBoundingBoxTransform::validate(const ITensorInfo *boxes, + const ITensorInfo *pred_boxes, + const ITensorInfo *deltas, + const BoundingBoxTransformInfo &info) { return NEBoundingBoxTransformKernel::validate(boxes, pred_boxes, deltas, info); } diff --git a/src/runtime/NEON/functions/NEBox3x3.cpp b/src/runtime/NEON/functions/NEBox3x3.cpp deleted file mode 100644 index 096b226ab5..0000000000 --- a/src/runtime/NEON/functions/NEBox3x3.cpp +++ /dev/null @@ -1,49 +0,0 @@ -/* - * Copyright (c) 2017-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
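// A minimal sketch for the refactored NEBoundingBoxTransform above. The box
// count, class count and the BoundingBoxTransformInfo arguments (image width,
// image height, scale) are illustrative assumptions; boxes are stored as
// (x1, y1, x2, y2) rows, and deltas/predictions carry one box per class:
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/NEON/functions/NEBoundingBoxTransform.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void bbox_transform_sketch()
{
    constexpr unsigned int num_boxes   = 128;
    constexpr unsigned int num_classes = 4;
    Tensor boxes, deltas, pred_boxes;
    boxes.allocator()->init(TensorInfo(TensorShape(4U, num_boxes), 1, DataType::F32));
    deltas.allocator()->init(TensorInfo(TensorShape(4U * num_classes, num_boxes), 1, DataType::F32));
    pred_boxes.allocator()->init(TensorInfo(TensorShape(4U * num_classes, num_boxes), 1, DataType::F32));

    NEBoundingBoxTransform transform;
    transform.configure(&boxes, &pred_boxes, &deltas, BoundingBoxTransformInfo(640.f, 480.f, 1.f));

    for (Tensor *t : {&boxes, &deltas, &pred_boxes})
    {
        t->allocator()->allocate();
    }
    transform.run();
}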
- */ -#include "arm_compute/runtime/NEON/functions/NEBox3x3.h" - -#include "arm_compute/core/NEON/kernels/NEBox3x3Kernel.h" -#include "arm_compute/core/PixelValue.h" -#include "support/MemorySupport.h" - -#include <utility> - -using namespace arm_compute; - -void NEBox3x3::configure(ITensor *input, ITensor *output, BorderMode border_mode, uint8_t constant_border_value, bool use_fp16) -{ - if(use_fp16) - { - auto k = arm_compute::support::cpp14::make_unique<NEBox3x3FP16Kernel>(); - k->configure(input, output, border_mode == BorderMode::UNDEFINED); - _kernel = std::move(k); - } - else - { - auto k = arm_compute::support::cpp14::make_unique<NEBox3x3Kernel>(); - k->configure(input, output, border_mode == BorderMode::UNDEFINED); - _kernel = std::move(k); - } - _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); -} diff --git a/src/runtime/NEON/functions/NECannyEdge.cpp b/src/runtime/NEON/functions/NECannyEdge.cpp deleted file mode 100644 index a57ea606a9..0000000000 --- a/src/runtime/NEON/functions/NECannyEdge.cpp +++ /dev/null @@ -1,188 +0,0 @@ -/* - * Copyright (c) 2017-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "arm_compute/runtime/NEON/functions/NECannyEdge.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/kernels/NECannyEdgeKernel.h" -#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" -#include "arm_compute/runtime/NEON/functions/NESobel3x3.h" -#include "arm_compute/runtime/NEON/functions/NESobel5x5.h" -#include "arm_compute/runtime/NEON/functions/NESobel7x7.h" -#include "arm_compute/runtime/TensorAllocator.h" -#include "support/MemorySupport.h" - -#include <cstring> -#include <inttypes.h> -#include <utility> - -using namespace arm_compute; - -NECannyEdge::NECannyEdge(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT - : _memory_group(std::move(memory_manager)), - _sobel(), - _gradient(), - _non_max_suppr(), - _edge_trace(), - _border_mag_gradient(), - _border_edge_trace(), - _gx(), - _gy(), - _magnitude(), - _phase(), - _nonmax(), - _output(nullptr) -{ -} - -void NECannyEdge::configure(ITensor *input, ITensor *output, int32_t upper_thr, int32_t lower_thr, int32_t gradient_size, int32_t norm_type, BorderMode border_mode, uint8_t constant_border_value) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON((1 != norm_type) && (2 != norm_type)); - ARM_COMPUTE_ERROR_ON((gradient_size != 3) && (gradient_size != 5) && (gradient_size != 7)); - ARM_COMPUTE_ERROR_ON((lower_thr < 0) || (lower_thr >= upper_thr)); - - _output = output; - - const TensorShape &shape = input->info()->tensor_shape(); - TensorInfo gradient_info; - TensorInfo magnitude_info; - - // Initialize images - if(gradient_size < 7) - { - gradient_info.init(shape, Format::S16); - magnitude_info.init(shape, Format::U16); - } - else - { - gradient_info.init(shape, Format::S32); - magnitude_info.init(shape, Format::U32); - } - - _gx.allocator()->init(gradient_info); - _gy.allocator()->init(gradient_info); - _magnitude.allocator()->init(magnitude_info); - - TensorInfo info(shape, Format::U8); - _phase.allocator()->init(info); - _nonmax.allocator()->init(info); - - // Manage intermediate buffers - _memory_group.manage(&_gx); - _memory_group.manage(&_gy); - - // Configure/Init sobelNxN - if(gradient_size == 3) - { - auto k = arm_compute::support::cpp14::make_unique<NESobel3x3>(); - k->configure(input, &_gx, &_gy, border_mode, constant_border_value); - _sobel = std::move(k); - } - else if(gradient_size == 5) - { - auto k = arm_compute::support::cpp14::make_unique<NESobel5x5>(); - k->configure(input, &_gx, &_gy, border_mode, constant_border_value); - _sobel = std::move(k); - } - else if(gradient_size == 7) - { - auto k = arm_compute::support::cpp14::make_unique<NESobel7x7>(); - k->configure(input, &_gx, &_gy, border_mode, constant_border_value); - _sobel = std::move(k); - } - else - { - ARM_COMPUTE_ERROR_VAR("Gradient size %+" PRId32 " not supported\n", gradient_size); - } - - // Manage intermediate buffers - _memory_group.manage(&_magnitude); - _memory_group.manage(&_phase); - - // Configure gradient - auto k = arm_compute::support::cpp14::make_unique<NEGradientKernel>(); - k->configure(&_gx, &_gy, &_magnitude, &_phase, norm_type); - _gradient = std::move(k); - - // Allocate intermediate tensors - _gx.allocator()->allocate(); - _gy.allocator()->allocate(); 
- - // Manage intermediate buffers - _memory_group.manage(&_nonmax); - - // Configure non-maxima suppression - _non_max_suppr.configure(&_magnitude, &_phase, &_nonmax, upper_thr, lower_thr, border_mode == BorderMode::UNDEFINED); - - // Fill border around magnitude image as non-maxima suppression will access - // it. If border mode is undefined filling the border is a nop. - _border_mag_gradient.configure(&_magnitude, _non_max_suppr.border_size(), border_mode, constant_border_value); - - // Allocate intermediate tensors - _phase.allocator()->allocate(); - _magnitude.allocator()->allocate(); - - // Configure edge tracing - _edge_trace.configure(&_nonmax, output); - - // Fill border with "No edge" to stop recursion in edge trace - _border_edge_trace.configure(&_nonmax, _edge_trace.border_size(), BorderMode::CONSTANT, static_cast<float>(0.f)); - - // Allocate intermediate tensors - _nonmax.allocator()->allocate(); -} - -void NECannyEdge::run() -{ - ARM_COMPUTE_ERROR_ON_MSG(_sobel == nullptr, "Unconfigured function"); - - MemoryGroupResourceScope scope_mg(_memory_group); - - // Run sobelNxN - _sobel->run(); - - // Run gradient - NEScheduler::get().schedule(_gradient.get(), Window::DimY); - - // Fill border before non-maxima suppression. Nop for border mode undefined. - NEScheduler::get().schedule(&_border_mag_gradient, Window::DimZ); - - // Run non-maxima suppression - NEScheduler::get().schedule(&_non_max_suppr, Window::DimY); - - ARM_COMPUTE_ERROR_ON(_output->buffer() == nullptr); - std::fill_n(_output->buffer(), _output->info()->total_size(), 0); - - // Fill border before edge trace - NEScheduler::get().schedule(&_border_edge_trace, Window::DimZ); - - // Run edge tracing - NEScheduler::get().schedule(&_edge_trace, Window::DimY); -} diff --git a/src/runtime/NEON/functions/NECast.cpp b/src/runtime/NEON/functions/NECast.cpp index 464a608c8c..1fd172a730 100644 --- a/src/runtime/NEON/functions/NECast.cpp +++ b/src/runtime/NEON/functions/NECast.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 ARM Limited. + * Copyright (c) 2019-2023 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -23,24 +23,46 @@ */ #include "arm_compute/runtime/NEON/functions/NECast.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/kernels/NEDepthConvertLayerKernel.h" -#include "arm_compute/core/TensorInfo.h" -#include "support/MemorySupport.h" +#include "arm_compute/core/Validate.h" -#include <utility> +#include "src/common/utils/Log.h" +#include "src/cpu/operators/CpuCast.h" namespace arm_compute { +struct NECast::Impl +{ + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr<cpu::CpuCast> op{nullptr}; +}; + +NECast::NECast() : _impl(std::make_unique<Impl>()) +{ +} +NECast::NECast(NECast &&) = default; +NECast &NECast::operator=(NECast &&) = default; +NECast::~NECast() = default; + void NECast::configure(ITensor *input, ITensor *output, ConvertPolicy policy) { - auto k = arm_compute::support::cpp14::make_unique<NEDepthConvertLayerKernel>(); - k->configure(input, output, policy, 0); - _kernel = std::move(k); + _impl->src = input; + _impl->dst = output; + + ARM_COMPUTE_ERROR_ON_NULLPTR(_impl->src, _impl->dst); + ARM_COMPUTE_LOG_PARAMS(input, output, policy); + _impl->op = std::make_unique<cpu::CpuCast>(); + _impl->op->configure(_impl->src->info(), _impl->dst->info(), policy); +} + +Status NECast::validate(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy) +{ + return cpu::CpuCast::validate(input, output, policy); } -Status NECast::validate(ITensorInfo *input, ITensorInfo *output, ConvertPolicy policy) +void NECast::run() { - return NEDepthConvertLayerKernel::validate(input, output, policy, 0); + ITensorPack pack = {{ACL_SRC, _impl->src}, {ACL_DST, _impl->dst}}; + _impl->op->run(pack); } } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEChannelCombine.cpp b/src/runtime/NEON/functions/NEChannelCombine.cpp deleted file mode 100644 index 37e92c28fd..0000000000 --- a/src/runtime/NEON/functions/NEChannelCombine.cpp +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Copyright (c) 2017-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
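// NECast above now owns a cpu::CpuCast operator behind an Impl pointer and
// builds its ITensorPack in run(). A minimal sketch, assuming a U8 -> F32
// conversion, one of the pairs the cast operator supports:
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/NEON/functions/NECast.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void cast_sketch()
{
    Tensor u8_src, f32_dst;
    u8_src.allocator()->init(TensorInfo(TensorShape(256U), 1, DataType::U8));
    f32_dst.allocator()->init(TensorInfo(TensorShape(256U), 1, DataType::F32));

    NECast cast;
    cast.configure(&u8_src, &f32_dst, ConvertPolicy::SATURATE);

    u8_src.allocator()->allocate();
    f32_dst.allocator()->allocate();
    cast.run();
}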
- */ -#include "arm_compute/runtime/NEON/functions/NEChannelCombine.h" - -#include "arm_compute/core/NEON/kernels/NEChannelCombineKernel.h" -#include "support/MemorySupport.h" - -#include <utility> - -using namespace arm_compute; - -void NEChannelCombine::configure(const ITensor *plane0, const ITensor *plane1, const ITensor *plane2, const ITensor *plane3, ITensor *output) -{ - auto k = arm_compute::support::cpp14::make_unique<NEChannelCombineKernel>(); - k->configure(plane0, plane1, plane2, plane3, output); - _kernel = std::move(k); -} - -void NEChannelCombine::configure(const IImage *plane0, const IImage *plane1, const IImage *plane2, IMultiImage *output) -{ - auto k = arm_compute::support::cpp14::make_unique<NEChannelCombineKernel>(); - k->configure(plane0, plane1, plane2, output); - _kernel = std::move(k); -} diff --git a/src/runtime/NEON/functions/NEChannelExtract.cpp b/src/runtime/NEON/functions/NEChannelExtract.cpp deleted file mode 100644 index 37a9892d07..0000000000 --- a/src/runtime/NEON/functions/NEChannelExtract.cpp +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Copyright (c) 2017-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/NEON/functions/NEChannelExtract.h" - -#include "arm_compute/core/NEON/kernels/NEChannelExtractKernel.h" -#include "support/MemorySupport.h" - -#include <utility> - -using namespace arm_compute; - -void NEChannelExtract::configure(const ITensor *input, Channel channel, ITensor *output) -{ - auto k = arm_compute::support::cpp14::make_unique<NEChannelExtractKernel>(); - k->configure(input, channel, output); - _kernel = std::move(k); -} - -void NEChannelExtract::configure(const IMultiImage *input, Channel channel, IImage *output) -{ - auto k = arm_compute::support::cpp14::make_unique<NEChannelExtractKernel>(); - k->configure(input, channel, output); - _kernel = std::move(k); -} diff --git a/src/runtime/NEON/functions/NEChannelShuffleLayer.cpp b/src/runtime/NEON/functions/NEChannelShuffleLayer.cpp index 46d77831c4..86bee4dd43 100644 --- a/src/runtime/NEON/functions/NEChannelShuffleLayer.cpp +++ b/src/runtime/NEON/functions/NEChannelShuffleLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 ARM Limited. + * Copyright (c) 2018-2021 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -23,15 +23,17 @@ */ #include "arm_compute/runtime/NEON/functions/NEChannelShuffleLayer.h" -#include "arm_compute/core/NEON/kernels/NEChannelShuffleLayerKernel.h" #include "arm_compute/core/Types.h" -#include "support/MemorySupport.h" + +#include "src/common/utils/Log.h" +#include "src/core/NEON/kernels/NEChannelShuffleLayerKernel.h" namespace arm_compute { void NEChannelShuffleLayer::configure(const ITensor *input, ITensor *output, unsigned int num_groups) { - auto k = arm_compute::support::cpp14::make_unique<NEChannelShuffleLayerKernel>(); + ARM_COMPUTE_LOG_PARAMS(input, output, num_groups); + auto k = std::make_unique<NEChannelShuffleLayerKernel>(); k->configure(input, output, num_groups); _kernel = std::move(k); } diff --git a/src/runtime/NEON/functions/NEColorConvert.cpp b/src/runtime/NEON/functions/NEColorConvert.cpp deleted file mode 100644 index fff7633833..0000000000 --- a/src/runtime/NEON/functions/NEColorConvert.cpp +++ /dev/null @@ -1,59 +0,0 @@ -/* - * Copyright (c) 2016-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
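// A minimal sketch for the channel-shuffle wrapper above; num_groups must
// divide the channel count, so the 8-channel, 2-group split below is an
// illustrative assumption:
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/NEON/functions/NEChannelShuffleLayer.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void channel_shuffle_sketch()
{
    Tensor src, dst;
    const TensorInfo info(TensorShape(7U, 7U, 8U, 1U), 1, DataType::F32); // W, H, C, N
    src.allocator()->init(info);
    dst.allocator()->init(info);

    NEChannelShuffleLayer shuffle;
    shuffle.configure(&src, &dst, 2);

    src.allocator()->allocate();
    dst.allocator()->allocate();
    shuffle.run();
}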
- */ -#include "arm_compute/runtime/NEON/functions/NEColorConvert.h" - -#include "arm_compute/core/NEON/kernels/NEColorConvertKernel.h" -#include "support/MemorySupport.h" - -#include <utility> - -using namespace arm_compute; - -void NEColorConvert::configure(const ITensor *input, ITensor *output) -{ - auto k = arm_compute::support::cpp14::make_unique<NEColorConvertKernel>(); - k->configure(input, output); - _kernel = std::move(k); -} - -void NEColorConvert::configure(const IMultiImage *input, IImage *output) -{ - auto k = arm_compute::support::cpp14::make_unique<NEColorConvertKernel>(); - k->configure(input, output); - _kernel = std::move(k); -} - -void NEColorConvert::configure(const IImage *input, IMultiImage *output) -{ - auto k = arm_compute::support::cpp14::make_unique<NEColorConvertKernel>(); - k->configure(input, output); - _kernel = std::move(k); -} - -void NEColorConvert::configure(const IMultiImage *input, IMultiImage *output) -{ - auto k = arm_compute::support::cpp14::make_unique<NEColorConvertKernel>(); - k->configure(input, output); - _kernel = std::move(k); -} diff --git a/src/runtime/NEON/functions/NEConcatenateLayer.cpp b/src/runtime/NEON/functions/NEConcatenateLayer.cpp index 61d41b44f8..59a0892f1f 100644 --- a/src/runtime/NEON/functions/NEConcatenateLayer.cpp +++ b/src/runtime/NEON/functions/NEConcatenateLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 ARM Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,162 +23,69 @@ */ #include "arm_compute/runtime/NEON/functions/NEConcatenateLayer.h" -#include "arm_compute/core/NEON/kernels/NEBatchConcatenateLayerKernel.h" -#include "arm_compute/core/NEON/kernels/NEDepthConcatenateLayerKernel.h" -#include "arm_compute/core/NEON/kernels/NEHeightConcatenateLayerKernel.h" -#include "arm_compute/core/NEON/kernels/NEWidthConcatenateLayerKernel.h" - -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" - #include "arm_compute/core/Error.h" #include "arm_compute/core/ITensor.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" -#include "support/MemorySupport.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + +#include "src/core/helpers/AutoConfiguration.h" +#include "src/cpu/operators/CpuConcatenate.h" namespace arm_compute { -NEConcatenateLayer::NEConcatenateLayer() - : _concat_kernels(), - _num_inputs(0), - _axis(Window::DimX) +struct NEConcatenateLayer::Impl { -} - -void NEConcatenateLayer::configure(std::vector<ITensor *> inputs_vector, ITensor *output, size_t axis) + std::vector<const ITensor *> srcs{}; + ITensor *dst{nullptr}; + unsigned int num_inputs{0}; + unsigned int axis{0}; + std::unique_ptr<cpu::CpuConcatenate> op{nullptr}; +}; + +NEConcatenateLayer::NEConcatenateLayer() : _impl(std::make_unique<Impl>()) { - configure_internal(std::move(inputs_vector), output, axis); } +NEConcatenateLayer::NEConcatenateLayer(NEConcatenateLayer &&) = default; +NEConcatenateLayer &NEConcatenateLayer::operator=(NEConcatenateLayer &&) = default; +NEConcatenateLayer::~NEConcatenateLayer() = default; void NEConcatenateLayer::configure(std::vector<const ITensor *> inputs_vector, ITensor *output, size_t axis) { - configure_internal(std::move(inputs_vector), output, axis); -} - -Status NEConcatenateLayer::validate(const std::vector<ITensorInfo *> &inputs_vector, const ITensorInfo *output, size_t axis) -{ - return validate_internal(inputs_vector, output, axis); 
-} - -Status NEConcatenateLayer::validate(const std::vector<const ITensorInfo *> &inputs_vector, const ITensorInfo *output, size_t axis) -{ - return validate_internal(inputs_vector, output, axis); -} - -template <typename TensorType, typename> -void NEConcatenateLayer::configure_internal(std::vector<TensorType *> &&inputs_vector, ITensor *output, size_t axis) -{ ARM_COMPUTE_ERROR_ON(output == nullptr); - _axis = axis; - _num_inputs = inputs_vector.size(); - std::vector<ITensorInfo *> inputs_vector_info; - inputs_vector_info.reserve(_num_inputs); - for(unsigned int i = 0; i < _num_inputs; ++i) + _impl->srcs = inputs_vector; + _impl->dst = output; + _impl->axis = axis; + _impl->num_inputs = inputs_vector.size(); + _impl->op = std::make_unique<cpu::CpuConcatenate>(); + + std::vector<const ITensorInfo *> inputs_vector_info; + for (unsigned int i = 0; i < inputs_vector.size(); ++i) { ARM_COMPUTE_ERROR_ON_NULLPTR(inputs_vector.at(i)); inputs_vector_info.emplace_back(inputs_vector.at(i)->info()); } - TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, _axis); - - // Output auto inizialitation if not yet initialized - auto_init_if_empty(*output->info(), output_shape, 1, inputs_vector[0]->info()->data_type()); - ARM_COMPUTE_ERROR_THROW_ON(NEConcatenateLayer::validate(inputs_vector_info, output->info(), axis)); - - unsigned int offset = 0; - - for(unsigned int i = 0; i < _num_inputs; ++i) - { - switch(_axis) - { - case Window::DimX: - { - auto kernel = support::cpp14::make_unique<NEWidthConcatenateLayerKernel>(); - kernel->configure(inputs_vector.at(i), offset, output); - _concat_kernels.emplace_back(std::move(kernel)); - break; - } - case Window::DimY: - { - auto kernel = support::cpp14::make_unique<NEHeightConcatenateLayerKernel>(); - kernel->configure(inputs_vector.at(i), offset, output); - _concat_kernels.emplace_back(std::move(kernel)); - break; - } - case Window::DimZ: - { - auto kernel = support::cpp14::make_unique<NEDepthConcatenateLayerKernel>(); - kernel->configure(inputs_vector.at(i), offset, output); - _concat_kernels.emplace_back(std::move(kernel)); - break; - } - case 3: - { - auto kernel = support::cpp14::make_unique<NEBatchConcatenateLayerKernel>(); - kernel->configure(inputs_vector.at(i), offset, output); - _concat_kernels.emplace_back(std::move(kernel)); - break; - } - default: - ARM_COMPUTE_ERROR("Axis not supported"); - } - offset += inputs_vector.at(i)->info()->dimension(_axis); - } + _impl->op->configure(inputs_vector_info, _impl->dst->info(), axis); } -template <typename TensorInfoType, typename> -Status NEConcatenateLayer::validate_internal(const std::vector<TensorInfoType *> &inputs_vector, const ITensorInfo *output, size_t axis) +Status NEConcatenateLayer::validate(const std::vector<const ITensorInfo *> &inputs_vector, + const ITensorInfo *output, + size_t axis) { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output); - ARM_COMPUTE_RETURN_ERROR_ON(inputs_vector.size() < 2); - - unsigned int offset = 0; - for(const auto &input : inputs_vector) - { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input); - switch(axis) - { - case Window::DimX: - { - ARM_COMPUTE_RETURN_ON_ERROR(NEWidthConcatenateLayerKernel::validate(input, offset, output)); - break; - } - case Window::DimY: - { - ARM_COMPUTE_RETURN_ON_ERROR(NEHeightConcatenateLayerKernel::validate(input, offset, output)); - break; - } - case Window::DimZ: - { - ARM_COMPUTE_RETURN_ON_ERROR(NEDepthConcatenateLayerKernel::validate(input, offset, output)); - break; - } - case 3: - { - 
ARM_COMPUTE_RETURN_ON_ERROR(NEBatchConcatenateLayerKernel::validate(input, offset, output)); - break; - } - default: - ARM_COMPUTE_ERROR("Axis not supported"); - } - offset += input->dimension(axis); - } - - if(output->total_size() != 0) - { - TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, axis); - ARM_COMPUTE_RETURN_ERROR_ON(output_shape.total_size() != output->tensor_shape().total_size()); - } - - return Status{}; + return cpu::CpuConcatenate::validate(inputs_vector, output, axis); } void NEConcatenateLayer::run() { - for(auto &kernel : _concat_kernels) + ITensorPack pack; + for (unsigned i = 0; i < _impl->num_inputs; ++i) { - NEScheduler::get().schedule(kernel.get(), Window::DimY); + pack.add_tensor(TensorType::ACL_SRC_VEC + i, _impl->srcs.at(i)); } + pack.add_tensor(TensorType::ACL_DST, _impl->dst); + + _impl->op->run(pack); } } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEConv3D.cpp b/src/runtime/NEON/functions/NEConv3D.cpp new file mode 100644 index 0000000000..8f41151d6c --- /dev/null +++ b/src/runtime/NEON/functions/NEConv3D.cpp @@ -0,0 +1,87 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/NEON/functions/NEConv3D.h" + +#include "arm_compute/core/PixelValue.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" + +#include "src/common/utils/Log.h" +#include "src/cpu/operators/CpuDirectConv3d.h" + +namespace arm_compute +{ +using namespace arm_compute::experimental; + +struct NEConv3D::Impl +{ + std::unique_ptr<cpu::ICpuOperator> op{nullptr}; + ITensorPack run_pack{}; +}; + +NEConv3D::NEConv3D() : _impl(std::make_unique<Impl>()) +{ +} + +NEConv3D::~NEConv3D() = default; + +void NEConv3D::configure( + ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const Conv3dInfo &conv_info) +{ + // Perform validate step + ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); + ARM_COMPUTE_ERROR_THROW_ON(cpu::CpuDirectConv3d::validate( + input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info)); + ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info); + + auto f = std::make_unique<cpu::CpuDirectConv3d>(); + f->configure(input->info(), weights->info(), ((biases != nullptr) ? 
biases->info() : nullptr), output->info(), + conv_info); + _impl->op = std::move(f); + + if (_impl->op != nullptr) + { + _impl->run_pack = {{ACL_SRC_0, input}, {ACL_SRC_1, weights}, {ACL_SRC_2, biases}, {ACL_DST, output}}; + } +} + +Status NEConv3D::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const Conv3dInfo &conv_info) +{ + ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuDirectConv3d::validate(input, weights, biases, output, conv_info)); + + return Status{}; +} + +void NEConv3D::run() +{ + if (_impl->op != nullptr) + { + _impl->op->run(_impl->run_pack); + } +} +} // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEConvertFullyConnectedWeights.cpp b/src/runtime/NEON/functions/NEConvertFullyConnectedWeights.cpp index f65c035da6..84e8565aaf 100644 --- a/src/runtime/NEON/functions/NEConvertFullyConnectedWeights.cpp +++ b/src/runtime/NEON/functions/NEConvertFullyConnectedWeights.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 ARM Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,27 +23,49 @@ */ #include "arm_compute/runtime/NEON/functions/NEConvertFullyConnectedWeights.h" +#include "arm_compute/core/Validate.h" + +#include "src/cpu/operators/CpuConvertFullyConnectedWeights.h" + namespace arm_compute { -NEConvertFullyConnectedWeights::NEConvertFullyConnectedWeights() - : _kernel() +struct NEConvertFullyConnectedWeights::Impl +{ + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr<cpu::CpuConvertFullyConnectedWeights> op{nullptr}; +}; +NEConvertFullyConnectedWeights::NEConvertFullyConnectedWeights() : _impl(std::make_unique<Impl>()) { } +NEConvertFullyConnectedWeights::~NEConvertFullyConnectedWeights() = default; -void NEConvertFullyConnectedWeights::configure(const ITensor *input, ITensor *output, const TensorShape &original_input_shape, - DataLayout data_layout) +void NEConvertFullyConnectedWeights::configure(const ITensor *input, + ITensor *output, + const TensorShape &original_input_shape, + DataLayout data_layout) { - _kernel.configure(input, output, original_input_shape, data_layout); + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + + _impl->src = input; + _impl->dst = output; + _impl->op = std::make_unique<cpu::CpuConvertFullyConnectedWeights>(); + _impl->op->configure(_impl->src->info(), _impl->dst->info(), original_input_shape, data_layout); } -Status NEConvertFullyConnectedWeights::validate(const ITensorInfo *input, const ITensorInfo *output, const TensorShape &original_input_shape, - DataLayout data_layout) +Status NEConvertFullyConnectedWeights::validate(const ITensorInfo *input, + const ITensorInfo *output, + const TensorShape &original_input_shape, + DataLayout data_layout) { - return NEConvertFullyConnectedWeightsKernel::validate(input, output, original_input_shape, data_layout); + return cpu::CpuConvertFullyConnectedWeights::validate(input, output, original_input_shape, data_layout); } void NEConvertFullyConnectedWeights::run() { - NEScheduler::get().schedule(&_kernel, Window::DimZ); + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC, _impl->src); + pack.add_tensor(TensorType::ACL_DST, _impl->dst); + _impl->op->run(pack); } -} // namespace arm_compute
\ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEConvolution.cpp b/src/runtime/NEON/functions/NEConvolution.cpp deleted file mode 100644 index 255cb3d704..0000000000 --- a/src/runtime/NEON/functions/NEConvolution.cpp +++ /dev/null @@ -1,127 +0,0 @@ -/* - * Copyright (c) 2016-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/NEON/functions/NEConvolution.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/kernels/NEConvolutionKernel.h" -#include "arm_compute/core/PixelValue.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" -#include "arm_compute/runtime/TensorAllocator.h" -#include "support/MemorySupport.h" - -#include <array> -#include <utility> - -using namespace arm_compute; - -void NEConvolution3x3::configure(ITensor *input, ITensor *output, const int16_t *conv, uint32_t scale, BorderMode border_mode, uint8_t constant_border_value) -{ - auto k = arm_compute::support::cpp14::make_unique<NEConvolution3x3Kernel>(); - k->configure(input, output, conv, scale, border_mode == BorderMode::UNDEFINED); - _kernel = std::move(k); - _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); -} - -template <unsigned int matrix_size> -NEConvolutionSquare<matrix_size>::NEConvolutionSquare(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _tmp(), _is_separable(false), _kernel_hor(), _kernel_vert(), _kernel(), _border_handler() -{ -} - -template <unsigned int matrix_size> -void NEConvolutionSquare<matrix_size>::configure(ITensor *input, ITensor *output, const int16_t *conv, uint32_t scale, BorderMode border_mode, - uint8_t constant_border_value) -{ - ARM_COMPUTE_ERROR_ON(conv == nullptr); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16); - - std::array<int16_t, matrix_size> conv_col{ { 0 } }; - std::array<int16_t, matrix_size> conv_row{ { 0 } }; - - _is_separable = separate_matrix(conv, conv_col.data(), conv_row.data(), matrix_size); - - if(_is_separable) - { - DataType intermediate_type = DataType::UNKNOWN; - std::tie(std::ignore, intermediate_type) = 
data_type_for_convolution(conv_col.data(), conv_row.data(), matrix_size); - - _tmp.allocator()->init(TensorInfo(input->info()->tensor_shape(), 1, intermediate_type)); - - // Manage intermediate buffers - _memory_group.manage(&_tmp); - - // Calculate scale - if(scale == 0) - { - scale = calculate_matrix_scale(conv, matrix_size); - } - - _kernel_hor.configure(input, &_tmp, conv_row.data(), border_mode == BorderMode::UNDEFINED); - _kernel_vert.configure(&_tmp, output, conv_col.data(), scale, border_mode == BorderMode::UNDEFINED); - - _tmp.allocator()->allocate(); - - _border_handler.configure(input, _kernel_hor.border_size(), border_mode, PixelValue(constant_border_value)); - } - else - { - _kernel.configure(input, output, conv, scale, border_mode == BorderMode::UNDEFINED); - _border_handler.configure(input, _kernel.border_size(), border_mode, PixelValue(constant_border_value)); - } -} - -template <unsigned int matrix_size> -void NEConvolutionSquare<matrix_size>::run() -{ - NEScheduler::get().schedule(&_border_handler, Window::DimZ); - - if(_is_separable) - { - MemoryGroupResourceScope scope_mg(_memory_group); - - NEScheduler::get().schedule(&_kernel_hor, Window::DimY); - NEScheduler::get().schedule(&_kernel_vert, Window::DimY); - } - else - { - NEScheduler::get().schedule(&_kernel, Window::DimY); - } -} - -template class arm_compute::NEConvolutionSquare<5>; -template class arm_compute::NEConvolutionSquare<7>; -template class arm_compute::NEConvolutionSquare<9>; - -void NEConvolutionRectangle::configure(ITensor *input, ITensor *output, const int16_t *conv, uint32_t rows, uint32_t cols, uint32_t scale, BorderMode border_mode, uint8_t constant_border_value) -{ - auto k = arm_compute::support::cpp14::make_unique<NEConvolutionRectangleKernel>(); - k->configure(input, output, conv, rows, cols, scale, border_mode == BorderMode::UNDEFINED); - _kernel = std::move(k); - _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); -} diff --git a/src/runtime/NEON/functions/NEConvolutionLayer.cpp b/src/runtime/NEON/functions/NEConvolutionLayer.cpp index 4a779917a7..8efebbbb1a 100644 --- a/src/runtime/NEON/functions/NEConvolutionLayer.cpp +++ b/src/runtime/NEON/functions/NEConvolutionLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 ARM Limited. + * Copyright (c) 2017-2021, 2023 Arm Limited. 
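// A minimal sketch for the new NEConv3D function added above, assuming NDHWC
// F32 tensors and a default-constructed Conv3dInfo (unit strides, no padding,
// no dilation); the shapes, including the [OFM, IFM, W, H, D] weight layout,
// are illustrative assumptions:
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/NEON/functions/NEConv3D.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void conv3d_sketch()
{
    Tensor src, weights, biases, dst;
    TensorInfo src_info(TensorShape(4U, 8U, 8U, 8U, 1U), 1, DataType::F32); // C, W, H, D, N
    TensorInfo wei_info(TensorShape(8U, 4U, 3U, 3U, 3U), 1, DataType::F32); // OFM, IFM, W, H, D
    TensorInfo dst_info(TensorShape(8U, 6U, 6U, 6U, 1U), 1, DataType::F32); // 8 - 3 + 1 = 6 per spatial dim
    for (TensorInfo *ti : {&src_info, &wei_info, &dst_info})
    {
        ti->set_data_layout(DataLayout::NDHWC);
    }
    src.allocator()->init(src_info);
    weights.allocator()->init(wei_info);
    biases.allocator()->init(TensorInfo(TensorShape(8U), 1, DataType::F32));
    dst.allocator()->init(dst_info);

    NEConv3D conv;
    conv.configure(&src, &weights, &biases, &dst, Conv3dInfo{});
    for (Tensor *t : {&src, &weights, &biases, &dst})
    {
        t->allocator()->allocate();
    }
    conv.run();
}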
* * SPDX-License-Identifier: MIT * @@ -25,174 +25,184 @@ #include "arm_compute/core/PixelValue.h" #include "arm_compute/core/Utils.h" +#include "arm_compute/core/utils/DataTypeUtils.h" #include "arm_compute/core/Validate.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" -#include "support/MemorySupport.h" +#include "arm_compute/runtime/NEON/functions/NEFFTConvolutionLayer.h" -#include <cmath> -#include <tuple> -#include <utility> +#include "src/common/utils/Log.h" +#include "src/core/helpers/MemoryHelpers.h" +#include "src/cpu/operators/CpuConv2d.h" +#include "src/cpu/operators/CpuDirectConv2d.h" +#include "src/cpu/operators/CpuGemmConv2d.h" +#include "src/cpu/operators/CpuGemmDirectConv2d.h" +#include "src/cpu/operators/CpuWinogradConv2d.h" namespace arm_compute { -NEConvolutionLayer::NEConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager) //NOLINT - : _memory_manager(std::move(memory_manager)), - _function() +using namespace arm_compute::experimental; + +struct NEConvolutionLayer::Impl +{ + MemoryGroup memory_group{}; + std::shared_ptr<IMemoryManager> memory_manager{}; + std::unique_ptr<cpu::ICpuOperator> op{nullptr}; + ITensorPack run_pack{}; + ITensorPack prep_pack{}; + WorkspaceData<Tensor> workspace{}; + experimental::MemoryRequirements aux_mem_req{}; + std::unique_ptr<IFunction> func{nullptr}; +}; + +NEConvolutionLayer::NEConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager) : _impl(std::make_unique<Impl>()) { + _impl->memory_manager = std::move(memory_manager); } -void NEConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info, - const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups) +NEConvolutionLayer::~NEConvolutionLayer() = default; + +void NEConvolutionLayer::configure(ITensor *input, + const ITensor *weights, + const ITensor *biases, + ITensor *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info, + bool enable_fast_math, + unsigned int num_groups) { // Perform validate step ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); ARM_COMPUTE_UNUSED(num_groups); - ARM_COMPUTE_ERROR_THROW_ON(NEConvolutionLayer::validate(input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info, weights_info, dilation, act_info, - enable_fast_math)); - - switch(NEConvolutionLayer::get_convolution_method(input->info(), weights->info(), output->info(), conv_info, weights_info, dilation, act_info, enable_fast_math)) + ARM_COMPUTE_ERROR_THROW_ON(NEConvolutionLayer::validate( + input->info(), weights->info(), ((biases != nullptr) ? 
biases->info() : nullptr), output->info(), conv_info, + weights_info, dilation, act_info, enable_fast_math, num_groups)); + ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, weights_info, dilation, act_info, + enable_fast_math, num_groups); + + const Conv2dInfo info(conv_info, dilation, act_info, enable_fast_math, num_groups); + switch (cpu::CpuConv2d::get_convolution_method(input->info(), weights->info(), output->info(), conv_info, + weights_info, dilation, act_info, enable_fast_math)) { case ConvolutionMethod::WINOGRAD: - { - auto f = arm_compute::support::cpp14::make_unique<NEWinogradConvolutionLayer>(_memory_manager); - f->configure(input, weights, biases, output, conv_info, act_info, enable_fast_math); - _function = std::move(f); - break; - } case ConvolutionMethod::GEMM: - { - auto f = arm_compute::support::cpp14::make_unique<NEGEMMConvolutionLayer>(_memory_manager); - f->configure(input, weights, biases, output, conv_info, weights_info, dilation, act_info); - _function = std::move(f); - break; - } + case ConvolutionMethod::GEMM_CONV2D: case ConvolutionMethod::DIRECT: { - auto f = arm_compute::support::cpp14::make_unique<NEDirectConvolutionLayer>(_memory_manager); - f->configure(input, weights, biases, output, conv_info, act_info); - _function = std::move(f); + auto f = std::make_unique<cpu::CpuConv2d>(); + f->configure(input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), + output->info(), conv_info, weights_info, dilation, act_info, enable_fast_math, num_groups); + _impl->op = std::move(f); break; } case ConvolutionMethod::FFT: { - auto f = arm_compute::support::cpp14::make_unique<NEFFTConvolutionLayer>(_memory_manager); + auto f = std::make_unique<NEFFTConvolutionLayer>(_impl->memory_manager); f->configure(input, weights, biases, output, conv_info, act_info); - _function = std::move(f); + _impl->func = std::move(f); break; } default: ARM_COMPUTE_ERROR("Not supported."); break; } + + if (_impl->op) + { + _impl->memory_group = MemoryGroup(std::move(_impl->memory_manager)); + _impl->aux_mem_req = _impl->op->workspace(); + _impl->run_pack = {{ACL_SRC_0, input}, {ACL_SRC_1, weights}, {ACL_SRC_2, biases}, {ACL_DST, output}}; + _impl->prep_pack = {{ACL_SRC_1, weights}, {ACL_SRC_2, biases}}; + _impl->workspace = + manage_workspace<Tensor>(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack); + } } -Status NEConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups) +Status NEConvolutionLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info, + bool enable_fast_math, + unsigned int num_groups) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG((num_groups != 1), "Grouping (num_groups != 1) is not supported on NEON"); + const Conv2dInfo info(conv_info, dilation, act_info, enable_fast_math, num_groups); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(!weights->are_values_constant(), "Dynamic weights are not supported"); + + // Biases with dynamic values are not supported with quantized inputs. 
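Editor's note: the rewritten NEConvolutionLayer above keeps its state in a private Impl and drives a stateless cpu operator through tensor packs (run_pack for execution, prep_pack for one-off weight preparation). A minimal standalone sketch of that function-wraps-operator pattern; all types here are hypothetical stand-ins, not ACL's own:

#include <map>
#include <memory>

// Hypothetical stand-ins for ITensor / ITensorPack / ICpuOperator.
struct TensorStub { /* buffer, shape, ... */ };
using TensorPack = std::map<int, TensorStub *>; // slot id -> tensor

struct OperatorStub
{
    // Stateless: every run() reads its inputs and outputs from the pack,
    // so one configured operator can serve many callers concurrently.
    void run(TensorPack &pack) { (void)pack; /* kernel dispatch here */ }
};

class FunctionStub
{
public:
    void configure(TensorStub *src, TensorStub *dst)
    {
        _pack = {{/*ACL_SRC*/ 0, src}, {/*ACL_DST*/ 1, dst}};
        _op   = std::make_unique<OperatorStub>();
    }
    void run() { _op->run(_pack); } // the function owns the state, not the operator

private:
    TensorPack _pack{};
    std::unique_ptr<OperatorStub> _op{};
};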
+ if (biases) + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG((!biases->are_values_constant() && is_data_type_quantized(input->data_type())), + "Dynamic Biases are not supported with quantized input data."); + } - switch(NEConvolutionLayer::get_convolution_method(input, weights, output, conv_info, weights_info, dilation, act_info, enable_fast_math)) + switch (cpu::CpuConv2d::get_convolution_method(input, weights, output, conv_info, weights_info, dilation, act_info, + enable_fast_math)) { case ConvolutionMethod::WINOGRAD: - //Validate Winograd - ARM_COMPUTE_RETURN_ON_ERROR(NEWinogradConvolutionLayer::validate(input, weights, biases, output, conv_info, act_info, enable_fast_math)); - break; case ConvolutionMethod::GEMM: - //Validate Gemm-based Convolution - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMConvolutionLayer::validate(input, weights, biases, output, conv_info, weights_info, dilation, act_info)); - break; + case ConvolutionMethod::GEMM_CONV2D: case ConvolutionMethod::DIRECT: - //Validate Direct Convolution - ARM_COMPUTE_RETURN_ON_ERROR(NEDirectConvolutionLayer::validate(input, weights, biases, output, conv_info, act_info)); + ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuConv2d::validate(input, weights, biases, output, conv_info, + weights_info, dilation, act_info, enable_fast_math, + num_groups)); break; case ConvolutionMethod::FFT: - // Validate FFT-based convolution layer - ARM_COMPUTE_RETURN_ON_ERROR(NEFFTConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEFFTConvolutionLayer::validate(input, weights, biases, output, conv_info, act_info)); break; default: ARM_COMPUTE_ERROR("Not supported."); break; } - return Status{}; } -ConvolutionMethod NEConvolutionLayer::get_convolution_method(const ITensorInfo *input, const ITensorInfo *weights, - const ITensorInfo *output, const PadStrideInfo &conv_info, - const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math) +ConvolutionMethod NEConvolutionLayer::get_convolution_method(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info, + bool enable_fast_math) { - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, weights); - ARM_COMPUTE_UNUSED(weights_info); - - const size_t idx_w = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH); - const size_t idx_h = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT); - const size_t idx_c = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL); - - /* Input spatial dims, kernel size, IFM/OFM, conv info*/ - using ConvolutionConfiguration = std::tuple<Size2D, Size2D, Size2D, PadStrideInfo>; - using ConfigurationMethod = std::pair<ConvolutionConfiguration, ConvolutionMethod>; - - const std::vector<ConfigurationMethod> known_configs = - { - // Alexnet - ConfigurationMethod(ConvolutionConfiguration(Size2D(27U, 27U), Size2D(5U, 5U), Size2D(48U, 128U), PadStrideInfo(1U, 1U, 2U, 2U)), ConvolutionMethod::GEMM), - // VGG16 / VGG19 - ConfigurationMethod(ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 64U), PadStrideInfo(1U, 1U, 1U, 1U)), ConvolutionMethod::GEMM), - // Mobilenet 224 - ConfigurationMethod(ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 32U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR)), 
ConvolutionMethod::GEMM), - // Mobilenet 160 - ConfigurationMethod(ConvolutionConfiguration(Size2D(160U, 160U), Size2D(3U, 3U), Size2D(3U, 24U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR)), ConvolutionMethod::GEMM) - }; - - const auto find_config = [&](ConfigurationMethod c) - { - const ConvolutionConfiguration config = c.first; - const PadStrideInfo info = std::get<3>(config); + return cpu::CpuConv2d::get_convolution_method(input, weights, output, conv_info, weights_info, dilation, act_info, + enable_fast_math); +} - return std::get<0>(config) == Size2D(input->dimension(idx_w), input->dimension(idx_h)) && std::get<1>(config) == Size2D(weights->dimension(idx_w), weights->dimension(idx_h)) - && std::get<2>(config) == Size2D(weights->dimension(idx_c), weights->dimension(3)) && info.pad_top() == conv_info.pad_top() && info.pad_right() == conv_info.pad_right() - && info.pad_bottom() == conv_info.pad_bottom() && info.pad_left() == conv_info.pad_left() && info.stride() == conv_info.stride(); - }; +void NEConvolutionLayer::run() +{ + prepare(); - std::vector<ConfigurationMethod>::const_iterator found; - if((found = std::find_if(known_configs.begin(), known_configs.end(), find_config)) != known_configs.end()) - { - return (*found).second; - } + MemoryGroupResourceScope scope_mg(_impl->memory_group); - if(dilation != Size2D(1U, 1U)) + if (_impl->func) { - return ConvolutionMethod::GEMM; + _impl->func->run(); } else { - // SRGAN - // Output might not be initialized when it is an internal tensor of the layer using the convolution - if(input->total_size() > 1e7 && (weights->dimension(idx_h) > 7) - && (NEDirectConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info))) - { - return ConvolutionMethod::DIRECT; - } - if((weights->dimension(idx_h) > 7) && (input->dimension(idx_c) > output->dimension(idx_c)) && (NEFFTConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info))) - { - return ConvolutionMethod::FFT; - } - if(input->dimension(idx_c) < 16) - { - return ConvolutionMethod::GEMM; - } - return bool(NEWinogradConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info, enable_fast_math)) ? ConvolutionMethod::WINOGRAD : ConvolutionMethod::GEMM; + _impl->op->run(_impl->run_pack); } } -void NEConvolutionLayer::run() -{ - prepare(); - _function->run(); -} - void NEConvolutionLayer::prepare() { - _function->prepare(); + if (_impl->func) + { + _impl->func->prepare(); + } + else + { + _impl->op->prepare(_impl->prep_pack); + + // Release temporary tensors that are only used in prepare stage + release_temporaries<Tensor>(_impl->aux_mem_req, _impl->workspace); + } } } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NECopy.cpp b/src/runtime/NEON/functions/NECopy.cpp index 55c4faf9ab..c975d3a5b5 100644 --- a/src/runtime/NEON/functions/NECopy.cpp +++ b/src/runtime/NEON/functions/NECopy.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 ARM Limited. + * Copyright (c) 2017-2021 Arm Limited. 
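Editor's note: the heuristic deleted above first matched the layer against a table of known configurations (AlexNet, VGG16/19, MobileNet) before applying size-based rules; that logic now lives behind cpu::CpuConv2d::get_convolution_method. A compact sketch of the lookup-then-fallback style, with hypothetical types and placeholder entries:

#include <algorithm>
#include <utility>
#include <vector>

enum class Method { GEMM, DIRECT, WINOGRAD };

struct ShapeKey
{
    int in_w, in_h, k_w, k_h;
    bool operator==(const ShapeKey &o) const
    {
        return in_w == o.in_w && in_h == o.in_h && k_w == o.k_w && k_h == o.k_h;
    }
};

Method pick_method(const ShapeKey &query)
{
    // Known-good (configuration -> method) pairs, consulted before any
    // generic size-based rule.
    static const std::vector<std::pair<ShapeKey, Method>> known = {
        {{224, 224, 3, 3}, Method::GEMM}, // a VGG-style first layer
        {{27, 27, 5, 5}, Method::GEMM},   // an AlexNet-style layer
    };
    const auto it = std::find_if(known.begin(), known.end(),
                                 [&](const std::pair<ShapeKey, Method> &e) { return e.first == query; });
    return it != known.end() ? it->second : Method::WINOGRAD; // else: generic rules
}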
* * SPDX-License-Identifier: MIT * @@ -23,22 +23,51 @@ */ #include "arm_compute/runtime/NEON/functions/NECopy.h" -#include "arm_compute/core/NEON/kernels/NECopyKernel.h" -#include "support/MemorySupport.h" +#include "arm_compute/core/Validate.h" + +#include "src/cpu/operators/CpuCopy.h" #include <utility> namespace arm_compute { +struct NECopy::Impl +{ + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr<cpu::CpuCopy> op{nullptr}; +}; + +NECopy::NECopy() : _impl(std::make_unique<Impl>()) +{ +} +NECopy::NECopy(NECopy &&) = default; +NECopy &NECopy::operator=(NECopy &&) = default; +NECopy::~NECopy() = default; + void NECopy::configure(ITensor *input, ITensor *output) { - auto k = arm_compute::support::cpp14::make_unique<NECopyKernel>(); - k->configure(input, output); - _kernel = std::move(k); + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + + _impl->src = input; + _impl->dst = output; + _impl->op = std::make_unique<cpu::CpuCopy>(); + _impl->op->configure(input->info(), output->info()); +} + +Status NECopy::validate(const ITensorInfo *input, const ITensorInfo *output) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuCopy::validate(input, output)); + + return Status{}; } -Status NECopy::validate(const arm_compute::ITensorInfo *input, const arm_compute::ITensorInfo *output) +void NECopy::run() { - return NECopyKernel::validate(input, output); + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC, _impl->src); + pack.add_tensor(TensorType::ACL_DST, _impl->dst); + _impl->op->run(pack); } } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NECropResize.cpp b/src/runtime/NEON/functions/NECropResize.cpp index cc39d0284e..a94b0882da 100644 --- a/src/runtime/NEON/functions/NECropResize.cpp +++ b/src/runtime/NEON/functions/NECropResize.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 ARM Limited. + * Copyright (c) 2019-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,27 +21,47 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
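Editor's note: NECopy above declares its destructor and move operations in the header but defaults them in the .cpp; with std::unique_ptr<Impl> this ordering matters, because the destructor must be instantiated in a translation unit where Impl is complete. A self-contained sketch of the idiom, using a hypothetical Widget class rather than ACL code:

#include <memory>

// Header part: Impl is only forward-declared, so the compiler must not
// try to generate ~Widget() here.
class Widget
{
public:
    Widget();
    ~Widget();                   // defaulted in the .cpp
    Widget(Widget &&) noexcept;  // ditto for the move operations
    Widget &operator=(Widget &&) noexcept;

private:
    struct Impl;                 // incomplete type
    std::unique_ptr<Impl> _impl;
};

// .cpp part: Impl is complete from here on.
struct Widget::Impl
{
    int state{0};
};

Widget::Widget() : _impl(std::make_unique<Impl>()) {}
Widget::~Widget() = default; // unique_ptr's deleter now sees a complete Impl
Widget::Widget(Widget &&) noexcept = default;
Widget &Widget::operator=(Widget &&) noexcept = default;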
*/ +#include "arm_compute/runtime/NEON/functions/NECropResize.h" + #include "arm_compute/runtime/NEON/NEScheduler.h" +#include "arm_compute/runtime/Tensor.h" -#include "arm_compute/runtime/NEON/functions/NECropResize.h" +#include "src/common/utils/Log.h" +#include "src/core/NEON/kernels/NECropKernel.h" #include <cstddef> namespace arm_compute { +NECropResize::~NECropResize() = default; + NECropResize::NECropResize() - : _output(nullptr), _num_boxes(0), _method(), _extrapolation_value(0), _crop(), _scale(), _crop_results(), _scaled_results() + : _output(nullptr), + _num_boxes(0), + _method(), + _extrapolation_value(0), + _crop(), + _scale(), + _crop_results(), + _scaled_results() { } -Status NECropResize::validate(const ITensorInfo *input, const ITensorInfo *boxes, const ITensorInfo *box_ind, const ITensorInfo *output, - Coordinates2D crop_size, InterpolationPolicy method, float extrapolation_value) +Status NECropResize::validate(const ITensorInfo *input, + const ITensorInfo *boxes, + const ITensorInfo *box_ind, + const ITensorInfo *output, + Coordinates2D crop_size, + InterpolationPolicy method, + float extrapolation_value) { ARM_COMPUTE_RETURN_ERROR_ON(crop_size.x <= 0 || crop_size.y <= 0); ARM_COMPUTE_RETURN_ERROR_ON(method == InterpolationPolicy::AREA); TensorInfo temp_info; - ARM_COMPUTE_RETURN_ON_ERROR(NECropKernel::validate(input->clone().get(), boxes->clone().get(), box_ind->clone().get(), &temp_info, boxes->tensor_shape()[1] - 1, extrapolation_value)); - if(output->total_size() > 0) + ARM_COMPUTE_RETURN_ON_ERROR(NECropKernel::validate(input->clone().get(), boxes->clone().get(), + box_ind->clone().get(), &temp_info, boxes->tensor_shape()[1] - 1, + extrapolation_value)); + if (output->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(output, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output); @@ -51,11 +71,18 @@ Status NECropResize::validate(const ITensorInfo *input, const ITensorInfo *boxes return Status{}; } -void NECropResize::configure(const ITensor *input, const ITensor *boxes, const ITensor *box_ind, ITensor *output, Coordinates2D crop_size, - InterpolationPolicy method, float extrapolation_value) +void NECropResize::configure(const ITensor *input, + const ITensor *boxes, + const ITensor *box_ind, + ITensor *output, + Coordinates2D crop_size, + InterpolationPolicy method, + float extrapolation_value) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_ERROR_THROW_ON(NECropResize::validate(input->info(), boxes->info(), box_ind->info(), output->info(), crop_size, method, extrapolation_value)); + ARM_COMPUTE_ERROR_THROW_ON(NECropResize::validate(input->info(), boxes->info(), box_ind->info(), output->info(), + crop_size, method, extrapolation_value)); + ARM_COMPUTE_LOG_PARAMS(input, boxes, box_ind, output, crop_size, method, extrapolation_value); _num_boxes = boxes->info()->tensor_shape()[1]; TensorShape out_shape(input->info()->tensor_shape()[0], crop_size.x, crop_size.y); @@ -75,20 +102,20 @@ void NECropResize::configure(const ITensor *input, const ITensor *boxes, const I _scaled_results.reserve(_num_boxes); _scale.reserve(_num_boxes); - for(unsigned int i = 0; i < _num_boxes; ++i) + for (unsigned int i = 0; i < _num_boxes; ++i) { - auto crop_tensor = support::cpp14::make_unique<Tensor>(); + auto crop_tensor = std::make_unique<Tensor>(); TensorInfo crop_result_info(1, DataType::F32); crop_result_info.set_data_layout(DataLayout::NHWC); crop_tensor->allocator()->init(crop_result_info); - auto scale_tensor = 
support::cpp14::make_unique<Tensor>(); + auto scale_tensor = std::make_unique<Tensor>(); TensorInfo scaled_result_info(out_shape, 1, DataType::F32); scaled_result_info.set_data_layout(DataLayout::NHWC); scale_tensor->allocator()->init(scaled_result_info); - auto crop_kernel = support::cpp14::make_unique<NECropKernel>(); - auto scale_kernel = support::cpp14::make_unique<NEScale>(); + auto crop_kernel = std::make_unique<NECropKernel>(); + auto scale_kernel = std::make_unique<NEScale>(); crop_kernel->configure(input, boxes, box_ind, crop_tensor.get(), i, _extrapolation_value); _crop.emplace_back(std::move(crop_kernel)); @@ -102,7 +129,7 @@ void NECropResize::run() { ARM_COMPUTE_ERROR_ON_MSG(_output == nullptr, "Unconfigured function"); - for(unsigned int i = 0; i < _num_boxes; ++i) + for (unsigned int i = 0; i < _num_boxes; ++i) { // Size of the crop box in _boxes and thus the shape of _crop_results[i] // may not be known until run-time and so the kernels cannot be configured until then. @@ -111,12 +138,15 @@ void NECropResize::run() NEScheduler::get().schedule(_crop[i].get(), Window::DimZ); // Scale the cropped image. - _scale[i]->configure(_crop_results[i].get(), _scaled_results[i].get(), _method, BorderMode::CONSTANT, PixelValue(_extrapolation_value), SamplingPolicy::TOP_LEFT, false); + _scale[i]->configure(_crop_results[i].get(), _scaled_results[i].get(), + ScaleKernelInfo{_method, BorderMode::CONSTANT, PixelValue(_extrapolation_value), + SamplingPolicy::TOP_LEFT, false}); _scaled_results[i]->allocator()->allocate(); _scale[i]->run(); // Copy scaled image into output. - std::copy_n(_scaled_results[i]->buffer(), _scaled_results[i]->info()->total_size(), _output->ptr_to_element(Coordinates(0, 0, 0, i))); + std::copy_n(_scaled_results[i]->buffer(), _scaled_results[i]->info()->total_size(), + _output->ptr_to_element(Coordinates(0, 0, 0, i))); } } -} // namespace arm_compute
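Editor's note: NECropResize is unusual in that it configures and allocates per-box intermediates inside run(), since each crop rectangle lives in the boxes tensor and the intermediate shapes are only known once that tensor holds data. A sketch of the deferred-configure loop under that assumption, with hypothetical types:

#include <cstddef>
#include <vector>

struct Box { int w{0}, h{0}; };

struct ResizeOp
{
    void configure(const Box &b) { w = b.w; h = b.h; } // shape-dependent setup
    void run() { /* resample the w x h region */ }
    int w{0}, h{0};
};

void run_crop_resize(const std::vector<Box> &boxes, std::vector<ResizeOp> &ops)
{
    for (std::size_t i = 0; i < boxes.size(); ++i)
    {
        ops[i].configure(boxes[i]); // deferred: only now is the shape known
        ops[i].run();
    }
}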
\ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp b/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp index dd53fbbdc3..081c7cc538 100644 --- a/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp +++ b/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 ARM Limited. + * Copyright (c) 2017-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -25,10 +25,13 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/NEON/NEScheduler.h" +#include "src/common/utils/Log.h" +#include "src/core/helpers/AutoConfiguration.h" + using namespace arm_compute::misc::shape_calculator; namespace arm_compute @@ -59,9 +62,9 @@ PadStrideInfo compute_upsample_info(const PadStrideInfo &info, uint32_t deconv_p deconv_pad_top += deconv_pad_y / 2; deconv_pad_bottom += deconv_pad_y / 2; - return PadStrideInfo(stride_x, stride_y, deconv_pad_left, deconv_pad_right, deconv_pad_top, deconv_pad_bottom, DimensionRoundingType::FLOOR); + return PadStrideInfo(stride_x, stride_y, deconv_pad_left, deconv_pad_right, deconv_pad_top, deconv_pad_bottom, + DimensionRoundingType::FLOOR); } - } // namespace NEDeconvolutionLayer::NEDeconvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT @@ -75,27 +78,54 @@ NEDeconvolutionLayer::NEDeconvolutionLayer(std::shared_ptr<IMemoryManager> memor _original_weights(nullptr), _input(nullptr), _info(), - _is_prepared(false) + _is_prepared(false), + _do_upsampling(true) { } -Status NEDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *output, const PadStrideInfo &info) +Status NEDeconvolutionLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *bias, + const ITensorInfo *output, + const PadStrideInfo &info, + bool enable_fast_math, + const WeightsInfo &weights_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16, DataType::QASYMM8, DataType::QASYMM8_SIGNED); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, input); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(weights, input); - const unsigned int width_idx = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::WIDTH); - const unsigned int height_idx = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::HEIGHT); - ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(width_idx) != weights->dimension(height_idx)); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16, DataType::QASYMM8, + DataType::QASYMM8_SIGNED); + const unsigned int width_idx = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::WIDTH); + const unsigned int height_idx = + get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::HEIGHT); ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(width_idx) < 1); + ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(height_idx) < 1); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(weights, input); + if (is_data_type_quantized_per_channel(weights->data_type()) && is_data_type_quantized(input->data_type())) + { + 
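Editor's note: the branch opened here admits QSYMM8_PER_CHANNEL weights alongside quantized inputs, where every output channel carries its own effective requantization scale. A short sketch of how those per-channel factors are conventionally derived; this is the standard formula, not code from this diff:

#include <vector>

// effective_scale[c] = input_scale * weight_scale[c] / output_scale:
// one rescale factor per output channel of the deconvolution.
std::vector<float> effective_scales(float input_scale,
                                    const std::vector<float> &weight_scales,
                                    float output_scale)
{
    std::vector<float> out;
    out.reserve(weight_scales.size());
    for (const float ws : weight_scales)
    {
        out.push_back(input_scale * ws / output_scale);
    }
    return out;
}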
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QSYMM8_PER_CHANNEL); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights); + } - auto out_dims = deconvolution_output_dimensions(input->dimension(width_idx), input->dimension(height_idx), weights->dimension(width_idx), weights->dimension(height_idx), info); + const unsigned int pad_left = info.pad_left(); + const unsigned int pad_top = info.pad_top(); + const unsigned int pad_right = info.pad_right(); + const unsigned int pad_bottom = info.pad_bottom(); + + ARM_COMPUTE_RETURN_ERROR_ON(((input->dimension(width_idx) - 1) * info.stride().first + + weights->dimension(width_idx)) < (pad_left + pad_right)); + ARM_COMPUTE_RETURN_ERROR_ON(((input->dimension(height_idx) - 1) * info.stride().second + + weights->dimension(height_idx)) < (pad_top + pad_bottom)); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights); - if(bias != nullptr) + auto out_dims = + deconvolution_output_dimensions(input->dimension(width_idx), input->dimension(height_idx), + weights->dimension(width_idx), weights->dimension(height_idx), info); + + if (bias != nullptr) { - if(is_data_type_quantized_asymmetric(input->data_type())) + if (is_data_type_quantized_asymmetric(input->data_type())) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32); } @@ -105,46 +135,84 @@ Status NEDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInf } } - if(output->tensor_shape().total_size() > 0) + if (output->tensor_shape().total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); const TensorShape output_shape = compute_deconvolution_output_shape(out_dims, *input, *weights); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimX) != output_shape.x(), "Output's width is invalid."); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimY) != output_shape.y(), "Output's height is invalid."); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimZ) != output_shape.z(), "Output's depth is invalid."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimX) != output_shape.x(), + "Output's width is invalid."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimY) != output_shape.y(), + "Output's height is invalid."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimZ) != output_shape.z(), + "Output's depth is invalid."); } - uint32_t deconv_pad_x = 0; - uint32_t deconv_pad_y = 0; - const unsigned int stride_x = info.stride().first; - const unsigned int stride_y = info.stride().second; - const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(*input, *weights, stride_x, stride_y, out_dims, deconv_pad_x, deconv_pad_y); - TensorInfo scale_out_info(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(scale_out_shape)); - const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL); + uint32_t deconv_pad_x = 0; + uint32_t deconv_pad_y = 0; + const uint32_t stride_x = info.stride().first; + const uint32_t stride_y = info.stride().second; + const auto deconv_padding = compute_deconvolution_padding(*input, *weights, static_cast<int32_t>(stride_x), + static_cast<int32_t>(stride_y), out_dims); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(deconv_padding.first < 0 || deconv_padding.second < 0, + "Negative padding not supported"); + + const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(*input, *weights, stride_x, stride_y, + out_dims, deconv_pad_x, 
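Editor's note: the two checks above reject configurations whose padding exceeds the upsampled extent. For reference, the standard transposed-convolution output size along one axis is out = (in - 1) * stride + kernel - pad_left - pad_right, so the bracketed quantity must cover the total padding. A one-function sketch with a worked check:

#include <cassert>

// Standard transposed-convolution output extent along one axis.
constexpr int deconv_out_dim(int in, int kernel, int stride, int pad_l, int pad_r)
{
    return (in - 1) * stride + kernel - pad_l - pad_r;
}

// 4 inputs, kernel 4, stride 2, padding 1 on each side: (4-1)*2 + 4 - 2 = 8.
static_assert(deconv_out_dim(4, 4, 2, 1, 1) == 8, "worked example");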
deconv_pad_y); + TensorInfo scale_out_info(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(scale_out_shape)); + const PadStrideInfo upsample_info = compute_upsample_info(info, deconv_pad_x, deconv_pad_y); + + // Do not perform upsampling when the operation uses unit stride in all dimensions + const bool do_upsampling = stride_x != 1 || stride_y != 1; - const unsigned int batches_idx = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::BATCHES); - const unsigned int channel_idx = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::CHANNEL); + const unsigned int batches_idx = + get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::BATCHES); + const unsigned int channel_idx = + get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::CHANNEL); ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(batches_idx) != scale_out_info.dimension(batches_idx)); ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(channel_idx) != scale_out_info.dimension(channel_idx)); - ARM_COMPUTE_RETURN_ON_ERROR(NEConvolutionLayer::validate(&scale_out_info, weights, bias, output, conv_info, WeightsInfo())); + if (do_upsampling) + { + const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL); + ARM_COMPUTE_RETURN_ON_ERROR(NEConvolutionLayer::validate(&scale_out_info, weights, bias, output, conv_info, + weights_info, Size2D(1U, 1U), ActivationLayerInfo(), + enable_fast_math)); + } + else + { + const PadStrideInfo conv_info(1, 1, upsample_info.pad_left(), upsample_info.pad_right(), + upsample_info.pad_top(), upsample_info.pad_bottom(), DimensionRoundingType::CEIL); + ARM_COMPUTE_RETURN_ON_ERROR(NEConvolutionLayer::validate(input, weights, bias, output, conv_info, weights_info, + Size2D(1U, 1U), ActivationLayerInfo(), + enable_fast_math)); + } return Status{}; } -void NEDeconvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *bias, ITensor *output, const PadStrideInfo &info) +void NEDeconvolutionLayer::configure(ITensor *input, + const ITensor *weights, + const ITensor *bias, + ITensor *output, + const PadStrideInfo &info, + bool enable_fast_math, + const WeightsInfo &weights_info) { // Perform validation step ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); - ARM_COMPUTE_ERROR_THROW_ON(NEDeconvolutionLayer::validate(input->info(), weights->info(), (bias == nullptr) ? nullptr : bias->info(), output->info(), info)); + ARM_COMPUTE_ERROR_THROW_ON(NEDeconvolutionLayer::validate(input->info(), weights->info(), + (bias == nullptr) ? 
nullptr : bias->info(), + output->info(), info, enable_fast_math, weights_info)); + ARM_COMPUTE_LOG_PARAMS(input, weights, bias, output, info, enable_fast_math, weights_info); const DataLayout data_layout = input->info()->data_layout(); const unsigned int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); const unsigned int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); - auto out_dims = deconvolution_output_dimensions(input->info()->dimension(width_idx), input->info()->dimension(height_idx), - weights->info()->dimension(width_idx), weights->info()->dimension(height_idx), info); + auto out_dims = deconvolution_output_dimensions( + input->info()->dimension(width_idx), input->info()->dimension(height_idx), + weights->info()->dimension(width_idx), weights->info()->dimension(height_idx), info); const TensorShape output_shape = compute_deconvolution_output_shape(out_dims, *input->info(), *weights->info()); @@ -157,32 +225,24 @@ void NEDeconvolutionLayer::configure(ITensor *input, const ITensor *weights, con const unsigned int stride_y = info.stride().second; // Output auto initialization if not yet initialized - auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->quantization_info()); + auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), + input->info()->quantization_info()); _flip_axis.allocator()->init(TensorInfo(TensorShape(2U), 1, DataType::U32)); - _memory_group.manage(&_scaled_output); _weights_flipped.allocator()->init(weights->info()->clone()->set_data_layout(data_layout)); _flip_weights.configure(weights, &_weights_flipped, &_flip_axis); // setup the function to convolve the upscaled output - const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL); - uint32_t deconv_pad_x = 0; - uint32_t deconv_pad_y = 0; - - const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(*input->info(), *weights->info(), - stride_x, stride_y, - out_dims, deconv_pad_x, deconv_pad_y); + uint32_t deconv_pad_x = 0; + uint32_t deconv_pad_y = 0; + const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape( + *input->info(), *weights->info(), stride_x, stride_y, out_dims, deconv_pad_x, deconv_pad_y); const PadStrideInfo upsample_info = compute_upsample_info(info, deconv_pad_x, deconv_pad_y); - TensorInfo scale_out_info(scale_out_shape, 1, input->info()->data_type(), input->info()->quantization_info()); - scale_out_info.set_data_layout(data_layout); - _scaled_output.allocator()->init(scale_out_info); - - _upsample_f.configure(input, &_scaled_output, upsample_info); - - _conv_f.configure(&_scaled_output, &_weights_flipped, bias, output, conv_info); + // Do not perform upsampling when the operation uses unit stride in all dimensions + _do_upsampling = stride_x != 1 || stride_y != 1; // Setup flip axis data _flip_axis.allocator()->allocate(); @@ -190,7 +250,32 @@ void NEDeconvolutionLayer::configure(ITensor *input, const ITensor *weights, con axis_data[0] = static_cast<uint32_t>(width_idx); axis_data[1] = static_cast<uint32_t>(height_idx); - _scaled_output.allocator()->allocate(); + // Setup convolution and upsampling, if needed + if (_do_upsampling) + { + _memory_group.manage(&_scaled_output); + + const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL); + TensorInfo scale_out_info(scale_out_shape, 1, input->info()->data_type(), input->info()->quantization_info()); + 
scale_out_info.set_data_layout(data_layout); + _scaled_output.allocator()->init(scale_out_info); + + // Minor optimization: In the upsampling step, we do not need to allocate space for the padding in the upsampled image. + // The padding amount can be given as input to the convolution layer. + _upsample_f.configure(input, &_scaled_output, upsample_info); + + _conv_f.configure(&_scaled_output, &_weights_flipped, bias, output, conv_info, weights_info, Size2D(1U, 1U), + ActivationLayerInfo(), enable_fast_math); + + _scaled_output.allocator()->allocate(); + } + else + { + const PadStrideInfo conv_info(1, 1, upsample_info.pad_left(), upsample_info.pad_right(), + upsample_info.pad_top(), upsample_info.pad_bottom(), DimensionRoundingType::CEIL); + _conv_f.configure(input, &_weights_flipped, bias, output, conv_info, weights_info, Size2D(1U, 1U), + ActivationLayerInfo(), enable_fast_math); + } } void NEDeconvolutionLayer::run() @@ -199,13 +284,16 @@ void NEDeconvolutionLayer::run() MemoryGroupResourceScope scope_mg(_memory_group); - _upsample_f.run(); + if (_do_upsampling) + { + _upsample_f.run(); + } _conv_f.run(); } void NEDeconvolutionLayer::prepare() { - if(!_is_prepared) + if (!_is_prepared) { ARM_COMPUTE_ERROR_ON(!_original_weights->is_used()); diff --git a/src/runtime/NEON/functions/NEDepthConvertLayer.cpp b/src/runtime/NEON/functions/NEDepthConvertLayer.cpp index a2f890ef95..766635dfa1 100644 --- a/src/runtime/NEON/functions/NEDepthConvertLayer.cpp +++ b/src/runtime/NEON/functions/NEDepthConvertLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020 ARM Limited. + * Copyright (c) 2016-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,21 +23,52 @@ */ #include "arm_compute/runtime/NEON/functions/NEDepthConvertLayer.h" -#include "arm_compute/core/NEON/kernels/NEDepthConvertLayerKernel.h" -#include "support/MemorySupport.h" +#include "arm_compute/core/Validate.h" + +#include "src/cpu/operators/CpuCast.h" #include <utility> -using namespace arm_compute; +namespace arm_compute +{ +struct NEDepthConvertLayer::Impl +{ + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr<cpu::CpuCast> op{nullptr}; +}; + +NEDepthConvertLayer::NEDepthConvertLayer() : _impl(std::make_unique<Impl>()) +{ +} +NEDepthConvertLayer::NEDepthConvertLayer(NEDepthConvertLayer &&) = default; +NEDepthConvertLayer &NEDepthConvertLayer::operator=(NEDepthConvertLayer &&) = default; +NEDepthConvertLayer::~NEDepthConvertLayer() = default; void NEDepthConvertLayer::configure(const ITensor *input, ITensor *output, ConvertPolicy policy, uint32_t shift) { - auto k = arm_compute::support::cpp14::make_unique<NEDepthConvertLayerKernel>(); - k->configure(input, output, policy, shift); - _kernel = std::move(k); + ARM_COMPUTE_UNUSED(shift); + + _impl->src = input; + _impl->dst = output; + + ARM_COMPUTE_ERROR_ON_NULLPTR(_impl->src, _impl->dst); + ARM_COMPUTE_ERROR_ON(shift != 0); + + _impl->op = std::make_unique<cpu::CpuCast>(); + _impl->op->configure(_impl->src->info(), _impl->dst->info(), policy); +} + +Status +NEDepthConvertLayer::validate(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy, uint32_t shift) +{ + ARM_COMPUTE_RETURN_ERROR_ON(shift != 0); + return cpu::CpuCast::validate(input, output, policy); } -Status NEDepthConvertLayer::validate(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy, uint32_t shift) +void NEDepthConvertLayer::run() { - return NEDepthConvertLayerKernel::validate(input, output, policy, shift); + ITensorPack pack = {{ACL_SRC, _impl->src}, 
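Editor's note: the _do_upsampling flag introduced above encodes a simple identity: with unit stride the upsample stage inserts no zeros, so the deconvolution degenerates to an ordinary convolution whose padding comes straight from upsample_info, and the full-size intermediate tensor can be skipped entirely. A sketch of the decision, using placeholder types:

struct Pad { int l{0}, r{0}, t{0}, b{0}; };

// Upsampling only matters when some stride exceeds 1; otherwise fold the
// deconvolution padding directly into the convolution.
constexpr bool needs_upsampling(int stride_x, int stride_y)
{
    return stride_x != 1 || stride_y != 1;
}

inline Pad conv_padding(bool do_upsampling, const Pad &upsample_pad)
{
    return do_upsampling ? Pad{0, 0, 0, 0} : upsample_pad;
}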
{ACL_DST, _impl->dst}}; + _impl->op->run(pack); } +} // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEDepthToSpaceLayer.cpp b/src/runtime/NEON/functions/NEDepthToSpaceLayer.cpp index 3569eecf0a..5eea4dca65 100644 --- a/src/runtime/NEON/functions/NEDepthToSpaceLayer.cpp +++ b/src/runtime/NEON/functions/NEDepthToSpaceLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 ARM Limited. + * Copyright (c) 2019-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -25,15 +25,25 @@ #include "arm_compute/runtime/NEON/functions/NEDepthToSpaceLayer.h" #include "arm_compute/core/Error.h" -#include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + +#include "src/common/utils/Log.h" +#include "src/core/NEON/kernels/NEDepthToSpaceLayerKernel.h" namespace arm_compute { +NEDepthToSpaceLayer::NEDepthToSpaceLayer() : _kernel{} +{ +} + +NEDepthToSpaceLayer::~NEDepthToSpaceLayer() = default; + void NEDepthToSpaceLayer::configure(const ITensor *input, ITensor *output, int32_t block_shape) { - auto k = arm_compute::support::cpp14::make_unique<NEDepthToSpaceLayerKernel>(); + ARM_COMPUTE_LOG_PARAMS(input, output, block_shape); + + auto k = std::make_unique<NEDepthToSpaceLayerKernel>(); k->configure(input, output, block_shape); _kernel = std::move(k); } @@ -42,4 +52,10 @@ Status NEDepthToSpaceLayer::validate(const ITensorInfo *input, const ITensorInfo { return NEDepthToSpaceLayerKernel::validate(input, output, block_shape); } + +void NEDepthToSpaceLayer::run() +{ + NEScheduler::get().schedule(_kernel.get(), _kernel->get_split_dimension()); +} + } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp index 7214971044..6c085645db 100644 --- a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp +++ b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 ARM Limited. + * Copyright (c) 2017-2021, 2023 Arm Limited. 
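Editor's note: NEDepthToSpaceLayer::run above now asks the kernel for its preferred split dimension and lets the scheduler parallelise the iteration window along that axis. A minimal stand-in showing the split-and-join mechanics with plain threads; no ACL scheduler types are involved:

#include <algorithm>
#include <cstddef>
#include <functional>
#include <thread>
#include <vector>

// Divide [0, extent) along the chosen dimension into one contiguous chunk
// per worker, run them concurrently, then join.
void schedule_along(std::size_t extent, std::size_t workers,
                    const std::function<void(std::size_t, std::size_t)> &body)
{
    std::vector<std::thread> pool;
    const std::size_t chunk = (extent + workers - 1) / std::max<std::size_t>(1, workers);
    for (std::size_t start = 0; start < extent; start += chunk)
    {
        pool.emplace_back(body, start, std::min(extent, start + chunk));
    }
    for (auto &t : pool)
    {
        t.join();
    }
}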
* * SPDX-License-Identifier: MIT * @@ -28,542 +28,358 @@ #include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/runtime/NEON/NEScheduler.h" +#include "src/common/utils/Log.h" +#include "src/cpu/operators/CpuDepthwiseConv2d.h" + using namespace arm_compute::misc; using namespace arm_compute::misc::shape_calculator; namespace arm_compute { -namespace -{ -Status validate_arguments_optimized(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); - if(!is_data_type_quantized_per_channel(weights->data_type())) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights); - } - ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN); - ARM_COMPUTE_RETURN_ERROR_ON(dilation.x() < 1 || dilation.y() < 1); - const size_t idx_w = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH); - const size_t idx_h = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT); - ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) + (weights->dimension(idx_w) - 1) * (dilation.x() - 1) > input->dimension(idx_w) + conv_info.pad_left() + conv_info.pad_right()); - ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_h) + (weights->dimension(idx_h) - 1) * (dilation.y() - 1) > input->dimension(idx_h) + conv_info.pad_top() + conv_info.pad_bottom()); - - if(biases != nullptr) - { - const unsigned int channel_idx = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL); - ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1); - ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(channel_idx)); - } - - const bool is_quantized = (!is_data_type_quantized_per_channel(weights->data_type())) && is_data_type_quantized_asymmetric(input->data_type()); - - if(!NEDepthwiseConvolutionAssemblyDispatch::is_optimized_supported(input, weights, conv_info, depth_multiplier, dilation)) - { - TensorInfo accumulator = TensorInfo(output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32)); - ARM_COMPUTE_RETURN_ON_ERROR(NEDepthwiseConvolutionLayer3x3Kernel::validate(input, weights, is_quantized ? 
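Editor's note: the deleted validation above bounds the dilated kernel footprint by the padded input: k taps with dilation d span k + (k - 1) * (d - 1) samples, which is exactly the quantity compared against the input extent plus padding. Made explicit as a small helper:

// Receptive extent of a dilated kernel along one axis: k taps separated by
// (d - 1) skipped samples.
constexpr int dilated_kernel_extent(int k, int d)
{
    return k + (k - 1) * (d - 1);
}

static_assert(dilated_kernel_extent(3, 1) == 3, "dilation 1 is a no-op");
static_assert(dilated_kernel_extent(3, 2) == 5, "3 taps with one gap each");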
&accumulator : output, conv_info, depth_multiplier, dilation)); - - if(is_quantized) - { - DirectConvolutionLayerOutputStageKernelInfo direct_conv_info; - direct_conv_info.output_data_type = input->data_type(); - ARM_COMPUTE_RETURN_ON_ERROR(NEDirectConvolutionLayerOutputStageKernel::validate(&accumulator, biases, output, direct_conv_info)); - } - } - else - { - ARM_COMPUTE_RETURN_ON_ERROR(NEDepthwiseConvolutionAssemblyDispatch::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation)); - } +NEDepthwiseConvolutionLayer::~NEDepthwiseConvolutionLayer() = default; - //Validate Activation Layer - if(act_info.enabled()) - { - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(output, nullptr, act_info)); - } - return Status{}; -} -} // namespace - -NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::NEDepthwiseConvolutionLayerOptimizedInternal(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(memory_manager), _dwc_kernel(), _dwc_optimized_func(memory_manager), _output_stage_kernel(), _border_handler(), _permute_input(), _permute_weights(), _permute_output(), - _activationlayer_function(), _accumulator(), _permuted_input(), _permuted_weights(), _permuted_output(), _original_weights(nullptr), _has_bias(false), _is_quantized(false), _is_optimized(false), - _is_nchw(true), _permute(false), _is_activationlayer_enabled(false), _is_prepared(false) +struct NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::Impl +{ + ITensor *src{nullptr}; // SRC_0 + ITensor *dst{nullptr}; // DST_0 + const ITensor *weights{nullptr}; // SRC_1 + const ITensor *biases{nullptr}; // SRC_2 + Tensor permuted_input{}; // INT_0 + Tensor permuted_weights{}; // INT_1 + Tensor permuted_output{}; // INT_2 + Tensor workspace{}; // INT_3 + Tensor packed_weights{}; // INT_4 + std::shared_ptr<cpu::CpuDepthwiseConv2d> op{nullptr}; + bool is_prepared{false}; + bool permute{false}; +}; + +NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::NEDepthwiseConvolutionLayerOptimizedInternal( + std::shared_ptr<IMemoryManager> memory_manager) + : _memory_group(memory_manager), _impl(std::make_unique<Impl>()) { } -void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::configure_generic(ITensor *input, - const ITensor *weights, - const ITensor *biases, - ITensor *output, - const PadStrideInfo &conv_info, - unsigned int depth_multiplier, - const ActivationLayerInfo &act_info, - const Size2D &dilation) +void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::configure( + ITensor *input, + const ITensor *weights, + const ITensor *biases, + ITensor *output, + const PadStrideInfo &conv_info, + unsigned int depth_multiplier, + const ActivationLayerInfo &act_info, + const Size2D &dilation) { - ARM_COMPUTE_UNUSED(act_info); - - PixelValue zero_value(0.f); - - // Initialize the intermediate accumulator tensor in case of quantized input - if(_is_quantized) - { - TensorShape accum_shape = output->info()->tensor_shape(); - DataLayout accum_layout = output->info()->data_layout(); - if(!_is_nchw) - { - permute(accum_shape, PermutationVector(1U, 2U, 0U)); - accum_layout = DataLayout::NCHW; - } - - _memory_group.manage(&_accumulator); - _accumulator.allocator()->init(TensorInfo(accum_shape, 1, DataType::S32, output->info()->quantization_info())); - _accumulator.info()->set_data_layout(accum_layout); - zero_value = 
PixelValue(static_cast<uint32_t>(input->info()->quantization_info().uniform().offset)); - } - - if(!_is_nchw) - { - _memory_group.manage(&_permuted_input); - _memory_group.manage(&_permuted_output); - - // Configure the function to transform the input tensor from NHWC -> NCHW - _permute_input.configure(input, &_permuted_input, PermutationVector(1U, 2U, 0U)); - _permuted_input.info()->set_data_layout(DataLayout::NCHW); - - // Configure the function to transform the weights tensor from HWI -> IHW - _permute_weights.configure(weights, &_permuted_weights, PermutationVector(1U, 2U, 0U)); - _permuted_weights.info()->set_data_layout(DataLayout::NCHW); - _permuted_output.info()->set_quantization_info(output->info()->quantization_info()); + ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); - // Configure depthwise - _dwc_kernel.configure(&_permuted_input, &_permuted_weights, (_is_quantized) ? &_accumulator : &_permuted_output, conv_info, depth_multiplier, dilation); + bool is_nhwc = input->info()->data_layout() == DataLayout::NCHW; + _impl->src = input; + _impl->weights = weights; + _impl->biases = biases; + _impl->dst = output; + _impl->permute = is_nhwc; - // Configure border handler - _border_handler.configure(&_permuted_input, _dwc_kernel.border_size(), BorderMode::CONSTANT, zero_value); + _impl->op = std::make_unique<cpu::CpuDepthwiseConv2d>(); + ConvolutionInfo info{conv_info, depth_multiplier, act_info, dilation}; + _impl->op->configure(_impl->src->info(), _impl->weights->info(), + _impl->biases == nullptr ? nullptr : _impl->biases->info(), _impl->dst->info(), info); - // Allocate tensors - _permuted_input.allocator()->allocate(); - } - else - { - // Configure depthwise convolution kernel - _dwc_kernel.configure(input, weights, (_is_quantized) ? &_accumulator : output, conv_info, depth_multiplier, dilation); + // Configure pipeline + ActivationLayerInfo act_info_to_use = ActivationLayerInfo(); + const bool is_relu = arm_compute::utils::info_helpers::is_relu(act_info); + const bool is_relu6 = arm_compute::utils::info_helpers::is_relu6(act_info); + bool is_activationlayer_enabled = act_info.enabled() && !(is_relu || is_relu6); - // Configure border handler - _border_handler.configure(input, _dwc_kernel.border_size(), BorderMode::CONSTANT, zero_value); - } - - // Configure biases accumulation - if(_is_quantized) - { - const UniformQuantizationInfo iq_info = input->info()->quantization_info().uniform(); - const UniformQuantizationInfo wq_info = weights->info()->quantization_info().uniform(); - const UniformQuantizationInfo oq_info = (output->info()->total_size() == 0) ? iq_info : output->info()->quantization_info().uniform(); - - float multiplier = (iq_info.scale * wq_info.scale) / oq_info.scale; - int32_t output_multiplier; - int32_t output_shift; - quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift); - - DirectConvolutionLayerOutputStageKernelInfo direct_conv_info; - direct_conv_info.result_fixedpoint_multiplier = output_multiplier; - direct_conv_info.result_shift = output_shift; - direct_conv_info.result_offset_after_shift = oq_info.offset; - direct_conv_info.output_data_type = input->info()->data_type(); - _output_stage_kernel.configure(&_accumulator, biases, _is_nchw ? output : &_permuted_output, direct_conv_info); - _accumulator.allocator()->allocate(); - } - else if(_has_bias) + if (!is_activationlayer_enabled) { - _output_stage_kernel.configure(_is_nchw ? 
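Editor's note: the removed quantized output stage above collapses the three quantization scales into one real factor, (input_scale * weight_scale) / output_scale, then splits it into a fixed-point multiplier and a right shift for the integer kernels. A sketch of that decomposition via frexp, assuming the usual case of a factor in (0, 1):

#include <cmath>
#include <cstdint>

// Express m in (0, 1) as (q / 2^31) * 2^(-shift), with q a Q0.31 value in
// [2^30, 2^31): the form integer requantization kernels consume.
void quantize_multiplier(double m, std::int32_t &q, int &shift)
{
    int exp = 0;
    const double frac = std::frexp(m, &exp); // m = frac * 2^exp, frac in [0.5, 1)
    shift = -exp;                            // so m = frac * 2^(-shift)
    auto q64 = static_cast<std::int64_t>(std::llround(frac * (1ll << 31)));
    if (q64 == (1ll << 31)) // frac rounded up to 1.0: renormalise
    {
        q64 /= 2;
        --shift;
    }
    q = static_cast<std::int32_t>(q64);
}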
output : &_permuted_output, biases); + act_info_to_use = act_info; } + info = ConvolutionInfo{conv_info, depth_multiplier, act_info_to_use, dilation}; - // Permute output - if(!_is_nchw) - { - // Configure the function to transform the convoluted output to NHWC - _permute_output.configure(&_permuted_output, output, PermutationVector(2U, 0U, 1U)); - _permuted_output.allocator()->allocate(); - } -} + auto dwc_optimized_func = std::make_unique<cpu::CpuDepthwiseConv2dAssemblyDispatch>(); -void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::configure_optimized(const ITensor *input, - const ITensor *weights, - const ITensor *biases, - ITensor *output, - const PadStrideInfo &conv_info, - unsigned int depth_multiplier, - const ActivationLayerInfo &act_info, - const Size2D &dilation) -{ - ActivationLayerInfo act_info_to_use = ActivationLayerInfo(); - const bool is_relu = arm_compute::utils::info_helpers::is_relu(act_info); - const bool is_relu6 = arm_compute::utils::info_helpers::is_relu6(act_info); - _is_activationlayer_enabled = act_info.enabled() && !(is_relu || is_relu6); - if(!_is_activationlayer_enabled) + if (is_nhwc) { - act_info_to_use = act_info; - } + auto permute_input = std::make_unique<cpu::CpuPermute>(); + auto permute_weights = std::make_unique<cpu::CpuPermute>(); + auto permute_output = std::make_unique<cpu::CpuPermute>(); - if(_is_nchw) - { - _memory_group.manage(&_permuted_input); - _memory_group.manage(&_permuted_output); + _memory_group.manage(&_impl->permuted_input); + _memory_group.manage(&_impl->permuted_weights); + _memory_group.manage(&_impl->permuted_output); // Configure the function to transform the input tensor from NCHW -> NHWC - _permute_input.configure(input, &_permuted_input, PermutationVector(2U, 0U, 1U)); - _permuted_input.info()->set_data_layout(DataLayout::NHWC); + permute_input->configure(input->info(), _impl->permuted_input.info(), PermutationVector(2U, 0U, 1U)); + _impl->permuted_input.info()->set_data_layout(DataLayout::NHWC); // Configure the function to transform the weights tensor from IHW -> HWI - _permute_weights.configure(weights, &_permuted_weights, PermutationVector(2U, 0U, 1U)); - _permuted_weights.info()->set_data_layout(DataLayout::NHWC); + permute_weights->configure(weights->info(), _impl->permuted_weights.info(), PermutationVector(2U, 0U, 1U)); + _impl->permuted_weights.info()->set_data_layout(DataLayout::NHWC); - _permuted_output.info()->set_data_layout(DataLayout::NHWC); - _permuted_output.info()->set_quantization_info(output->info()->quantization_info()); + _impl->permuted_output.info()->set_data_layout(DataLayout::NHWC); + _impl->permuted_output.info()->set_quantization_info(output->info()->quantization_info()); // Configure optimized depthwise - _dwc_optimized_func.configure(&_permuted_input, &_permuted_weights, biases, &_permuted_output, conv_info, depth_multiplier, act_info_to_use, dilation); + dwc_optimized_func->configure(_impl->permuted_input.info(), _impl->permuted_weights.info(), + biases == nullptr ? 
nullptr : biases->info(), _impl->permuted_output.info(), + info); // Configure the function to transform the convoluted output to ACL's native ordering format NCHW - _permuted_output.info()->set_data_layout(DataLayout::NHWC); - _permute_output.configure(&_permuted_output, output, PermutationVector(1U, 2U, 0U)); + _impl->permuted_output.info()->set_data_layout(DataLayout::NHWC); + permute_output->configure(_impl->permuted_output.info(), output->info(), PermutationVector(1U, 2U, 0U)); - // Allocate tensors - _permuted_input.allocator()->allocate(); - _permuted_output.allocator()->allocate(); + _impl->permuted_input.allocator()->allocate(); + _impl->permuted_output.allocator()->allocate(); } else { - _dwc_optimized_func.configure(input, weights, biases, output, conv_info, depth_multiplier, act_info_to_use, dilation); - } -} - -void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::configure(ITensor *input, - const ITensor *weights, - const ITensor *biases, - ITensor *output, const PadStrideInfo &conv_info, - unsigned int depth_multiplier, - const ActivationLayerInfo &act_info, - const Size2D &dilation) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); - // Perform validation step - ARM_COMPUTE_ERROR_THROW_ON(NEDepthwiseConvolutionLayerOptimizedInternal::validate(input->info(), weights->info(), (biases == nullptr) ? nullptr : biases->info(), - output->info(), conv_info, depth_multiplier, act_info, dilation)); - - _original_weights = weights; - _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type()); - _has_bias = biases != nullptr; - _is_optimized = NEDepthwiseConvolutionAssemblyDispatch::is_optimized_supported(input->info(), - weights->info(), - conv_info, - depth_multiplier, - dilation); - _is_nchw = input->info()->data_layout() == DataLayout::NCHW; - _permute = _is_optimized == _is_nchw; - _is_prepared = false; - _is_activationlayer_enabled = act_info.enabled(); - - // Configure appropriate pipeline - if(_is_optimized) - { - configure_optimized(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation); - } - else - { - configure_generic(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation); - } - - // Configure activation - if(_is_activationlayer_enabled) - { - _activationlayer_function.configure(output, nullptr, act_info); - } -} - -Status NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::validate(const ITensorInfo *input, - const ITensorInfo *weights, - const ITensorInfo *biases, - const ITensorInfo *output, - const PadStrideInfo &conv_info, - unsigned int depth_multiplier, - const ActivationLayerInfo &act_info, - const Size2D &dilation) -{ - return validate_arguments_optimized(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation); -} - -void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::run_generic() -{ - // Fill border - NEScheduler::get().schedule(&_border_handler, Window::DimX); - - // Execute depthwise convolution - NEScheduler::get().schedule(&_dwc_kernel, Window::DimX); - - // Add biases - if(_has_bias || _is_quantized) - { - NEScheduler::get().schedule(&_output_stage_kernel, Window::DimX); + dwc_optimized_func->configure(_impl->src->info(), _impl->weights->info(), + biases == nullptr ? 
nullptr : biases->info(), _impl->dst->info(), info); } - // Permute output - if(!_is_nchw) - { - _permute_output.run(); - } + // Allocate memory based on the internal memory requirements + experimental::MemoryRequirements mem_req = dwc_optimized_func->workspace(); + _impl->workspace.allocator()->init(TensorInfo(TensorShape{mem_req[0].size + mem_req[0].alignment}, 1, DataType::S8), + mem_req[0].alignment); + _impl->packed_weights.allocator()->init( + TensorInfo(TensorShape{mem_req[1].size + mem_req[1].alignment}, 1, DataType::S8), mem_req[1].alignment); + _memory_group.manage(&_impl->workspace); + _memory_group.manage(&_impl->packed_weights); + _impl->workspace.allocator()->allocate(); + _impl->packed_weights.allocator()->allocate(); } -void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::run_optimized() +Status +NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + unsigned int depth_multiplier, + const ActivationLayerInfo &act_info, + const Size2D &dilation) { - // Run assembly function - _dwc_optimized_func.run(); - - // Permute output - if(_is_nchw) - { - _permute_output.run(); - } + ConvolutionInfo info{conv_info, depth_multiplier, act_info, dilation}; + return cpu::CpuDepthwiseConv2d::validate(input, weights, biases, output, info); } void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::run() { prepare(); - MemoryGroupResourceScope scope_mg(_memory_group); - // Permute input - if(_permute) - { - _permute_input.run(); - } - - _is_optimized ? run_optimized() : run_generic(); - - // Run activation - if(_is_activationlayer_enabled) - { - _activationlayer_function.run(); - } + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC_0, _impl->src); + pack.add_tensor(TensorType::ACL_SRC_1, _impl->weights); + pack.add_tensor(TensorType::ACL_SRC_2, _impl->biases); + pack.add_tensor(TensorType::ACL_INT_0, &_impl->permuted_input); + pack.add_tensor(TensorType::ACL_INT_1, &_impl->permuted_weights); + pack.add_tensor(TensorType::ACL_INT_2, &_impl->permuted_output); + pack.add_tensor(TensorType::ACL_INT_3, &_impl->workspace); + pack.add_tensor(TensorType::ACL_INT_4, &_impl->packed_weights); + pack.add_tensor(TensorType::ACL_DST_0, _impl->dst); + + _impl->op->run(pack); } void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::prepare() { - if(!_is_prepared) + if (!_impl->is_prepared) { // Permute weights - if(_permute) + if (_impl->permute) { - _permuted_weights.allocator()->allocate(); - _permute_weights.run(); - _original_weights->mark_as_unused(); + _impl->permuted_weights.allocator()->allocate(); } - // Prepare optimized function - if(_is_optimized) + if (!_impl->permuted_weights.is_used()) { - _dwc_optimized_func.prepare(); - if(!_permuted_weights.is_used()) - { - _permuted_weights.allocator()->free(); - } + _impl->permuted_weights.allocator()->free(); } - _is_prepared = true; + _impl->is_prepared = true; } } +struct NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::Impl +{ + Tensor permuted_input{}; + Tensor permuted_weights{}; + Tensor permuted_output{}; + bool is_prepared{false}; + bool is_nchw{false}; + bool is_activationlayer_enabled{false}; + const ITensor *weights{nullptr}; + const ITensor *biases{nullptr}; + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + std::shared_ptr<cpu::CpuDepthwiseConv2d> op{nullptr}; 
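Editor's note: the optimized path above sizes its workspace and packed-weights tensors as size + alignment bytes taken from the operator's reported memory requirements, so a suitably aligned sub-span always fits regardless of where the allocator places the base pointer. A standalone sketch of that sizing rule:

#include <cstddef>
#include <cstdint>
#include <vector>

struct MemoryRequirement
{
    std::size_t size{0};
    std::size_t alignment{1};
};

// Over-allocate by the alignment, then compute the first aligned offset.
struct AlignedBuffer
{
    explicit AlignedBuffer(const MemoryRequirement &req)
        : storage(req.size + req.alignment)
    {
        const auto base = reinterpret_cast<std::uintptr_t>(storage.data());
        offset          = (req.alignment - base % req.alignment) % req.alignment;
    }
    std::vector<std::int8_t> storage; // raw bytes, like the DataType::S8 tensors above
    std::size_t offset{0};            // aligned region is [data() + offset, data() + offset + size)
};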
+}; + NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::NEDepthwiseConvolutionLayerGeneric() - : _depthwise_conv_kernel(), _fill_border(), _permute_input(), _permute_weights(), _permute_output(), _activationlayer_function(), _permuted_input(), _permuted_weights(), _permuted_output(), - _is_prepared(false), _is_nchw(false), _is_activationlayer_enabled(false), _original_weights(nullptr) + : _impl(std::make_unique<Impl>()) { } -void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, - unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation) +void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::configure(ITensor *input, + const ITensor *weights, + const ITensor *biases, + ITensor *output, + const PadStrideInfo &conv_info, + unsigned int depth_multiplier, + const ActivationLayerInfo &act_info, + const Size2D &dilation) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); - ARM_COMPUTE_ERROR_THROW_ON(NEDepthwiseConvolutionLayer::validate(input->info(), weights->info(), (biases == nullptr) ? nullptr : biases->info(), - output->info(), conv_info, depth_multiplier, act_info, dilation)); - _is_nchw = input->info()->data_layout() == DataLayout::NCHW; - _is_prepared = !_is_nchw; + const ConvolutionInfo info{conv_info, depth_multiplier, act_info, dilation}; + _impl->op = std::make_unique<cpu::CpuDepthwiseConv2d>(); + _impl->op->configure(input->info(), weights->info(), biases == nullptr ? nullptr : biases->info(), output->info(), + info); + + _impl->src = input; + _impl->dst = output; + _impl->weights = weights; + _impl->biases = biases; + _impl->is_nchw = input->info()->data_layout() == DataLayout::NCHW; + _impl->is_prepared = !_impl->is_nchw; ITensor *input_to_use = input; const ITensor *weights_to_use = weights; ITensor *output_to_use = output; - if(_is_nchw) + if (_impl->is_nchw) { - _permute_input.configure(input, &_permuted_input, PermutationVector(2U, 0U, 1U)); - _permuted_input.info()->set_data_layout(DataLayout::NHWC); - input_to_use = &_permuted_input; + auto permute_input = std::make_unique<cpu::CpuPermute>(); + auto permute_weights = std::make_unique<cpu::CpuPermute>(); + + permute_input->configure(input->info(), _impl->permuted_input.info(), PermutationVector(2U, 0U, 1U)); + _impl->permuted_input.info()->set_data_layout(DataLayout::NHWC); + input_to_use = &_impl->permuted_input; - _permute_weights.configure(weights, &_permuted_weights, PermutationVector(2U, 0U, 1U)); - _permuted_weights.info()->set_data_layout(DataLayout::NHWC); - weights_to_use = &_permuted_weights; + permute_weights->configure(weights->info(), _impl->permuted_weights.info(), PermutationVector(2U, 0U, 1U)); + _impl->permuted_weights.info()->set_data_layout(DataLayout::NHWC); + weights_to_use = &_impl->permuted_weights; - _permuted_output.allocator()->init(output->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(TensorShape())); - output_to_use = &_permuted_output; + _impl->permuted_output.allocator()->init( + output->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(TensorShape())); + output_to_use = &_impl->permuted_output; } - _original_weights = weights_to_use; - _depthwise_conv_kernel.configure(input_to_use, weights_to_use, biases, output_to_use, conv_info, depth_multiplier, dilation); - _fill_border.configure(input_to_use, _depthwise_conv_kernel.border_size(), 
BorderMode::CONSTANT, PixelValue(static_cast<uint64_t>(0), input->info()->data_type(), input->info()->quantization_info())); + auto depthwise_conv_kernel = std::make_unique<cpu::kernels::CpuDepthwiseConv2dNativeKernel>(); + depthwise_conv_kernel->configure(input_to_use->info(), weights_to_use->info(), + biases == nullptr ? nullptr : biases->info(), output_to_use->info(), info); - if(_is_nchw) + if (_impl->is_nchw) { - _permute_output.configure(&_permuted_output, output, PermutationVector(1U, 2U, 0U)); - _permuted_output.info()->set_data_layout(DataLayout::NHWC); + auto permute_output = std::make_unique<cpu::CpuPermute>(); + permute_output->configure(_impl->permuted_output.info(), output->info(), PermutationVector(1U, 2U, 0U)); + _impl->permuted_output.info()->set_data_layout(DataLayout::NHWC); - _permuted_input.allocator()->allocate(); - _permuted_weights.allocator()->allocate(); - _permuted_output.allocator()->allocate(); - } - - //Configure Activation Layer - _is_activationlayer_enabled = act_info.enabled(); - if(_is_activationlayer_enabled) - { - _activationlayer_function.configure(output, nullptr, act_info); + _impl->permuted_input.allocator()->allocate(); + _impl->permuted_weights.allocator()->allocate(); + _impl->permuted_output.allocator()->allocate(); } } -Status NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, +Status NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, const PadStrideInfo &conv_info, - unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation) + unsigned int depth_multiplier, + const ActivationLayerInfo &act_info, + const Size2D &dilation) { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); - if(input->data_layout() == DataLayout::NCHW) - { - TensorShape permuted_input_shape = input->tensor_shape(); - TensorShape permuted_weights_shape = weights->tensor_shape(); - TensorShape permuted_output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier, dilation); - permute(permuted_input_shape, PermutationVector(2U, 0U, 1U)); - permute(permuted_weights_shape, PermutationVector(2U, 0U, 1U)); - permute(permuted_output_shape, PermutationVector(2U, 0U, 1U)); - - const TensorInfo permuted_input = TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_input_shape).set_data_layout(DataLayout::NHWC)); - const TensorInfo permuted_weights = TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_weights_shape).set_data_layout(DataLayout::NHWC)); - const TensorInfo permuted_output = TensorInfo(output->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_output_shape).set_data_layout(DataLayout::NCHW)); - - ARM_COMPUTE_RETURN_ON_ERROR(NEPermute::validate(input, &permuted_input, PermutationVector(2U, 0U, 1U))); - ARM_COMPUTE_RETURN_ON_ERROR(NEPermute::validate(weights, &permuted_weights, PermutationVector(2U, 0U, 1U))); - ARM_COMPUTE_RETURN_ON_ERROR(NEPermute::validate(&permuted_output, output, PermutationVector(1U, 2U, 0U))); - - ARM_COMPUTE_RETURN_ON_ERROR(NEDepthwiseConvolutionLayerNativeKernel::validate(&permuted_input, &permuted_weights, biases, &permuted_output, conv_info, depth_multiplier, dilation)); - } - else - 
{ - ARM_COMPUTE_RETURN_ON_ERROR(NEDepthwiseConvolutionLayerNativeKernel::validate(input, weights, biases, output, conv_info, depth_multiplier, dilation)); - } - - // Validate Activation Layer - if(act_info.enabled()) - { - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(output, nullptr, act_info)); - } - - return Status{}; + ConvolutionInfo info{conv_info, depth_multiplier, act_info, dilation}; + return cpu::CpuDepthwiseConv2d::validate(input, weights, biases, output, info); } void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::run() { - if(_is_nchw) - { - prepare(); - _permute_input.run(); - } - - NEScheduler::get().schedule(&_fill_border, Window::DimX); - NEScheduler::get().schedule(&_depthwise_conv_kernel, Window::DimY); - - if(_is_nchw) - { - _permute_output.run(); - } - - if(_is_activationlayer_enabled) - { - _activationlayer_function.run(); - } -} - -void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::prepare() -{ - if(!_is_prepared) - { - ARM_COMPUTE_ERROR_ON(!_original_weights->is_used()); - - _permute_weights.run(); - _original_weights->mark_as_unused(); - _is_prepared = true; - } + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC_0, _impl->src); + pack.add_tensor(TensorType::ACL_SRC_1, _impl->weights); + pack.add_tensor(TensorType::ACL_SRC_2, _impl->biases); + pack.add_tensor(TensorType::ACL_INT_0, &_impl->permuted_input); + pack.add_tensor(TensorType::ACL_INT_1, &_impl->permuted_weights); + pack.add_tensor(TensorType::ACL_INT_2, &_impl->permuted_output); + pack.add_tensor(TensorType::ACL_DST_0, _impl->dst); + + _impl->op->run(pack); } NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager) - : _depth_conv_func(DepthwiseConvolutionFunction::GENERIC), _func_optimized(std::move(memory_manager)), _func_generic() + : _memory_group(std::move(memory_manager)), _impl(std::make_unique<Impl>()) { } -void NEDepthwiseConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier, - const ActivationLayerInfo &act_info, const Size2D &dilation) +#ifndef DOXYGEN_SKIP_THIS +struct NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayer::Impl { - _depth_conv_func = get_depthwiseconvolution_function(input->info(), weights->info(), (biases != nullptr) ? 
biases->info() : nullptr, output->info(), conv_info, depth_multiplier, act_info, dilation); - switch(_depth_conv_func) - { - case DepthwiseConvolutionFunction::OPTIMIZED: - _func_optimized.configure(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation); - break; - case DepthwiseConvolutionFunction::GENERIC: - _func_generic.configure(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation); - break; - default: - ARM_COMPUTE_ERROR("Unsupported DepthwiseConvolutionFunction"); - } -} - -Status NEDepthwiseConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation) + DepthwiseConvolutionFunction depth_conv_func{DepthwiseConvolutionFunction::OPTIMIZED}; + NEDepthwiseConvolutionLayerOptimizedInternal func_optimized{nullptr}; + NEDepthwiseConvolutionLayerGeneric func_generic{}; + std::shared_ptr<cpu::CpuDepthwiseConv2d> op{nullptr}; +}; +#endif // DOXYGEN_SKIP_THIS + +void NEDepthwiseConvolutionLayer::configure(ITensor *input, + const ITensor *weights, + const ITensor *biases, + ITensor *output, + const PadStrideInfo &conv_info, + unsigned int depth_multiplier, + const ActivationLayerInfo &act_info, + const Size2D &dilation) { - DepthwiseConvolutionFunction depth_conv_func = get_depthwiseconvolution_function(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation); - switch(depth_conv_func) + ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); + + ARM_COMPUTE_LOG_PARAMS(input, weights, output, conv_info, depth_multiplier, biases, act_info, dilation); + ARM_COMPUTE_ERROR_THROW_ON(NEDepthwiseConvolutionLayer::validate( + input->info(), weights->info(), (biases == nullptr) ? nullptr : biases->info(), output->info(), conv_info, + depth_multiplier, act_info, dilation)); + + const ConvolutionInfo info{conv_info, depth_multiplier, act_info, dilation}; + _impl->op = std::make_shared<cpu::CpuDepthwiseConv2d>(); + _impl->depth_conv_func = _impl->op->get_depthwiseconvolution_function( + input->info(), weights->info(), (biases != nullptr) ? 
biases->info() : nullptr, output->info(), info); + switch (_impl->depth_conv_func) { case DepthwiseConvolutionFunction::OPTIMIZED: - return NEDepthwiseConvolutionLayerOptimizedInternal::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation); + _impl->func_optimized.configure(input, weights, biases, output, conv_info, depth_multiplier, act_info, + dilation); break; case DepthwiseConvolutionFunction::GENERIC: - return NEDepthwiseConvolutionLayerGeneric::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation); + _impl->func_generic.configure(input, weights, biases, output, conv_info, depth_multiplier, act_info, + dilation); break; default: ARM_COMPUTE_ERROR("Unsupported DepthwiseConvolutionFunction"); } } -DepthwiseConvolutionFunction NEDepthwiseConvolutionLayer::get_depthwiseconvolution_function(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, - const PadStrideInfo &conv_info, - unsigned int depth_multiplier, ActivationLayerInfo act_info, const Size2D &dilation) +Status NEDepthwiseConvolutionLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + unsigned int depth_multiplier, + const ActivationLayerInfo &act_info, + const Size2D &dilation) { - if(bool(NEDepthwiseConvolutionLayerOptimizedInternal::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation))) - { - return DepthwiseConvolutionFunction::OPTIMIZED; - } - else - { - return DepthwiseConvolutionFunction::GENERIC; - } + ConvolutionInfo info{conv_info, depth_multiplier, act_info, dilation}; + return cpu::CpuDepthwiseConv2d::validate(input, weights, biases, output, info); } void NEDepthwiseConvolutionLayer::run() { - switch(_depth_conv_func) + switch (_impl->depth_conv_func) { case DepthwiseConvolutionFunction::OPTIMIZED: - _func_optimized.run(); + _impl->func_optimized.run(); break; case DepthwiseConvolutionFunction::GENERIC: - _func_generic.run(); + _impl->func_generic.run(); break; default: ARM_COMPUTE_ERROR("DepthwiseConvolutionFunction not properly configured"); @@ -572,13 +388,13 @@ void NEDepthwiseConvolutionLayer::run() void NEDepthwiseConvolutionLayer::prepare() { - switch(_depth_conv_func) + switch (_impl->depth_conv_func) { case DepthwiseConvolutionFunction::OPTIMIZED: - _func_optimized.prepare(); + _impl->func_optimized.prepare(); break; case DepthwiseConvolutionFunction::GENERIC: - _func_generic.prepare(); + _impl->func_generic.prepare(); break; default: ARM_COMPUTE_ERROR("DepthwiseConvolutionFunction not properly configured"); diff --git a/src/runtime/NEON/functions/NEDequantizationLayer.cpp b/src/runtime/NEON/functions/NEDequantizationLayer.cpp index 42a0ee0895..28d19d2950 100644 --- a/src/runtime/NEON/functions/NEDequantizationLayer.cpp +++ b/src/runtime/NEON/functions/NEDequantizationLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 ARM Limited. + * Copyright (c) 2017-2021 Arm Limited. 
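Note on the hunks above: the public NEDepthwiseConvolutionLayer interface is unchanged; the function is now a dispatcher whose Impl owns a cpu::CpuDepthwiseConv2d, and both internal paths run by filling an ITensorPack instead of invoking member kernels. A minimal caller-side sketch of that unchanged contract follows (shapes, layout and padding here are illustrative assumptions, not taken from this patch):

    #include "arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    int main()
    {
        Tensor src, weights, dst;
        // NHWC, batch 1: ACL dimension order is (C, W, H, N)
        TensorInfo src_info(TensorShape(32U, 56U, 56U, 1U), 1, DataType::F32);
        TensorInfo wei_info(TensorShape(32U, 3U, 3U), 1, DataType::F32); // 3x3 depthwise filter
        TensorInfo dst_info(TensorShape(32U, 56U, 56U, 1U), 1, DataType::F32);
        src_info.set_data_layout(DataLayout::NHWC);
        wei_info.set_data_layout(DataLayout::NHWC);
        dst_info.set_data_layout(DataLayout::NHWC);
        src.allocator()->init(src_info);
        weights.allocator()->init(wei_info);
        dst.allocator()->init(dst_info);

        NEDepthwiseConvolutionLayer dwc;
        dwc.configure(&src, &weights, nullptr, &dst, PadStrideInfo(1, 1, 1, 1)); // stride 1, pad 1

        src.allocator()->allocate();
        weights.allocator()->allocate();
        dst.allocator()->allocate();

        dwc.run(); // dispatches to the OPTIMIZED or GENERIC path chosen at configure time
        return 0;
    }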
diff --git a/src/runtime/NEON/functions/NEDequantizationLayer.cpp b/src/runtime/NEON/functions/NEDequantizationLayer.cpp
index 42a0ee0895..28d19d2950 100644
--- a/src/runtime/NEON/functions/NEDequantizationLayer.cpp
+++ b/src/runtime/NEON/functions/NEDequantizationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -24,20 +24,43 @@
 #include "arm_compute/runtime/NEON/functions/NEDequantizationLayer.h"

-#include "arm_compute/core/NEON/kernels/NEDequantizationLayerKernel.h"
-#include "support/MemorySupport.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/Tensor.h"
+
+#include "src/cpu/operators/CpuDequantize.h"

 namespace arm_compute
 {
+struct NEDequantizationLayer::Impl
+{
+    const ITensor                       *src{nullptr};
+    ITensor                             *dst{nullptr};
+    std::unique_ptr<cpu::CpuDequantize>  op{nullptr};
+};
+
+NEDequantizationLayer::NEDequantizationLayer() : _impl(std::make_unique<Impl>())
+{
+}
+NEDequantizationLayer::~NEDequantizationLayer() = default;
+
 void NEDequantizationLayer::configure(const ITensor *input, ITensor *output)
 {
-    auto k = arm_compute::support::cpp14::make_unique<NEDequantizationLayerKernel>();
-    k->configure(input, output);
-    _kernel = std::move(k);
+    _impl->src = input;
+    _impl->dst = output;
+    _impl->op  = std::make_unique<cpu::CpuDequantize>();
+    _impl->op->configure(input->info(), output->info());
 }

 Status NEDequantizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
 {
-    return NEDequantizationLayerKernel::validate(input, output);
+    return cpu::CpuDequantize::validate(input, output);
+}
+
+void NEDequantizationLayer::run()
+{
+    ITensorPack pack;
+    pack.add_tensor(TensorType::ACL_SRC, _impl->src);
+    pack.add_tensor(TensorType::ACL_DST, _impl->dst);
+    _impl->op->run(pack);
 }
 } // namespace arm_compute
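NEDequantizationLayer is now a thin stateful wrapper: configure() builds a cpu::CpuDequantize from ITensorInfo only, and run() binds the actual tensors. A sketch of the equivalent operator-level call (cpu::CpuDequantize is an internal operator, so this mirrors the wrapper above rather than documenting a public API; the shapes and quantization parameters are assumptions):

    #include "arm_compute/runtime/Tensor.h"
    #include "src/cpu/operators/CpuDequantize.h"

    using namespace arm_compute;

    void dequantize_example()
    {
        Tensor q_src, f_dst;
        q_src.allocator()->init(TensorInfo(TensorShape(16U), 1, DataType::QASYMM8, QuantizationInfo(0.5f, 10)));
        f_dst.allocator()->init(TensorInfo(TensorShape(16U), 1, DataType::F32));

        cpu::CpuDequantize op;
        op.configure(q_src.info(), f_dst.info()); // configured on ITensorInfo, not ITensor

        q_src.allocator()->allocate();
        f_dst.allocator()->allocate();

        ITensorPack pack;
        pack.add_tensor(TensorType::ACL_SRC, &q_src);
        pack.add_tensor(TensorType::ACL_DST, &f_dst);
        op.run(pack); // tensors are bound only at run time
    }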
diff --git a/src/runtime/NEON/functions/NEDerivative.cpp b/src/runtime/NEON/functions/NEDerivative.cpp
deleted file mode 100644
index 81180307f6..0000000000
--- a/src/runtime/NEON/functions/NEDerivative.cpp
+++ /dev/null
@@ -1,52 +0,0 @@
-/*
- * Copyright (c) 2016, 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/NEON/functions/NEDerivative.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/NEON/kernels/NEDerivativeKernel.h"
-#include "arm_compute/core/PixelValue.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-
-using namespace arm_compute;
-
-NEDerivative::NEDerivative()
-    : _kernel(), _border_handler()
-{
-}
-
-void NEDerivative::configure(ITensor *input, ITensor *output_x, ITensor *output_y, BorderMode border_mode, uint8_t constant_border_value)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr));
-
-    _kernel.configure(input, output_x, output_y, border_mode == BorderMode::UNDEFINED);
-    _border_handler.configure(input, BorderSize(1), border_mode, PixelValue(constant_border_value));
-}
-
-void NEDerivative::run()
-{
-    NEScheduler::get().schedule(&_border_handler, Window::DimZ);
-    NEScheduler::get().schedule(&_kernel, Window::DimY);
-}
diff --git a/src/runtime/NEON/functions/NEDetectionPostProcessLayer.cpp b/src/runtime/NEON/functions/NEDetectionPostProcessLayer.cpp
index 0d97689d0a..b347390162 100644
--- a/src/runtime/NEON/functions/NEDetectionPostProcessLayer.cpp
+++ b/src/runtime/NEON/functions/NEDetectionPostProcessLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -27,6 +27,8 @@
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/Validate.h"

+#include "src/common/utils/Log.h"
+
 #include <cstddef>
 #include <ios>
 #include <list>
@@ -34,23 +36,36 @@
 namespace arm_compute
 {
 NEDetectionPostProcessLayer::NEDetectionPostProcessLayer(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _dequantize(), _detection_post_process(), _decoded_scores(), _run_dequantize(false)
+    : _memory_group(std::move(memory_manager)),
+      _dequantize(),
+      _detection_post_process(),
+      _decoded_scores(),
+      _run_dequantize(false)
 {
 }

-void NEDetectionPostProcessLayer::configure(const ITensor *input_box_encoding, const ITensor *input_scores, const ITensor *input_anchors,
-                                            ITensor *output_boxes, ITensor *output_classes, ITensor *output_scores, ITensor *num_detection, DetectionPostProcessLayerInfo info)
+void NEDetectionPostProcessLayer::configure(const ITensor *input_box_encoding,
+                                            const ITensor *input_scores,
+                                            const ITensor *input_anchors,
+                                            ITensor *output_boxes,
+                                            ITensor *output_classes,
+                                            ITensor *output_scores,
+                                            ITensor *num_detection,
+                                            DetectionPostProcessLayerInfo info)
 {
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input_box_encoding, input_scores, input_anchors, output_boxes, output_classes, output_scores);
-    ARM_COMPUTE_ERROR_THROW_ON(NEDetectionPostProcessLayer::validate(input_box_encoding->info(), input_scores->info(), input_anchors->info(), output_boxes->info(), output_classes->info(),
-                                                                     output_scores->info(),
-                                                                     num_detection->info(), info));
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input_box_encoding, input_scores, input_anchors, output_boxes, output_classes,
+                                 output_scores);
+    ARM_COMPUTE_ERROR_THROW_ON(NEDetectionPostProcessLayer::validate(
+        input_box_encoding->info(), input_scores->info(), input_anchors->info(), output_boxes->info(),
+        output_classes->info(), output_scores->info(), num_detection->info(), info));
+    ARM_COMPUTE_LOG_PARAMS(input_box_encoding, input_scores, input_anchors, output_boxes, output_classes, output_scores,
+                           num_detection, info);

     const ITensor *input_scores_to_use = input_scores;
     DetectionPostProcessLayerInfo info_to_use = info;
     _run_dequantize = is_data_type_quantized(input_box_encoding->info()->data_type());

-    if(_run_dequantize)
+    if (_run_dequantize)
     {
         _memory_group.manage(&_decoded_scores);

@@ -59,26 +74,37 @@ void NEDetectionPostProcessLayer::configure(const ITensor *input_box_encoding, c
         input_scores_to_use = &_decoded_scores;

         // Create a new info struct to avoid dequantizing in the CPP layer
-        std::array<float, 4> scales_values{ info.scale_value_y(), info.scale_value_x(), info.scale_value_h(), info.scale_value_w() };
-        DetectionPostProcessLayerInfo info_quantized(info.max_detections(), info.max_classes_per_detection(), info.nms_score_threshold(), info.iou_threshold(), info.num_classes(),
-                                                     scales_values, info.use_regular_nms(), info.detection_per_class(), false);
+        std::array<float, 4> scales_values{info.scale_value_y(), info.scale_value_x(), info.scale_value_h(),
+                                           info.scale_value_w()};
+        DetectionPostProcessLayerInfo info_quantized(
+            info.max_detections(), info.max_classes_per_detection(), info.nms_score_threshold(), info.iou_threshold(),
+            info.num_classes(), scales_values, info.use_regular_nms(), info.detection_per_class(), false);
         info_to_use = info_quantized;
     }

-    _detection_post_process.configure(input_box_encoding, input_scores_to_use, input_anchors, output_boxes, output_classes, output_scores, num_detection, info_to_use);
+    _detection_post_process.configure(input_box_encoding, input_scores_to_use, input_anchors, output_boxes,
+                                      output_classes, output_scores, num_detection, info_to_use);
     _decoded_scores.allocator()->allocate();
 }

-Status NEDetectionPostProcessLayer::validate(const ITensorInfo *input_box_encoding, const ITensorInfo *input_scores, const ITensorInfo *input_anchors,
-                                             ITensorInfo *output_boxes, ITensorInfo *output_classes, ITensorInfo *output_scores, ITensorInfo *num_detection, DetectionPostProcessLayerInfo info)
+Status NEDetectionPostProcessLayer::validate(const ITensorInfo *input_box_encoding,
+                                             const ITensorInfo *input_scores,
+                                             const ITensorInfo *input_anchors,
+                                             ITensorInfo *output_boxes,
+                                             ITensorInfo *output_classes,
+                                             ITensorInfo *output_scores,
+                                             ITensorInfo *num_detection,
+                                             DetectionPostProcessLayerInfo info)
 {
     bool run_dequantize = is_data_type_quantized(input_box_encoding->data_type());
-    if(run_dequantize)
+    if (run_dequantize)
     {
         TensorInfo decoded_classes_info = input_scores->clone()->set_is_resizable(true).set_data_type(DataType::F32);
         ARM_COMPUTE_RETURN_ON_ERROR(NEDequantizationLayer::validate(input_scores, &decoded_classes_info));
     }
-    ARM_COMPUTE_RETURN_ON_ERROR(CPPDetectionPostProcessLayer::validate(input_box_encoding, input_scores, input_anchors, output_boxes, output_classes, output_scores, num_detection, info));
+    ARM_COMPUTE_RETURN_ON_ERROR(CPPDetectionPostProcessLayer::validate(input_box_encoding, input_scores, input_anchors,
+                                                                       output_boxes, output_classes, output_scores,
+                                                                       num_detection, info));

     return Status{};
 }
@@ -88,7 +114,7 @@ void NEDetectionPostProcessLayer::run()
     MemoryGroupResourceScope scope_mg(_memory_group);

     // Decode scores if necessary
-    if(_run_dequantize)
+    if (_run_dequantize)
     {
         _dequantize.run();
     }
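The _decoded_scores tensor above follows the library's transient-tensor idiom: it is registered with the function's MemoryGroup before allocation, and the backing memory is only held while a MemoryGroupResourceScope is alive inside run(). The idiom in isolation (a generic sketch, not code from this patch; the shape is an assumption):

    #include "arm_compute/runtime/MemoryGroup.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void scratch_example(std::shared_ptr<IMemoryManager> memory_manager)
    {
        MemoryGroup group(std::move(memory_manager));
        Tensor scratch;
        scratch.allocator()->init(TensorInfo(TensorShape(256U), 1, DataType::F32));

        group.manage(&scratch);          // declare the lifetime before allocating
        scratch.allocator()->allocate(); // backing memory is provided through the group

        MemoryGroupResourceScope scope_mg(group); // acquire memory for this scope
        // ... run kernels/functions that read and write scratch ...
    }   // scope ends: the manager may hand the same memory to another function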
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/NEON/functions/NEDilate.h"
-
-#include "arm_compute/core/NEON/kernels/NEDilateKernel.h"
-#include "arm_compute/core/PixelValue.h"
-#include "support/MemorySupport.h"
-
-#include <utility>
-
-using namespace arm_compute;
-
-void NEDilate::configure(ITensor *input, ITensor *output, BorderMode border_mode, uint8_t constant_border_value)
-{
-    auto k = arm_compute::support::cpp14::make_unique<NEDilateKernel>();
-    k->configure(input, output, border_mode == BorderMode::UNDEFINED);
-    _kernel = std::move(k);
-    _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
-}
diff --git a/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
index 751a3fa1fb..f1c2cf969f 100644
--- a/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -28,95 +28,58 @@
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"

-#include <cmath>
-#include <tuple>
+#include "src/cpu/operators/CpuDirectConv2d.h"

 namespace arm_compute
 {
+struct NEDirectConvolutionLayer::Impl
+{
+    ITensor                              *src{nullptr};
+    const ITensor                        *weights{nullptr};
+    const ITensor                        *bias{nullptr};
+    ITensor                              *dst{nullptr};
+    std::unique_ptr<cpu::CpuDirectConv2d> op{nullptr};
+};
+
 NEDirectConvolutionLayer::NEDirectConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _output_stage_kernel(), _conv_kernel(), _input_border_handler(), _activationlayer_function(), _accumulator(), _has_bias(false),
-      _is_activationlayer_enabled(false), _dim_split(Window::DimZ)
+    : _memory_manager(std::move(memory_manager)), _impl(std::make_unique<Impl>())
 {
 }
-
-void NEDirectConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *bias, ITensor *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info)
+NEDirectConvolutionLayer::~NEDirectConvolutionLayer() = default;
+
+void NEDirectConvolutionLayer::configure(ITensor *input,
+                                         const ITensor *weights,
+                                         const ITensor *bias,
+                                         ITensor *output,
+                                         const PadStrideInfo &conv_info,
+                                         const ActivationLayerInfo &act_info)
 {
-    ARM_COMPUTE_ERROR_ON(input->info()->data_layout() == DataLayout::UNKNOWN);
-
-    // Free accumulator
-    if(_accumulator.buffer() != nullptr)
-    {
-        _accumulator.allocator()->free();
-    }
-
-    _dim_split = input->info()->data_layout() == DataLayout::NCHW ? Window::DimZ : Window::DimY;
-
-    // Check if bias should be added in the convolution result
-    _has_bias = (bias != nullptr);
-
-    _conv_kernel.configure(input, weights, output, conv_info);
-    if(_has_bias)
-    {
-        _output_stage_kernel.configure(output, bias);
-    }
-
-    // Add zero padding XY
-    _input_border_handler.configure(input, _conv_kernel.border_size(), BorderMode::CONSTANT, PixelValue(static_cast<float>(0.f)));
-
-    //Configure Activation Layer
-    _is_activationlayer_enabled = act_info.enabled();
-    if(_is_activationlayer_enabled)
-    {
-        _activationlayer_function.configure(output, nullptr, act_info);
-    }
+    _impl->src     = input;
+    _impl->weights = weights;
+    _impl->bias    = bias;
+    _impl->dst     = output;
+    _impl->op      = std::make_unique<cpu::CpuDirectConv2d>(_memory_manager);
+    _impl->op->configure(input->info(), weights->info(), (bias != nullptr ? bias->info() : nullptr), output->info(),
+                         conv_info, act_info);
 }

-Status NEDirectConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *output, const PadStrideInfo &conv_info,
+Status NEDirectConvolutionLayer::validate(const ITensorInfo *input,
+                                          const ITensorInfo *weights,
+                                          const ITensorInfo *bias,
+                                          const ITensorInfo *output,
+                                          const PadStrideInfo &conv_info,
                                           const ActivationLayerInfo &act_info)
 {
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
-
-    // output might not be initialized since it can be an intermediate tensor of another layer
-    DataType data_type = input->data_type();
-    TensorInfo accumulator(output->clone()->set_is_resizable(true).reset_padding().set_data_type(data_type));
-
-    // Validate Convolution kernel
-    ARM_COMPUTE_RETURN_ON_ERROR(NEDirectConvolutionLayerKernel::validate(input, weights, &accumulator, conv_info));
-
-    if(bias != nullptr)
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, bias);
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(bias->dimension(0) != weights->dimension(3),
-                                        "Biases size and number of input feature maps should match");
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(bias->num_dimensions() > 1, "Biases should be one dimensional");
-    }
-
-    // Validate bias kernel
-    ARM_COMPUTE_RETURN_ON_ERROR(NEDirectConvolutionLayerOutputStageKernel::validate(&accumulator, bias, output));
-
-    if(act_info.enabled())
-    {
-        ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(output, nullptr, act_info));
-    }
-
-    return Status{};
+    return cpu::CpuDirectConv2d::validate(input, weights, bias, output, conv_info, act_info);
 }

 void NEDirectConvolutionLayer::run()
 {
-    NEScheduler::get().schedule(&_input_border_handler, Window::DimZ);
-
-    MemoryGroupResourceScope scope_mg(_memory_group);
-
-    NEScheduler::get().schedule(&_conv_kernel, _dim_split);
-    if(_has_bias)
-    {
-        NEScheduler::get().schedule(&_output_stage_kernel, Window::DimY);
-    }
-
-    if(_is_activationlayer_enabled)
-    {
-        _activationlayer_function.run();
-    }
+    ITensorPack pack;
+    pack.add_tensor(TensorType::ACL_SRC_0, _impl->src);
+    pack.add_tensor(TensorType::ACL_SRC_1, _impl->weights);
+    pack.add_tensor(TensorType::ACL_SRC_2, _impl->bias);
+    pack.add_tensor(TensorType::ACL_DST, _impl->dst);
+    _impl->op->run(pack);
 }
 } // namespace arm_compute
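As with the depthwise case, border filling, the bias output stage and the fused activation all now live inside cpu::CpuDirectConv2d, so the NE function body reduces to pointer bookkeeping plus a tensor pack. A caller-side sketch (the shapes are assumptions; the public signature is unchanged by this patch):

    #include "arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void direct_conv_example()
    {
        Tensor src, weights, bias, dst;
        src.allocator()->init(TensorInfo(TensorShape(32U, 32U, 3U), 1, DataType::F32));       // W x H x C, NCHW
        weights.allocator()->init(TensorInfo(TensorShape(3U, 3U, 3U, 8U), 1, DataType::F32)); // 3x3, 8 filters
        bias.allocator()->init(TensorInfo(TensorShape(8U), 1, DataType::F32));
        dst.allocator()->init(TensorInfo(TensorShape(30U, 30U, 8U), 1, DataType::F32));       // no pad, stride 1

        NEDirectConvolutionLayer conv; // optionally pass a shared IMemoryManager
        conv.configure(&src, &weights, &bias, &dst, PadStrideInfo(1, 1, 0, 0),
                       ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));

        src.allocator()->allocate();
        weights.allocator()->allocate();
        bias.allocator()->allocate();
        dst.allocator()->allocate();

        conv.run(); // packs ACL_SRC_0..2 / ACL_DST and calls CpuDirectConv2d::run()
    }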
diff --git a/src/runtime/NEON/functions/NEElementwiseOperations.cpp b/src/runtime/NEON/functions/NEElementwiseOperations.cpp
new file mode 100644
index 0000000000..685ef2d4d7
--- /dev/null
+++ b/src/runtime/NEON/functions/NEElementwiseOperations.cpp
@@ -0,0 +1,356 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEElementwiseOperations.h"
+
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/cpu/operators/CpuElementwise.h"
+
+#include <utility>
+
+namespace arm_compute
+{
+struct NEElementwiseMax::Impl
+{
+    const ITensor                          *src_0{nullptr};
+    const ITensor                          *src_1{nullptr};
+    ITensor                                *dst{nullptr};
+    std::unique_ptr<cpu::CpuElementwiseMax> op{nullptr};
+};
+
+NEElementwiseMax::NEElementwiseMax() : _impl(std::make_unique<Impl>())
+{
+}
+NEElementwiseMax::NEElementwiseMax(NEElementwiseMax &&) = default;
+NEElementwiseMax &NEElementwiseMax::operator=(NEElementwiseMax &&) = default;
+NEElementwiseMax::~NEElementwiseMax() = default;
+
+void NEElementwiseMax::configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info)
+{
+    ARM_COMPUTE_UNUSED(act_info);
+    _impl->src_0 = input1;
+    _impl->src_1 = input2;
+    _impl->dst   = output;
+    _impl->op    = std::make_unique<cpu::CpuElementwiseMax>();
+    _impl->op->configure(input1->info(), input2->info(), output->info());
+}
+
+Status NEElementwiseMax::validate(const ITensorInfo *input1,
+                                  const ITensorInfo *input2,
+                                  const ITensorInfo *output,
+                                  const ActivationLayerInfo &act_info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled());
+    return cpu::CpuElementwiseMax::validate(input1, input2, output);
+}
+
+void NEElementwiseMax::run()
+{
+    ITensorPack pack;
+    pack.add_tensor(TensorType::ACL_SRC_0, _impl->src_0);
+    pack.add_tensor(TensorType::ACL_SRC_1, _impl->src_1);
+    pack.add_tensor(TensorType::ACL_DST, _impl->dst);
+    _impl->op->run(pack);
+}
+
+struct NEElementwiseMin::Impl
+{
+    const ITensor                          *src_0{nullptr};
+    const ITensor                          *src_1{nullptr};
+    ITensor                                *dst{nullptr};
+    std::unique_ptr<cpu::CpuElementwiseMin> op{nullptr};
+};
+
+NEElementwiseMin::NEElementwiseMin() : _impl(std::make_unique<Impl>())
+{
+}
+NEElementwiseMin::NEElementwiseMin(NEElementwiseMin &&) = default;
+NEElementwiseMin &NEElementwiseMin::operator=(NEElementwiseMin &&) = default;
+NEElementwiseMin::~NEElementwiseMin() = default;
+
+void NEElementwiseMin::configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info)
+{
+    ARM_COMPUTE_UNUSED(act_info);
+    _impl->src_0 = input1;
+    _impl->src_1 = input2;
+    _impl->dst   = output;
+    _impl->op    = std::make_unique<cpu::CpuElementwiseMin>();
+    _impl->op->configure(input1->info(), input2->info(), output->info());
+}
+
+Status NEElementwiseMin::validate(const ITensorInfo *input1,
+                                  const ITensorInfo *input2,
+                                  const ITensorInfo *output,
+                                  const ActivationLayerInfo &act_info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled());
+    return cpu::CpuElementwiseMin::validate(input1, input2, output);
+}
+
+void NEElementwiseMin::run()
+{
+    ITensorPack pack;
+    pack.add_tensor(TensorType::ACL_SRC_0, _impl->src_0);
+    pack.add_tensor(TensorType::ACL_SRC_1, _impl->src_1);
+    pack.add_tensor(TensorType::ACL_DST, _impl->dst);
+    _impl->op->run(pack);
+}
+
+struct NEElementwiseSquaredDiff::Impl
+{
+    const ITensor                                  *src_0{nullptr};
+    const ITensor                                  *src_1{nullptr};
+    ITensor                                        *dst{nullptr};
+    std::unique_ptr<cpu::CpuElementwiseSquaredDiff> op{nullptr};
+};
+
+NEElementwiseSquaredDiff::NEElementwiseSquaredDiff() : _impl(std::make_unique<Impl>())
+{
+}
+NEElementwiseSquaredDiff::NEElementwiseSquaredDiff(NEElementwiseSquaredDiff &&) = default;
+NEElementwiseSquaredDiff &NEElementwiseSquaredDiff::operator=(NEElementwiseSquaredDiff &&) = default;
+NEElementwiseSquaredDiff::~NEElementwiseSquaredDiff() = default;
+
+void NEElementwiseSquaredDiff::configure(ITensor *input1,
+                                         ITensor *input2,
+                                         ITensor *output,
+                                         const ActivationLayerInfo &act_info)
+{
+    ARM_COMPUTE_UNUSED(act_info);
+    _impl->src_0 = input1;
+    _impl->src_1 = input2;
+    _impl->dst   = output;
+    _impl->op    = std::make_unique<cpu::CpuElementwiseSquaredDiff>();
+    _impl->op->configure(input1->info(), input2->info(), output->info());
+}
+
+Status NEElementwiseSquaredDiff::validate(const ITensorInfo *input1,
+                                          const ITensorInfo *input2,
+                                          const ITensorInfo *output,
+                                          const ActivationLayerInfo &act_info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled());
+    return cpu::CpuElementwiseSquaredDiff::validate(input1, input2, output);
+}
+
+void NEElementwiseSquaredDiff::run()
+{
+    ITensorPack pack;
+    pack.add_tensor(TensorType::ACL_SRC_0, _impl->src_0);
+    pack.add_tensor(TensorType::ACL_SRC_1, _impl->src_1);
+    pack.add_tensor(TensorType::ACL_DST, _impl->dst);
+    _impl->op->run(pack);
+}
+
+struct NEElementwiseDivision::Impl
+{
+    const ITensor                               *src_0{nullptr};
+    const ITensor                               *src_1{nullptr};
+    ITensor                                     *dst{nullptr};
+    std::unique_ptr<cpu::CpuElementwiseDivision> op{nullptr};
+};
+
+NEElementwiseDivision::NEElementwiseDivision() : _impl(std::make_unique<Impl>())
+{
+}
+NEElementwiseDivision::NEElementwiseDivision(NEElementwiseDivision &&) = default;
+NEElementwiseDivision &NEElementwiseDivision::operator=(NEElementwiseDivision &&) = default;
+NEElementwiseDivision::~NEElementwiseDivision() = default;
+
+void NEElementwiseDivision::configure(ITensor *input1,
+                                      ITensor *input2,
+                                      ITensor *output,
+                                      const ActivationLayerInfo &act_info)
+{
+    ARM_COMPUTE_UNUSED(act_info);
+    _impl->src_0 = input1;
+    _impl->src_1 = input2;
+    _impl->dst   = output;
+    _impl->op    = std::make_unique<cpu::CpuElementwiseDivision>();
+    _impl->op->configure(input1->info(), input2->info(), output->info());
+}
+
+Status NEElementwiseDivision::validate(const ITensorInfo *input1,
+                                       const ITensorInfo *input2,
+                                       const ITensorInfo *output,
+                                       const ActivationLayerInfo &act_info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled());
+    return cpu::CpuElementwiseDivision::validate(input1, input2, output);
+}
+
+void NEElementwiseDivision::run()
+{
+    ITensorPack pack;
+    pack.add_tensor(TensorType::ACL_SRC_0, _impl->src_0);
+    pack.add_tensor(TensorType::ACL_SRC_1, _impl->src_1);
+    pack.add_tensor(TensorType::ACL_DST, _impl->dst);
+    _impl->op->run(pack);
+}
+
+struct NEElementwisePower::Impl
+{
+    const ITensor                            *src_0{nullptr};
+    const ITensor                            *src_1{nullptr};
+    ITensor                                  *dst{nullptr};
+    std::unique_ptr<cpu::CpuElementwisePower> op{nullptr};
+};
+
+NEElementwisePower::NEElementwisePower() : _impl(std::make_unique<Impl>())
+{
+}
+NEElementwisePower::NEElementwisePower(NEElementwisePower &&) = default;
+NEElementwisePower &NEElementwisePower::operator=(NEElementwisePower &&) = default;
+NEElementwisePower::~NEElementwisePower() = default;
+
+void NEElementwisePower::configure(ITensor *input1,
+                                   ITensor *input2,
+                                   ITensor *output,
+                                   const ActivationLayerInfo &act_info)
+{
+    ARM_COMPUTE_UNUSED(act_info);
+    _impl->src_0 = input1;
+    _impl->src_1 = input2;
+    _impl->dst   = output;
+    _impl->op    = std::make_unique<cpu::CpuElementwisePower>();
+    _impl->op->configure(input1->info(), input2->info(), output->info());
+}
+
+Status NEElementwisePower::validate(const ITensorInfo *input1,
+                                    const ITensorInfo *input2,
+                                    const ITensorInfo *output,
+                                    const ActivationLayerInfo &act_info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled());
+    return cpu::CpuElementwisePower::validate(input1, input2, output);
+}
+
+void NEElementwisePower::run()
+{
+    ITensorPack pack;
+    pack.add_tensor(TensorType::ACL_SRC_0, _impl->src_0);
+    pack.add_tensor(TensorType::ACL_SRC_1, _impl->src_1);
+    pack.add_tensor(TensorType::ACL_DST, _impl->dst);
+    _impl->op->run(pack);
+}
+
+template <ComparisonOperation COP>
+struct NEElementwiseComparisonStatic<COP>::Impl
+{
+    const ITensor                                             *src_0{nullptr};
+    const ITensor                                             *src_1{nullptr};
+    ITensor                                                   *dst{nullptr};
+    std::unique_ptr<cpu::CpuElementwiseComparisonStatic<COP>>  op{nullptr};
+};
+
+template <ComparisonOperation COP>
+NEElementwiseComparisonStatic<COP>::NEElementwiseComparisonStatic() : _impl(std::make_unique<Impl>())
+{
+}
+template <ComparisonOperation COP>
+NEElementwiseComparisonStatic<COP>::NEElementwiseComparisonStatic(NEElementwiseComparisonStatic &&) = default;
+template <ComparisonOperation COP>
+NEElementwiseComparisonStatic<COP> &
+NEElementwiseComparisonStatic<COP>::operator=(NEElementwiseComparisonStatic &&) = default;
+template <ComparisonOperation COP>
+NEElementwiseComparisonStatic<COP>::~NEElementwiseComparisonStatic() = default;
+
+template <ComparisonOperation COP>
+void NEElementwiseComparisonStatic<COP>::configure(ITensor *input1, ITensor *input2, ITensor *output)
+{
+    _impl->src_0 = input1;
+    _impl->src_1 = input2;
+    _impl->dst   = output;
+    _impl->op    = std::make_unique<cpu::CpuElementwiseComparisonStatic<COP>>();
+    _impl->op->configure(input1->info(), input2->info(), output->info());
+}
+
+template <ComparisonOperation COP>
+Status NEElementwiseComparisonStatic<COP>::validate(const ITensorInfo *input1,
+                                                    const ITensorInfo *input2,
+                                                    const ITensorInfo *output)
+{
+    return cpu::CpuElementwiseComparisonStatic<COP>::validate(input1, input2, output);
+}
+
+template <ComparisonOperation COP>
+void NEElementwiseComparisonStatic<COP>::run()
+{
+    ITensorPack pack;
+    pack.add_tensor(TensorType::ACL_SRC_0, _impl->src_0);
+    pack.add_tensor(TensorType::ACL_SRC_1, _impl->src_1);
+    pack.add_tensor(TensorType::ACL_DST, _impl->dst);
+    _impl->op->run(pack);
+}
+
+struct NEElementwiseComparison::Impl
+{
+    const ITensor                                 *src_0{nullptr};
+    const ITensor                                 *src_1{nullptr};
+    ITensor                                       *dst{nullptr};
+    std::unique_ptr<cpu::CpuElementwiseComparison> op{nullptr};
+};
+
+NEElementwiseComparison::NEElementwiseComparison() : _impl(std::make_unique<Impl>())
+{
+}
+NEElementwiseComparison::NEElementwiseComparison(NEElementwiseComparison &&) = default;
+NEElementwiseComparison &NEElementwiseComparison::operator=(NEElementwiseComparison &&) = default;
+NEElementwiseComparison::~NEElementwiseComparison() = default;
+
+void NEElementwiseComparison::configure(ITensor *input1, ITensor *input2, ITensor *output, ComparisonOperation op)
+{
+    _impl->src_0 = input1;
+    _impl->src_1 = input2;
+    _impl->dst   = output;
+    _impl->op    = std::make_unique<cpu::CpuElementwiseComparison>();
+    _impl->op->configure(input1->info(), input2->info(), output->info(), op);
+}
+
+Status NEElementwiseComparison::validate(const ITensorInfo *input1,
+                                         const ITensorInfo *input2,
+                                         const ITensorInfo *output,
+                                         ComparisonOperation op)
+{
+    return cpu::CpuElementwiseComparison::validate(input1, input2, output, op);
+}
+
+void NEElementwiseComparison::run()
+{
+    ITensorPack pack;
+    pack.add_tensor(TensorType::ACL_SRC_0, _impl->src_0);
+    pack.add_tensor(TensorType::ACL_SRC_1, _impl->src_1);
+    pack.add_tensor(TensorType::ACL_DST, _impl->dst);
+    _impl->op->run(pack);
+}
+
+// Supported Specializations
+template class NEElementwiseComparisonStatic<ComparisonOperation::Equal>;
+template class NEElementwiseComparisonStatic<ComparisonOperation::NotEqual>;
+template class NEElementwiseComparisonStatic<ComparisonOperation::Greater>;
+template class NEElementwiseComparisonStatic<ComparisonOperation::GreaterEqual>;
+template class NEElementwiseComparisonStatic<ComparisonOperation::Less>;
+template class NEElementwiseComparisonStatic<ComparisonOperation::LessEqual>;
+} // namespace arm_compute
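Every class in this new file repeats the same shape: a pimpl holding src/dst pointers plus the matching cpu:: operator, with validate() explicitly rejecting a fused activation on CPU (ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled())). A usage sketch (F32 inputs are an assumption; comparison outputs are U8):

    #include "arm_compute/runtime/NEON/functions/NEElementwiseOperations.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void elementwise_example()
    {
        Tensor a, b, max_out, cmp_out;
        const TensorShape shape(16U);
        a.allocator()->init(TensorInfo(shape, 1, DataType::F32));
        b.allocator()->init(TensorInfo(shape, 1, DataType::F32));
        max_out.allocator()->init(TensorInfo(shape, 1, DataType::F32));
        cmp_out.allocator()->init(TensorInfo(shape, 1, DataType::U8)); // comparisons write U8

        NEElementwiseMax emax;
        emax.configure(&a, &b, &max_out); // act_info must stay default: validate() rejects it

        NEElementwiseComparison greater;
        greater.configure(&a, &b, &cmp_out, ComparisonOperation::Greater);

        a.allocator()->allocate();
        b.allocator()->allocate();
        max_out.allocator()->allocate();
        cmp_out.allocator()->allocate();

        emax.run();
        greater.run();
    }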
diff --git a/src/runtime/NEON/functions/NEElementwiseOperators.cpp b/src/runtime/NEON/functions/NEElementwiseOperators.cpp
deleted file mode 100644
index 926ae1fa21..0000000000
--- a/src/runtime/NEON/functions/NEElementwiseOperators.cpp
+++ /dev/null
@@ -1,138 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/NEON/functions/NEElementwiseOperations.h"
-#include <arm_compute/core/NEON/kernels/NEElementwiseOperationKernel.h>
-
-#include "arm_compute/core/ITensor.h"
-#include "support/MemorySupport.h"
-
-#include <utility>
-
-namespace arm_compute
-{
-void NEElementwiseMax::configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info)
-{
-    ARM_COMPUTE_UNUSED(act_info);
-    auto k = arm_compute::support::cpp14::make_unique<NEArithmeticOperationKernel>();
-    k->configure(ArithmeticOperation::MAX, input1, input2, output);
-    _kernel = std::move(k);
-}
-
-Status NEElementwiseMax::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info)
-{
-    ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled());
-    return NEArithmeticOperationKernel::validate(ArithmeticOperation::MAX, input1, input2, output);
-}
-
-void NEElementwiseMin::configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info)
-{
-    ARM_COMPUTE_UNUSED(act_info);
-    auto k = arm_compute::support::cpp14::make_unique<NEArithmeticOperationKernel>();
-    k->configure(ArithmeticOperation::MIN, input1, input2, output);
-    _kernel = std::move(k);
-}
-
-Status NEElementwiseMin::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info)
-{
-    ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled());
-    return NEArithmeticOperationKernel::validate(ArithmeticOperation::MIN, input1, input2, output);
-}
-
-void NEElementwiseSquaredDiff::configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info)
-{
-    ARM_COMPUTE_UNUSED(act_info);
-    auto k = arm_compute::support::cpp14::make_unique<NEArithmeticOperationKernel>();
-    k->configure(ArithmeticOperation::SQUARED_DIFF, input1, input2, output);
-    _kernel = std::move(k);
-}
-
-Status NEElementwiseSquaredDiff::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info)
-{
-    ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled());
-    return NEArithmeticOperationKernel::validate(ArithmeticOperation::SQUARED_DIFF, input1, input2, output);
-}
-
-void NEElementwiseDivision::configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info)
-{
-    ARM_COMPUTE_UNUSED(act_info);
-    auto k = arm_compute::support::cpp14::make_unique<NEDivisionOperationKernel>();
-    k->configure(input1, input2, output);
-    _kernel = std::move(k);
-}
-
-Status NEElementwiseDivision::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info)
-{
-    ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled());
-    return NEDivisionOperationKernel::validate(input1, input2, output);
-}
-
-void NEElementwisePower::configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info)
-{
-    ARM_COMPUTE_UNUSED(act_info);
-    auto k = arm_compute::support::cpp14::make_unique<NEPowerOperationKernel>();
-    k->configure(input1, input2, output);
-    _kernel = std::move(k);
-}
-
-Status NEElementwisePower::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info)
-{
-    ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled());
-    return NEPowerOperationKernel::validate(input1, input2, output);
-}
-
-template <ComparisonOperation COP>
-void NEElementwiseComparisonStatic<COP>::configure(ITensor *input1, ITensor *input2, ITensor *output)
-{
-    auto k = arm_compute::support::cpp14::make_unique<NEComparisonOperationKernel>();
-    k->configure(COP, input1, input2, output);
-    _kernel = std::move(k);
-}
-
-template <ComparisonOperation COP>
-Status NEElementwiseComparisonStatic<COP>::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
-{
-    return NEComparisonOperationKernel::validate(COP, input1, input2, output);
-}
-
-void NEElementwiseComparison::configure(ITensor *input1, ITensor *input2, ITensor *output, ComparisonOperation op)
-{
-    auto k = arm_compute::support::cpp14::make_unique<NEComparisonOperationKernel>();
-    k->configure(op, input1, input2, output);
-    _kernel = std::move(k);
-}
-
-Status NEElementwiseComparison::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ComparisonOperation op)
-{
-    return NEComparisonOperationKernel::validate(op, input1, input2, output);
-}
-
-// Supported Specializations
-template class NEElementwiseComparisonStatic<ComparisonOperation::Equal>;
-template class NEElementwiseComparisonStatic<ComparisonOperation::NotEqual>;
-template class NEElementwiseComparisonStatic<ComparisonOperation::Greater>;
-template class NEElementwiseComparisonStatic<ComparisonOperation::GreaterEqual>;
-template class NEElementwiseComparisonStatic<ComparisonOperation::Less>;
-template class NEElementwiseComparisonStatic<ComparisonOperation::LessEqual>;
-} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEElementwiseUnaryLayer.cpp b/src/runtime/NEON/functions/NEElementwiseUnaryLayer.cpp
index 80db027398..23a092c407 100644
--- a/src/runtime/NEON/functions/NEElementwiseUnaryLayer.cpp
+++ b/src/runtime/NEON/functions/NEElementwiseUnaryLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -23,88 +23,63 @@
 */
 #include "arm_compute/runtime/NEON/functions/NEElementwiseUnaryLayer.h"

-#include "arm_compute/core/NEON/kernels/NEElementwiseUnaryKernel.h"
-#include "support/MemorySupport.h"
+#include "src/cpu/operators/CpuElementwiseUnary.h"

 #include <utility>

 namespace arm_compute
 {
-void NERsqrtLayer::configure(const ITensor *input, ITensor *output)
-{
-    auto k = arm_compute::support::cpp14::make_unique<NEElementwiseUnaryKernel>();
-    k->configure(ElementWiseUnary::RSQRT, input, output);
-    _kernel = std::move(k);
-}
-Status NERsqrtLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
-{
-    return NEElementwiseUnaryKernel::validate(ElementWiseUnary::RSQRT, input, output);
-}
+using OperatorType = cpu::CpuElementwiseUnary;

-void NEExpLayer::configure(const ITensor *input, ITensor *output)
-{
-    auto k = arm_compute::support::cpp14::make_unique<NEElementwiseUnaryKernel>();
-    k->configure(ElementWiseUnary::EXP, input, output);
-    _kernel = std::move(k);
-}
-Status NEExpLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
+template <ElementWiseUnary op>
+struct NEElementwiseUnaryLayer<op>::Impl
 {
-    return NEElementwiseUnaryKernel::validate(ElementWiseUnary::EXP, input, output);
-}
+    const ITensor                *src{nullptr};
+    ITensor                      *dst{nullptr};
+    std::unique_ptr<OperatorType> cpu_op{nullptr};
+};

-void NENegLayer::configure(const ITensor *input, ITensor *output)
+template <ElementWiseUnary op>
+NEElementwiseUnaryLayer<op>::NEElementwiseUnaryLayer() : _impl(std::make_unique<Impl>())
 {
-    auto k = arm_compute::support::cpp14::make_unique<NEElementwiseUnaryKernel>();
-    k->configure(ElementWiseUnary::NEG, input, output);
-    _kernel = std::move(k);
-}
-Status NENegLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
-{
-    return NEElementwiseUnaryKernel::validate(ElementWiseUnary::NEG, input, output);
 }
+template <ElementWiseUnary op>
+NEElementwiseUnaryLayer<op>::~NEElementwiseUnaryLayer() = default;
+template <ElementWiseUnary op>
+NEElementwiseUnaryLayer<op>::NEElementwiseUnaryLayer(NEElementwiseUnaryLayer &&) = default;
+template <ElementWiseUnary op>
+NEElementwiseUnaryLayer<op> &NEElementwiseUnaryLayer<op>::operator=(NEElementwiseUnaryLayer &&) = default;

-void NELogLayer::configure(const ITensor *input, ITensor *output)
+template <ElementWiseUnary op>
+void NEElementwiseUnaryLayer<op>::configure(const ITensor *input, ITensor *output)
 {
-    auto k = arm_compute::support::cpp14::make_unique<NEElementwiseUnaryKernel>();
-    k->configure(ElementWiseUnary::LOG, input, output);
-    _kernel = std::move(k);
-}
-Status NELogLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
-{
-    return NEElementwiseUnaryKernel::validate(ElementWiseUnary::LOG, input, output);
+    _impl->src    = input;
+    _impl->dst    = output;
+    _impl->cpu_op = std::make_unique<OperatorType>();
+    _impl->cpu_op->configure(op, *_impl->src->info(), *_impl->dst->info());
 }

-void NEAbsLayer::configure(const ITensor *input, ITensor *output)
-{
-    auto k = arm_compute::support::cpp14::make_unique<NEElementwiseUnaryKernel>();
-    k->configure(ElementWiseUnary::ABS, input, output);
-    _kernel = std::move(k);
-}
-Status NEAbsLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
+template <ElementWiseUnary op>
+Status NEElementwiseUnaryLayer<op>::validate(const ITensorInfo *input, const ITensorInfo *output)
 {
-    return NEElementwiseUnaryKernel::validate(ElementWiseUnary::ABS, input, output);
+    return OperatorType::validate(op, *input, *output);
 }

-void NERoundLayer::configure(const ITensor *input, ITensor *output)
-{
-    auto k = arm_compute::support::cpp14::make_unique<NEElementwiseUnaryKernel>();
-    k->configure(ElementWiseUnary::ROUND, input, output);
-    _kernel = std::move(k);
-}
-Status NERoundLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
+template <ElementWiseUnary op>
+void NEElementwiseUnaryLayer<op>::run()
 {
-    return NEElementwiseUnaryKernel::validate(ElementWiseUnary::ROUND, input, output);
+    ITensorPack pack;
+    pack.add_tensor(TensorType::ACL_SRC, _impl->src);
+    pack.add_tensor(TensorType::ACL_DST, _impl->dst);
+    _impl->cpu_op->run(pack);
 }

-void NESinLayer::configure(const ITensor *input, ITensor *output)
-{
-    auto k = arm_compute::support::cpp14::make_unique<NEElementwiseUnaryKernel>();
-    k->configure(ElementWiseUnary::SIN, input, output);
-    _kernel = std::move(k);
-}
-Status NESinLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
-{
-    return NEElementwiseUnaryKernel::validate(ElementWiseUnary::SIN, input, output);
-}
+template class NEElementwiseUnaryLayer<ElementWiseUnary::RSQRT>;
+template class NEElementwiseUnaryLayer<ElementWiseUnary::EXP>;
+template class NEElementwiseUnaryLayer<ElementWiseUnary::NEG>;
+template class NEElementwiseUnaryLayer<ElementWiseUnary::LOG>;
+template class NEElementwiseUnaryLayer<ElementWiseUnary::ABS>;
+template class NEElementwiseUnaryLayer<ElementWiseUnary::ROUND>;
+template class NEElementwiseUnaryLayer<ElementWiseUnary::SIN>;
 } // namespace arm_compute
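The seven single-purpose classes above collapse into one template over ElementWiseUnary; if the accompanying header change follows the same pattern, the old names survive as aliases (e.g. NERsqrtLayer for the RSQRT instantiation), so existing callers recompile unchanged — treat that alias as an assumption here. Usage sketch:

    #include "arm_compute/runtime/NEON/functions/NEElementwiseUnaryLayer.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void rsqrt_example()
    {
        Tensor src, dst;
        src.allocator()->init(TensorInfo(TensorShape(64U), 1, DataType::F32));
        dst.allocator()->init(TensorInfo(TensorShape(64U), 1, DataType::F32));

        NEElementwiseUnaryLayer<ElementWiseUnary::RSQRT> rsqrt; // assumed alias: NERsqrtLayer
        rsqrt.configure(&src, &dst);

        src.allocator()->allocate();
        dst.allocator()->allocate();
        rsqrt.run(); // forwards {ACL_SRC, ACL_DST} to cpu::CpuElementwiseUnary
    }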
diff --git a/src/runtime/NEON/functions/NEEqualizeHistogram.cpp b/src/runtime/NEON/functions/NEEqualizeHistogram.cpp
deleted file mode 100644
index 70b93cae9e..0000000000
--- a/src/runtime/NEON/functions/NEEqualizeHistogram.cpp
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
- * Copyright (c) 2016, 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/NEON/functions/NEEqualizeHistogram.h"
-
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-
-using namespace arm_compute;
-
-NEEqualizeHistogram::NEEqualizeHistogram()
-    : _histogram_kernel(), _cd_histogram_kernel(), _map_histogram_kernel(), _hist(nr_bins, 0, max_range), _cum_dist(nr_bins, 0, max_range), _cd_lut(nr_bins, DataType::U8)
-{
-}
-
-void NEEqualizeHistogram::configure(const IImage *input, IImage *output)
-{
-    ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
-    ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(output);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
-
-    // Configure kernels
-    _histogram_kernel.configure(input, &_hist);
-    _cd_histogram_kernel.configure(input, &_hist, &_cum_dist, &_cd_lut);
-    _map_histogram_kernel.configure(input, &_cd_lut, output);
-}
-
-void NEEqualizeHistogram::run()
-{
-    // Calculate histogram of input.
-    NEScheduler::get().schedule(&_histogram_kernel, Window::DimY);
-
-    // Calculate cumulative distribution of histogram and create LUT.
-    NEScheduler::get().schedule(&_cd_histogram_kernel, Window::DimY);
-
-    // Map input to output using created LUT.
-    NEScheduler::get().schedule(&_map_histogram_kernel, Window::DimY);
-}
diff --git a/src/runtime/NEON/functions/NEErode.cpp b/src/runtime/NEON/functions/NEErode.cpp
deleted file mode 100644
index 4f773b7091..0000000000
--- a/src/runtime/NEON/functions/NEErode.cpp
+++ /dev/null
@@ -1,40 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/NEON/functions/NEErode.h"
-
-#include "arm_compute/core/NEON/kernels/NEErodeKernel.h"
-#include "arm_compute/core/PixelValue.h"
-#include "support/MemorySupport.h"
-
-#include <utility>
-
-using namespace arm_compute;
-
-void NEErode::configure(ITensor *input, ITensor *output, BorderMode border_mode, uint8_t constant_border_value)
-{
-    auto k = arm_compute::support::cpp14::make_unique<NEErodeKernel>();
-    k->configure(input, output, border_mode == BorderMode::UNDEFINED);
-    _kernel = std::move(k);
-    _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
-}
diff --git a/src/runtime/NEON/functions/NEFFT1D.cpp b/src/runtime/NEON/functions/NEFFT1D.cpp
index 25ba1c8391..fb75f9da29 100644
--- a/src/runtime/NEON/functions/NEFFT1D.cpp
+++ b/src/runtime/NEON/functions/NEFFT1D.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -25,13 +25,28 @@
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/helpers/fft.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"

+#include "src/common/utils/Log.h"
+#include "src/core/NEON/kernels/NEFFTDigitReverseKernel.h"
+#include "src/core/NEON/kernels/NEFFTRadixStageKernel.h"
+#include "src/core/NEON/kernels/NEFFTScaleKernel.h"
+#include "src/core/utils/helpers/fft.h"
+
 namespace arm_compute
 {
+NEFFT1D::~NEFFT1D() = default;
+
 NEFFT1D::NEFFT1D(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _digit_reverse_kernel(), _fft_kernels(), _scale_kernel(), _digit_reversed_input(), _digit_reverse_indices(), _num_ffts(0), _axis(0), _run_scale(false)
+    : _memory_group(std::move(memory_manager)),
+      _digit_reverse_kernel(),
+      _fft_kernels(),
+      _scale_kernel(),
+      _digit_reversed_input(),
+      _digit_reverse_indices(),
+      _num_ffts(0),
+      _axis(0),
+      _run_scale(false)
 {
 }

@@ -39,6 +54,7 @@ void NEFFT1D::configure(const ITensor *input, ITensor *output, const FFT1DInfo &
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_ERROR_THROW_ON(NEFFT1D::validate(input->info(), output->info(), config));
+    ARM_COMPUTE_LOG_PARAMS(input, output, config);

     // Decompose size to radix factors
     const auto supported_radix = NEFFTRadixStageKernel::supported_radix();
@@ -58,7 +74,8 @@ void NEFFT1D::configure(const ITensor *input, ITensor *output, const FFT1DInfo &
     TensorInfo digit_reverse_indices_info(TensorShape(input->info()->tensor_shape()[config.axis]), 1, DataType::U32);
     _digit_reverse_indices.allocator()->init(digit_reverse_indices_info);
     _memory_group.manage(&_digit_reversed_input);
-    _digit_reverse_kernel.configure(input, &_digit_reversed_input, &_digit_reverse_indices, digit_reverse_config);
+    _digit_reverse_kernel = std::make_unique<NEFFTDigitReverseKernel>();
+    _digit_reverse_kernel->configure(input, &_digit_reversed_input, &_digit_reverse_indices, digit_reverse_config);

     // Create and configure FFT kernels
     unsigned int Nx = 1;
@@ -66,7 +83,7 @@ void NEFFT1D::configure(const ITensor *input, ITensor *output, const FFT1DInfo &
     _fft_kernels.resize(_num_ffts);
     _axis = config.axis;

-    for(unsigned int i = 0; i < _num_ffts; ++i)
+    for (unsigned int i = 0; i < _num_ffts; ++i)
     {
         const unsigned int radix_for_stage = decomposed_vector.at(i);

@@ -75,18 +92,22 @@ void NEFFT1D::configure(const ITensor *input, ITensor *output, const FFT1DInfo &
         fft_kernel_info.radix          = radix_for_stage;
         fft_kernel_info.Nx             = Nx;
fft_kernel_info.is_first_stage = (i == 0); - _fft_kernels[i].configure(&_digit_reversed_input, ((i == (_num_ffts - 1)) && !is_c2r) ? output : nullptr, fft_kernel_info); + _fft_kernels[i] = std::make_unique<NEFFTRadixStageKernel>(); + _fft_kernels[i]->configure(&_digit_reversed_input, ((i == (_num_ffts - 1)) && !is_c2r) ? output : nullptr, + fft_kernel_info); Nx *= radix_for_stage; } // Configure scale kernel - if(_run_scale) + if (_run_scale) { FFTScaleKernelInfo scale_config; scale_config.scale = static_cast<float>(N); scale_config.conjugate = config.direction == FFTDirection::Inverse; - is_c2r ? _scale_kernel.configure(&_digit_reversed_input, output, scale_config) : _scale_kernel.configure(output, nullptr, scale_config); + _scale_kernel = std::make_unique<NEFFTScaleKernel>(); + is_c2r ? _scale_kernel->configure(&_digit_reversed_input, output, scale_config) + : _scale_kernel->configure(output, nullptr, scale_config); } // Allocate tensors @@ -103,7 +124,7 @@ Status NEFFT1D::validate(const ITensorInfo *input, const ITensorInfo *output, co ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() != DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON(input->num_channels() > 2); - ARM_COMPUTE_RETURN_ERROR_ON(std::set<unsigned int>({ 0, 1 }).count(config.axis) == 0); + ARM_COMPUTE_RETURN_ERROR_ON(std::set<unsigned int>({0, 1}).count(config.axis) == 0); // Check if FFT is decomposable const auto supported_radix = NEFFTRadixStageKernel::supported_radix(); @@ -112,7 +133,7 @@ Status NEFFT1D::validate(const ITensorInfo *input, const ITensorInfo *output, co ARM_COMPUTE_RETURN_ERROR_ON(decomposed_vector.empty()); // Checks performed when output is configured - if((output != nullptr) && (output->total_size() != 0)) + if ((output != nullptr) && (output->total_size() != 0)) { // All combinations are supported except real input with real output (i.e., both input channels set to 1) ARM_COMPUTE_RETURN_ERROR_ON(output->num_channels() == 1 && input->num_channels() == 1); @@ -128,17 +149,17 @@ void NEFFT1D::run() { MemoryGroupResourceScope scope_mg(_memory_group); - NEScheduler::get().schedule(&_digit_reverse_kernel, (_axis == 0 ? Window::DimY : Window::DimZ)); + NEScheduler::get().schedule(_digit_reverse_kernel.get(), (_axis == 0 ? Window::DimY : Window::DimZ)); - for(unsigned int i = 0; i < _num_ffts; ++i) + for (unsigned int i = 0; i < _num_ffts; ++i) { - NEScheduler::get().schedule(&_fft_kernels[i], (_axis == 0 ? Window::DimY : Window::DimX)); + NEScheduler::get().schedule(_fft_kernels[i].get(), (_axis == 0 ? Window::DimY : Window::DimX)); } // Run output scaling - if(_run_scale) + if (_run_scale) { - NEScheduler::get().schedule(&_scale_kernel, Window::DimY); + NEScheduler::get().schedule(_scale_kernel.get(), Window::DimY); } } } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEFFT2D.cpp b/src/runtime/NEON/functions/NEFFT2D.cpp index 2fea017781..066909221d 100644 --- a/src/runtime/NEON/functions/NEFFT2D.cpp +++ b/src/runtime/NEON/functions/NEFFT2D.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 ARM Limited. + * Copyright (c) 2019-2021 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -27,10 +27,17 @@ #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/Scheduler.h" +#include "src/common/utils/Log.h" + namespace arm_compute { +NEFFT2D::~NEFFT2D() = default; + NEFFT2D::NEFFT2D(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(memory_manager), _first_pass_func(memory_manager), _second_pass_func(memory_manager), _first_pass_tensor() + : _memory_group(memory_manager), + _first_pass_func(memory_manager), + _second_pass_func(memory_manager), + _first_pass_tensor() { } @@ -38,6 +45,7 @@ void NEFFT2D::configure(const ITensor *input, ITensor *output, const FFT2DInfo & { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_ERROR_THROW_ON(NEFFT2D::validate(input->info(), output->info(), config)); + ARM_COMPUTE_LOG_PARAMS(input, output, config); // Setup first pass FFT1DInfo first_pass_config; @@ -74,7 +82,7 @@ Status NEFFT2D::validate(const ITensorInfo *input, const ITensorInfo *output, co ARM_COMPUTE_RETURN_ON_ERROR(NEFFT1D::validate(&first_pass_tensor, output, second_pass_config)); // Checks performed when output is configured - if((output != nullptr) && (output->total_size() != 0)) + if ((output != nullptr) && (output->total_size() != 0)) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); diff --git a/src/runtime/NEON/functions/NEFFTConvolutionLayer.cpp b/src/runtime/NEON/functions/NEFFTConvolutionLayer.cpp index 08230074c3..94f85e5ffa 100644 --- a/src/runtime/NEON/functions/NEFFTConvolutionLayer.cpp +++ b/src/runtime/NEON/functions/NEFFTConvolutionLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 ARM Limited. + * Copyright (c) 2019-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -25,9 +25,17 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/helpers/fft.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" + +#include "src/common/utils/Log.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/NEON/kernels/NEFFTDigitReverseKernel.h" +#include "src/core/NEON/kernels/NEFFTRadixStageKernel.h" +#include "src/core/NEON/kernels/NEFFTScaleKernel.h" +#include "src/core/NEON/kernels/NEPadLayerKernel.h" +#include "src/core/NEON/kernels/NEReductionOperationKernel.h" +#include "src/core/utils/helpers/fft.h" namespace arm_compute { @@ -39,11 +47,11 @@ int pad_decomposable(int N) int pad = 0; bool is_decomposed = false; - while(!is_decomposed) + while (!is_decomposed) { const auto decomposed_vector = arm_compute::helpers::fft::decompose_stages(N++, supported_radix); is_decomposed = !decomposed_vector.empty(); - if(!is_decomposed) + if (!is_decomposed) { ++pad; } @@ -93,10 +101,19 @@ NEFFTConvolutionLayer::NEFFTConvolutionLayer(std::shared_ptr<IMemoryManager> mem _is_prepared(false) { } - -void NEFFTConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, - const ActivationLayerInfo &act_info) +NEFFTConvolutionLayer::~NEFFTConvolutionLayer() = default; + +void NEFFTConvolutionLayer::configure(ITensor *input, + const ITensor *weights, + const ITensor *biases, + ITensor *output, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info, + bool enable_fast_math) { + ARM_COMPUTE_UNUSED(enable_fast_math); + ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, 
act_info, enable_fast_math); + _original_weights = weights; _original_bias = biases; @@ -104,21 +121,24 @@ void NEFFTConvolutionLayer::configure(ITensor *input, const ITensor *weights, co _has_bias = biases != nullptr; // Get indices for the width and height - const size_t idx_width = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH); - const size_t idx_height = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT); + const size_t idx_width = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH); + const size_t idx_height = + get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT); // Input shape, kernel size and output tile - const Size2D input_dims = Size2D(input->info()->tensor_shape()[idx_width], input->info()->tensor_shape()[idx_height]); - const Size2D kernel_size = Size2D(weights->info()->tensor_shape()[idx_width], weights->info()->tensor_shape()[idx_height]); - const Size2D pad_valid = Size2D(pad_decomposable(input_dims.x() + kernel_size.x() - 1), - pad_decomposable(input_dims.y() + kernel_size.y() - 1)); + const Size2D input_dims = + Size2D(input->info()->tensor_shape()[idx_width], input->info()->tensor_shape()[idx_height]); + const Size2D kernel_size = + Size2D(weights->info()->tensor_shape()[idx_width], weights->info()->tensor_shape()[idx_height]); + const Size2D pad_valid = Size2D(pad_decomposable(input_dims.x() + kernel_size.x() - 1), + pad_decomposable(input_dims.y() + kernel_size.y() - 1)); // Tensors to use ITensor *input_to_use = input; const ITensor *weights_to_use = weights; ITensor *output_to_use = _has_bias ? &_bias_output : output; // Permute bias - if(biases != nullptr) + if (biases != nullptr) { _permute_bias_func.configure(biases, &_permuted_bias, PermutationVector(1U, 2U, 0U)); _permuted_bias.info()->set_data_layout(DataLayout::NCHW); @@ -126,7 +146,7 @@ void NEFFTConvolutionLayer::configure(ITensor *input, const ITensor *weights, co // Permute input if needed _needs_permute = input->info()->data_layout() == DataLayout::NHWC; - if(_needs_permute) + if (_needs_permute) { _memory_group.manage(&_permuted_input); // Configure the function to transform the input tensor from NHWC -> NCHW @@ -147,18 +167,18 @@ void NEFFTConvolutionLayer::configure(ITensor *input, const ITensor *weights, co _flip_weights_func.configure(weights_to_use, &_flipped_weights, &_flip_axis); // Pad weights - const PaddingList padding_w = { { 0, input_dims.x() + pad_valid.x() - 1 }, { 0, input_dims.y() + pad_valid.y() - 1 } }; + const PaddingList padding_w = {{0, input_dims.x() + pad_valid.x() - 1}, {0, input_dims.y() + pad_valid.y() - 1}}; _pad_weights_func.configure(&_flipped_weights, &_padded_weights, padding_w); // Transform weights - _transform_weights_func = support::cpp14::make_unique<NEFFT2D>(); + _transform_weights_func = std::make_unique<NEFFT2D>(); _transform_weights_func->configure(&_padded_weights, &_transformed_weights, FFT2DInfo()); // Pad input - const PaddingList padding_in = { { 0, kernel_size.x() + pad_valid.x() - 1 }, { 0, kernel_size.y() + pad_valid.y() - 1 } }; + const PaddingList padding_in = {{0, kernel_size.x() + pad_valid.x() - 1}, {0, kernel_size.y() + pad_valid.y() - 1}}; _memory_group.manage(&_padded_input); _pad_input_func.configure(input_to_use, &_padded_input, padding_in); - if(_needs_permute) + if (_needs_permute) { _permuted_input.allocator()->allocate(); } @@ -182,7 +202,8 @@ void 
NEFFTConvolutionLayer::configure(ITensor *input, const ITensor *weights, co _memory_group.manage(&_itransformed_output); FFT2DInfo itranform_info; itranform_info.direction = FFTDirection::Inverse; - _itransformed_output.allocator()->init(_output_reduced.info()->clone()->set_is_resizable(true).set_num_channels(1).reset_padding()); + _itransformed_output.allocator()->init( + _output_reduced.info()->clone()->set_is_resizable(true).set_num_channels(1).reset_padding()); _itransform_output_func.configure(&_output_reduced, &_itransformed_output, itranform_info); _output_reduced.allocator()->allocate(); @@ -194,26 +215,29 @@ void NEFFTConvolutionLayer::configure(ITensor *input, const ITensor *weights, co // Extract correct region const int start_left = kernel_size.x() - conv_info.pad_left() - 1; const int start_top = kernel_size.y() - conv_info.pad_top() - 1; - const int end_right = _reshaped_output.info()->tensor_shape().x() - (kernel_size.x() - conv_info.pad_right() - 1) - pad_valid.x(); - const int end_botton = _reshaped_output.info()->tensor_shape().y() - (kernel_size.y() - conv_info.pad_bottom() - 1) - pad_valid.y(); - if(_has_bias) + const int end_right = + _reshaped_output.info()->tensor_shape().x() - (kernel_size.x() - conv_info.pad_right() - 1) - pad_valid.x(); + const int end_botton = + _reshaped_output.info()->tensor_shape().y() - (kernel_size.y() - conv_info.pad_bottom() - 1) - pad_valid.y(); + if (_has_bias) { _memory_group.manage(&_bias_output); } - else if(_needs_permute) + else if (_needs_permute) { output_to_use = &_permuted_output; _memory_group.manage(&_permuted_output); } - _extract_output_func.configure(&_reshaped_output, output_to_use, Coordinates(start_left, start_top), Coordinates(end_right, end_botton)); + _extract_output_func.configure(&_reshaped_output, output_to_use, Coordinates(start_left, start_top), + Coordinates(end_right, end_botton)); _reshaped_output.allocator()->allocate(); _itransformed_output.allocator()->allocate(); // Add bias - if(biases != nullptr) + if (biases != nullptr) { output_to_use = output; - if(_needs_permute) + if (_needs_permute) { output_to_use = &_permuted_output; _memory_group.manage(&_permuted_output); @@ -224,7 +248,7 @@ void NEFFTConvolutionLayer::configure(ITensor *input, const ITensor *weights, co } // Permute output - if(_needs_permute) + if (_needs_permute) { // Configure the function to transform the convoluted output to ACL's native ordering format NCHW _permuted_output.info()->set_data_layout(DataLayout::NCHW); @@ -236,7 +260,7 @@ void NEFFTConvolutionLayer::configure(ITensor *input, const ITensor *weights, co // Configure Activation Layer _is_activationlayer_enabled = act_info.enabled(); - if(_is_activationlayer_enabled) + if (_is_activationlayer_enabled) { _activation_layer_func.configure(output, nullptr, act_info); } @@ -249,9 +273,16 @@ void NEFFTConvolutionLayer::configure(ITensor *input, const ITensor *weights, co axis_data[1] = 1; } -Status NEFFTConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - const ActivationLayerInfo &act_info) +Status NEFFTConvolutionLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info, + bool enable_fast_math) { + ARM_COMPUTE_UNUSED(enable_fast_math); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32); 
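
Alongside the quoted hunks, NEFFT1D, NEFFT2D and NEFFTConvolutionLayer all gain an out-of-line `~NEFFT1D() = default;` (and equivalents) in the .cpp file. That placement is forced by the other half of the change: kernels are now held as std::unique_ptr to types that the headers only forward-declare, and destroying a unique_ptr requires the pointee to be a complete type. A single-file sketch of the idiom follows; FunctionSketch and KernelSketch are hypothetical stand-ins, not ACL classes.

#include <iostream>
#include <memory>

class KernelSketch; // forward declaration only, as in the new headers

class FunctionSketch
{
public:
    FunctionSketch();
    ~FunctionSketch(); // declared here, defined below where KernelSketch is complete
    void configure(int radix);
    void run();

private:
    std::unique_ptr<KernelSketch> _kernel;
};

// In the real code everything from here down lives in the .cpp, which
// includes the kernel header and therefore sees the complete type.
class KernelSketch
{
public:
    explicit KernelSketch(int radix) : _radix(radix) {}
    void run() const { std::cout << "radix-" << _radix << " stage\n"; }

private:
    int _radix;
};

FunctionSketch::FunctionSketch() = default;
FunctionSketch::~FunctionSketch() = default; // legal only here: unique_ptr needs the complete type

void FunctionSketch::configure(int radix)
{
    _kernel = std::make_unique<KernelSketch>(radix);
}

void FunctionSketch::run()
{
    _kernel->run(); // the library instead hands _kernel.get() to NEScheduler
}

int main()
{
    FunctionSketch f;
    f.configure(4);
    f.run();
    return 0;
}

Moving the destructor definition (and the kernel includes) out of the public headers is what lets these diffs replace "arm_compute/core/NEON/kernels/..." includes with forward declarations and "src/core/NEON/kernels/..." includes in the .cpp files.
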
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights); @@ -266,11 +297,13 @@ Status NEFFTConvolutionLayer::validate(const ITensorInfo *input, const ITensorIn const auto strides = conv_info.stride(); ARM_COMPUTE_RETURN_ERROR_ON(strides.first != strides.second && strides.first != 1); ARM_COMPUTE_RETURN_ERROR_ON(kernel_size.x() != kernel_size.y()); - ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_left() != (kernel_size.x() / 2) || conv_info.pad_right() != (kernel_size.x() / 2)); - ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_top() != (kernel_size.y() / 2) || conv_info.pad_bottom() != (kernel_size.y() / 2)); + ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_left() != (kernel_size.x() / 2) || + conv_info.pad_right() != (kernel_size.x() / 2)); + ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_top() != (kernel_size.y() / 2) || + conv_info.pad_bottom() != (kernel_size.y() / 2)); // Validate biases - if(biases != nullptr) + if (biases != nullptr) { const size_t idx_channels = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases); @@ -278,13 +311,14 @@ Status NEFFTConvolutionLayer::validate(const ITensorInfo *input, const ITensorIn } // Checks performed when output is configured - if((output != nullptr) && (output->total_size() != 0)) + if ((output != nullptr) && (output->total_size() != 0)) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - ARM_COMPUTE_RETURN_ERROR_ON((input->tensor_shape()[idx_height] != output->tensor_shape()[idx_height]) || (input->tensor_shape()[idx_width] != output->tensor_shape()[idx_width])); + ARM_COMPUTE_RETURN_ERROR_ON((input->tensor_shape()[idx_height] != output->tensor_shape()[idx_height]) || + (input->tensor_shape()[idx_width] != output->tensor_shape()[idx_width])); // Validate Activation Layer - if(act_info.enabled()) + if (act_info.enabled()) { ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(output, nullptr, act_info)); } @@ -300,7 +334,7 @@ void NEFFTConvolutionLayer::run() MemoryGroupResourceScope scope_mg(_memory_group); // Transform input - if(_needs_permute) + if (_needs_permute) { _permute_input_func.run(); } @@ -318,17 +352,17 @@ void NEFFTConvolutionLayer::run() _extract_output_func.run(); // Add bias - if(_has_bias) + if (_has_bias) { _bias_add_func.run(); } - if(_needs_permute) + if (_needs_permute) { _permute_output_func.run(); } // Run activation layer - if(_is_activationlayer_enabled) + if (_is_activationlayer_enabled) { _activation_layer_func.run(); } @@ -336,10 +370,10 @@ void NEFFTConvolutionLayer::run() void NEFFTConvolutionLayer::prepare() { - if(!_is_prepared) + if (!_is_prepared) { // Permute bias to NCHW - if(_original_bias != nullptr) + if (_original_bias != nullptr) { _permuted_bias.allocator()->allocate(); _permute_bias_func.run(); @@ -349,7 +383,7 @@ void NEFFTConvolutionLayer::prepare() const ITensor *cur_weights = _original_weights; // Permute weights - if(_needs_permute) + if (_needs_permute) { ARM_COMPUTE_ERROR_ON(!cur_weights->is_used()); diff --git a/src/runtime/NEON/functions/NEFastCorners.cpp b/src/runtime/NEON/functions/NEFastCorners.cpp deleted file mode 100644 index af3530151c..0000000000 --- a/src/runtime/NEON/functions/NEFastCorners.cpp +++ /dev/null @@ -1,106 +0,0 @@ -/* - * Copyright (c) 2016-2019 ARM Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/NEON/functions/NEFastCorners.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h" -#include "arm_compute/core/PixelValue.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/runtime/Array.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" -#include "arm_compute/runtime/TensorAllocator.h" - -using namespace arm_compute; - -NEFastCorners::NEFastCorners(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), - _fast_corners_kernel(), - _border_handler(), - _nonmax_kernel(), - _fill_kernel(), - _output(), - _suppressed(), - _non_max(false) -{ -} - -void NEFastCorners::configure(IImage *input, float threshold, bool nonmax_suppression, KeyPointArray *corners, - BorderMode border_mode, uint8_t constant_border_value) -{ - ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input); - ARM_COMPUTE_ERROR_ON(BorderMode::UNDEFINED != border_mode); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON(nullptr == corners); - ARM_COMPUTE_ERROR_ON(threshold < 1 && threshold > 255); - - _non_max = nonmax_suppression; - - TensorInfo tensor_info(input->info()->tensor_shape(), Format::U8); - _output.allocator()->init(tensor_info); - _memory_group.manage(&_output); - - // If border is UNDEFINED _fast_corners_kernel will operate in xwindow (3, - // width - 3) and ywindow (3, height -3) so the output image will leave the - // pixels on the borders unchanged. This is reflected in the valid region - // of the output. The non maxima suppression is only run on the valid - // pixels. 
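
The NEFastCorners file being deleted in this hunk is also a compact illustration of the memory-group discipline used throughout these functions: configure() hands each intermediate tensor to the group with manage() as soon as it is declared, calls allocate() only after every kernel consuming it has been configured, and run() then holds the whole group alive through an RAII scope (MemoryGroupResourceScope in the real code). A toy sketch of that lifetime contract, assuming hypothetical ToyTensor/ToyMemoryGroup/ToyScope stand-ins rather than ACL types:

#include <cassert>
#include <cstddef>
#include <iostream>
#include <string>
#include <vector>

// A "tensor" here is just a named buffer; the group records which
// intermediates it owns so they can be acquired/released as a unit.
struct ToyTensor
{
    std::string name;
    std::vector<float> data;
    void allocate(std::size_t n) { data.resize(n); }
};

class ToyMemoryGroup
{
public:
    void manage(ToyTensor *t) { _managed.push_back(t); }
    void acquire() const
    {
        for (const auto *t : _managed)
            std::cout << "acquire " << t->name << "\n";
    }
    void release() const
    {
        for (const auto *t : _managed)
            std::cout << "release " << t->name << "\n";
    }

private:
    std::vector<ToyTensor *> _managed;
};

// RAII scope mirroring the role of MemoryGroupResourceScope: backing memory
// is held only while run() executes, so intermediates of different
// functions can share the same pool.
class ToyScope
{
public:
    explicit ToyScope(ToyMemoryGroup &g) : _g(g) { _g.acquire(); }
    ~ToyScope() { _g.release(); }

private:
    ToyMemoryGroup &_g;
};

int main()
{
    ToyMemoryGroup group;
    ToyTensor output{"output", {}};

    group.manage(&output); // configure(): register the intermediate first...
    output.allocate(16);   // ...allocate only once all consumers are configured

    {
        ToyScope scope(group); // run(): hold the group for the whole execution
        assert(output.data.size() == 16);
    }
    return 0;
}
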
- _fast_corners_kernel.configure(input, &_output, threshold, nonmax_suppression, BorderMode::UNDEFINED == border_mode); - _border_handler.configure(input, _fast_corners_kernel.border_size(), border_mode, constant_border_value); - - if(!_non_max) - { - _fill_kernel.configure(&_output, 1 /* we keep all texels >0 */, corners); - } - else - { - _suppressed.allocator()->init(tensor_info); - _memory_group.manage(&_suppressed); - _nonmax_kernel.configure(&_output, &_suppressed, BorderMode::UNDEFINED == border_mode); - _fill_kernel.configure(&_suppressed, 1 /* we keep all texels >0 */, corners); - - // Allocate intermediate tensors - _suppressed.allocator()->allocate(); - } - - // Allocate intermediate tensors - _output.allocator()->allocate(); -} - -void NEFastCorners::run() -{ - NEScheduler::get().schedule(&_border_handler, Window::DimZ); - - MemoryGroupResourceScope scope_mg(_memory_group); - - NEScheduler::get().schedule(&_fast_corners_kernel, Window::DimY); - - if(_non_max) - { - NEScheduler::get().schedule(&_nonmax_kernel, Window::DimY); - } - - NEScheduler::get().schedule(&_fill_kernel, Window::DimY); -} diff --git a/src/runtime/NEON/functions/NEFill.cpp b/src/runtime/NEON/functions/NEFill.cpp index d507f7c88f..bc1d5b7f5c 100644 --- a/src/runtime/NEON/functions/NEFill.cpp +++ b/src/runtime/NEON/functions/NEFill.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 ARM Limited. + * Copyright (c) 2017-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,18 +23,40 @@ */ #include "arm_compute/runtime/NEON/functions/NEFill.h" -#include "arm_compute/core/Window.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" -#include "support/MemorySupport.h" +#include "arm_compute/core/Validate.h" + +#include "src/cpu/operators/CpuFill.h" #include <utility> namespace arm_compute { +struct NEFill::Impl +{ + ITensor *tensor{nullptr}; + std::unique_ptr<cpu::CpuFill> op{nullptr}; +}; + +NEFill::NEFill() : _impl(std::make_unique<Impl>()) +{ +} +NEFill::NEFill(NEFill &&) = default; +NEFill &NEFill::operator=(NEFill &&) = default; +NEFill::~NEFill() = default; + void NEFill::configure(ITensor *tensor, PixelValue constant_value) { - auto k = arm_compute::support::cpp14::make_unique<NEMemsetKernel>(); - k->configure(tensor, constant_value); - _kernel = std::move(k); + ARM_COMPUTE_ERROR_ON_NULLPTR(tensor); + + _impl->tensor = tensor; + _impl->op = std::make_unique<cpu::CpuFill>(); + _impl->op->configure(tensor->info(), constant_value); +} + +void NEFill::run() +{ + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC_DST, _impl->tensor); + _impl->op->run(pack); } } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEFillBorder.cpp b/src/runtime/NEON/functions/NEFillBorder.cpp index 6b7a0faa85..a3ab9c3db4 100644 --- a/src/runtime/NEON/functions/NEFillBorder.cpp +++ b/src/runtime/NEON/functions/NEFillBorder.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 ARM Limited. + * Copyright (c) 2016-2021 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -26,15 +26,27 @@ #include "arm_compute/core/Window.h" #include "arm_compute/runtime/NEON/NEScheduler.h" +#include "src/common/utils/Log.h" +#include "src/core/NEON/kernels/NEFillBorderKernel.h" + namespace arm_compute { -void NEFillBorder::configure(ITensor *input, unsigned int border_width, BorderMode border_mode, const PixelValue &constant_border_value) +NEFillBorder::NEFillBorder() : _border_handler(nullptr) +{ +} + +void NEFillBorder::configure(ITensor *input, + unsigned int border_width, + BorderMode border_mode, + const PixelValue &constant_border_value) { - _border_handler.configure(input, BorderSize(border_width), border_mode, constant_border_value); + ARM_COMPUTE_LOG_PARAMS(input, border_width, border_mode, constant_border_value); + _border_handler = std::make_unique<NEFillBorderKernel>(); + _border_handler->configure(input, BorderSize(border_width), border_mode, constant_border_value); } void NEFillBorder::run() { - NEScheduler::get().schedule(&_border_handler, Window::DimZ); + NEScheduler::get().schedule(_border_handler.get(), Window::DimZ); } -} // namespace arm_compute
\ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEFlattenLayer.cpp b/src/runtime/NEON/functions/NEFlattenLayer.cpp index a28411c4e9..56db2be3fa 100644 --- a/src/runtime/NEON/functions/NEFlattenLayer.cpp +++ b/src/runtime/NEON/functions/NEFlattenLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 ARM Limited. + * Copyright (c) 2017-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,21 +23,57 @@ */ #include "arm_compute/runtime/NEON/functions/NEFlattenLayer.h" -#include "arm_compute/core/NEON/kernels/NEFlattenLayerKernel.h" -#include "arm_compute/core/Size2D.h" -#include "support/MemorySupport.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" + +#include "src/core/helpers/AutoConfiguration.h" +#include "src/cpu/operators/CpuFlatten.h" namespace arm_compute { +struct NEFlattenLayer::Impl +{ + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr<cpu::CpuFlatten> op{nullptr}; +}; + +NEFlattenLayer::NEFlattenLayer() : _impl(std::make_unique<Impl>()) +{ +} +NEFlattenLayer::NEFlattenLayer(NEFlattenLayer &&) = default; +NEFlattenLayer &NEFlattenLayer::operator=(NEFlattenLayer &&) = default; +NEFlattenLayer::~NEFlattenLayer() = default; + void NEFlattenLayer::configure(const ITensor *input, ITensor *output) { - auto k = arm_compute::support::cpp14::make_unique<NEFlattenLayerKernel>(); - k->configure(input, output); - _kernel = std::move(k); + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + _impl->src = input; + _impl->dst = output; + auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape( + misc::shape_calculator::compute_flatten_shape(input->info()))); + + _impl->op = std::make_unique<cpu::CpuFlatten>(); + _impl->op->configure(_impl->src->info(), _impl->dst->info()); } Status NEFlattenLayer::validate(const ITensorInfo *input, const ITensorInfo *output) { - return NEFlattenLayerKernel::validate(input, output); + // Checks performed when output is configured + if (output->total_size() != 0) + { + const TensorInfo tensor_info_output = + input->clone()->set_tensor_shape(misc::shape_calculator::compute_flatten_shape(input)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output); + } + return cpu::CpuFlatten::validate(input, output); +} +void NEFlattenLayer::run() +{ + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC, _impl->src); + pack.add_tensor(TensorType::ACL_DST, _impl->dst); + _impl->op->run(pack); } } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEFloor.cpp b/src/runtime/NEON/functions/NEFloor.cpp index 98b9725329..112c93c478 100644 --- a/src/runtime/NEON/functions/NEFloor.cpp +++ b/src/runtime/NEON/functions/NEFloor.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 ARM Limited. + * Copyright (c) 2017-2021 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -23,20 +23,47 @@ */ #include "arm_compute/runtime/NEON/functions/NEFloor.h" -#include "arm_compute/core/NEON/kernels/NEFloorKernel.h" -#include "support/MemorySupport.h" +#include "arm_compute/core/Validate.h" + +#include "src/cpu/operators/CpuFloor.h" namespace arm_compute { +struct NEFloor::Impl +{ + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr<cpu::CpuFloor> op{nullptr}; +}; + +NEFloor::NEFloor() : _impl(std::make_unique<Impl>()) +{ +} +NEFloor::NEFloor(NEFloor &&) = default; +NEFloor &NEFloor::operator=(NEFloor &&) = default; +NEFloor::~NEFloor() = default; + void NEFloor::configure(const ITensor *input, ITensor *output) { - auto k = arm_compute::support::cpp14::make_unique<NEFloorKernel>(); - k->configure(input, output); - _kernel = std::move(k); + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + + _impl->src = input; + _impl->dst = output; + + _impl->op = std::make_unique<cpu::CpuFloor>(); + _impl->op->configure(_impl->src->info(), _impl->dst->info()); } Status NEFloor::validate(const ITensorInfo *input, const ITensorInfo *output) { - return NEFloorKernel::validate(input, output); + return cpu::CpuFloor::validate(input, output); +} + +void NEFloor::run() +{ + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC, _impl->src); + pack.add_tensor(TensorType::ACL_DST, _impl->dst); + _impl->op->run(pack); } } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp index e275bca2f9..2656d0fa0f 100644 --- a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp +++ b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 ARM Limited. + * Copyright (c) 2017-2023 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -23,466 +23,138 @@ */ #include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/Size2D.h" +#include "arm_compute/core/ITensorPack.h" #include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "arm_compute/core/utils/quantization/AsymmHelpers.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" +#include "arm_compute/runtime/MemoryGroup.h" +#include "arm_compute/runtime/NEON/functions/NEConvertFullyConnectedWeights.h" -#include <algorithm> -#include <cmath> +#include "src/common/utils/Log.h" +#include "src/core/helpers/MemoryHelpers.h" +#include "src/cpu/operators/CpuFullyConnected.h" namespace arm_compute { -using namespace arm_compute::misc::shape_calculator; +using namespace arm_compute::experimental; -namespace +struct NEFullyConnectedLayer::Impl { -// Get min, max bound of a quantized assymetric output tensor, with the effect of fused activation -std::pair<PixelValue, PixelValue> get_quantized_asymmetric_output_min_max(const QuantizationInfo &q_info, const ActivationLayerInfo &act_info, DataType data_type) -{ - PixelValue type_min{}; - PixelValue type_max{}; - std::tie(type_min, type_max) = get_min_max(data_type); - const UniformQuantizationInfo q_unif = q_info.uniform(); - - if(act_info.enabled()) - { - switch(act_info.activation()) - { - case ActivationLayerInfo::ActivationFunction::RELU: - type_min = PixelValue(q_unif.offset); - break; - case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU: - type_min = PixelValue(q_unif.offset); - type_max = PixelValue(act_info.a(), data_type, q_info); - break; - case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU: - type_min = PixelValue(act_info.b(), data_type, q_info); - type_max = PixelValue(act_info.a(), data_type, q_info); - break; - default: - ARM_COMPUTE_ERROR("Activation function not supported."); - break; - } - } - - return std::make_pair(type_min, type_max); -} - -Status get_gemmlowp_output_stage_info(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, const ActivationLayerInfo &act, - GEMMLowpOutputStageInfo &gemmlowp_output_stage_info) -{ - const auto data_type = input->data_type(); - const QuantizationInfo oq_info = output->quantization_info(); - const UniformQuantizationInfo iq_unif = input->quantization_info().uniform(); - const UniformQuantizationInfo wq_unif = weights->quantization_info().uniform(); - const UniformQuantizationInfo oq_unif = oq_info.uniform(); - - float multiplier = (iq_unif.scale * wq_unif.scale) / oq_unif.scale; - int32_t output_multiplier; - int32_t output_shift; - - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift)); - - PixelValue type_min{}; - PixelValue type_max{}; - std::tie(type_min, type_max) = get_quantized_asymmetric_output_min_max(oq_info, act, data_type); - - gemmlowp_output_stage_info.gemmlowp_multiplier = output_multiplier; - gemmlowp_output_stage_info.gemmlowp_shift = output_shift; - gemmlowp_output_stage_info.gemmlowp_offset = oq_unif.offset; - gemmlowp_output_stage_info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT; - gemmlowp_output_stage_info.gemmlowp_min_bound = type_min.get<int32_t>(); - gemmlowp_output_stage_info.gemmlowp_max_bound = type_max.get<int32_t>(); - - return Status{}; -} - -Status validate_mm(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo 
*output, const ActivationLayerInfo &act) -{ - if(is_data_type_quantized_asymmetric(input->data_type())) - { - // Since we need negative offsets for computing convolution, we need to change QuantizationInfo() - // Extract and negate input and weights offset - const QuantizationInfo input_quantization_info(input->quantization_info().uniform().scale, -input->quantization_info().uniform().offset); - const QuantizationInfo weights_quantization_info(weights->quantization_info().uniform().scale, -weights->quantization_info().uniform().offset); - - GEMMLowpOutputStageInfo gemmlowp_output_stage_info; - ARM_COMPUTE_RETURN_ON_ERROR(get_gemmlowp_output_stage_info(input, weights, output, act, gemmlowp_output_stage_info)); - - GEMMInfo gemm_info; - gemm_info.set_gemmlowp_output_stage(gemmlowp_output_stage_info); - - // Validate gemmlowp function - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyCore::validate(&input->clone()->set_quantization_info(input_quantization_info), - &weights->clone()->set_quantization_info(weights_quantization_info), - biases, - output, - gemm_info)); - } - else - { - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMM::validate(input, weights, biases, output, 1.f, 1.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run */))); - } - - return Status{}; -} -} // namespace - -void NEFullyConnectedLayerReshapeWeights::configure(const ITensor *input, ITensor *output) -{ - auto k = arm_compute::support::cpp14::make_unique<NETransposeKernel>(); - k->configure(input, output); - _kernel = std::move(k); -} - -Status NEFullyConnectedLayerReshapeWeights::validate(const ITensorInfo *input, const ITensorInfo *output) -{ - return NETransposeKernel::validate(input, output); -} - -NEFullyConnectedLayer::NEFullyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager *weights_manager) - : _memory_group(std::move(memory_manager)), _weights_manager(weights_manager), _flatten_kernel(), _convert_weights(), _convert_weights_managed(), _reshape_weights_function(), - _reshape_weights_managed_function(), _mm_gemm(nullptr, weights_manager), _mm_gemmlowp(nullptr, weights_manager), _flatten_output(), _converted_weights_output(), _reshape_weights_output(), - _original_weights(nullptr), _are_weights_converted(true), _are_weights_reshaped(false), _is_fc_after_conv(false), _is_quantized_asymmetric(false), _is_prepared(false) -{ -} - -void NEFullyConnectedLayer::configure_mm(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const ActivationLayerInfo &act) -{ - if(_is_quantized_asymmetric) - { - // Since we need negative offsets for computing convolution, we need to change QuantizationInfo() - // Extract and negate input and weights offset - const QuantizationInfo input_quantization_info = input->info()->quantization_info(); - const QuantizationInfo weights_quantization_info = weights->info()->quantization_info(); - - input->info()->set_quantization_info(QuantizationInfo(input_quantization_info.uniform().scale, -input_quantization_info.uniform().offset)); - weights->info()->set_quantization_info(QuantizationInfo(weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset)); - - // Configure gemmlowp function and output stage for asymmetric quantized types - GEMMLowpOutputStageInfo gemmlowp_output_stage_info; - const Status status = get_gemmlowp_output_stage_info(input->info(), weights->info(), output->info(), act, gemmlowp_output_stage_info); - ARM_COMPUTE_ERROR_ON(status.error_code() != ErrorCode::OK); - - GEMMInfo 
gemm_info; - gemm_info.set_gemmlowp_output_stage(gemmlowp_output_stage_info); - gemm_info.set_activation_info(act); - _mm_gemmlowp.configure(input, weights, biases, output, gemm_info); - - // Revert back QuantizatioInfo as input and weights could be used in other fully connected layers - input->info()->set_quantization_info(input_quantization_info); - weights->info()->set_quantization_info(weights_quantization_info); - } - else - { - // Configure matrix multiply kernel - GEMMInfo gemm_info(false, false, true /* Reshape weights only for the first run */); - gemm_info.set_activation_info(act); - _mm_gemm.configure(input, weights, biases, output, 1.f, 1.0f, gemm_info); - } -} - -void NEFullyConnectedLayer::configure_conv_fc(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const ActivationLayerInfo &act) -{ - ARM_COMPUTE_ERROR_ON((weights->info()->dimension(1) != (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2)))); + MemoryGroup memory_group{}; + IWeightsManager *weights_manager{nullptr}; - // If the fully connected layer is called after a convolution layer, the input tensor must be linearized + std::unique_ptr<cpu::CpuFullyConnected> op{nullptr}; - // Initialize output tensor for flatten - TensorShape shape_flatten = compute_flatten_shape(input->info()); - _flatten_output.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_flatten)); + const ITensor *original_weights{nullptr}; - // Configure flatten kernel - _memory_group.manage(&_flatten_output); - _flatten_kernel.configure(input, &_flatten_output); + ITensorPack run_pack{}; + WorkspaceData<Tensor> workspace{}; + experimental::MemoryRequirements aux_mem_req{}; - // Configure matrix multiply kernel - configure_mm(&_flatten_output, weights, biases, output, act); + bool is_prepared{false}; + bool dynamic_weights{false}; +}; - // Allocate the output tensor for flatten once all the configure methods have been called - _flatten_output.allocator()->allocate(); -} +NEFullyConnectedLayer::~NEFullyConnectedLayer() = default; -void NEFullyConnectedLayer::configure_fc_fc(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const ActivationLayerInfo &act) +NEFullyConnectedLayer::NEFullyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager, + IWeightsManager *weights_manager) + : _impl(std::make_unique<Impl>()) { - ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != weights->info()->dimension(1)); - - // Configure matrix multiply kernel - configure_mm(input, weights, biases, output, act); + _impl->memory_group = MemoryGroup(std::move(memory_manager)); + _impl->weights_manager = weights_manager; } -void NEFullyConnectedLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, - FullyConnectedLayerInfo fc_info) +void NEFullyConnectedLayer::configure(const ITensor *input, + const ITensor *weights, + const ITensor *biases, + ITensor *output, + FullyConnectedLayerInfo fc_info, + const WeightsInfo &weights_info) { // Perform validate step ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); - ARM_COMPUTE_ERROR_THROW_ON(NEFullyConnectedLayer::validate(input->info(), - weights->info(), + ARM_COMPUTE_ERROR_THROW_ON(NEFullyConnectedLayer::validate(input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, - output->info(), - fc_info)); - - _are_weights_converted = true; - _are_weights_reshaped = fc_info.transpose_weights ? 
fc_info.are_weights_reshaped : true; - _is_fc_after_conv = true; - _is_quantized_asymmetric = is_data_type_quantized_asymmetric(input->info()->data_type()); - _original_weights = weights; - - if(_weights_manager) - { - _weights_manager->manage(weights); - } - - // With the Fully Connected layer we can have 4 different cases: - // 1) Convolution layer -> Fully Connected layer without batches - // 2) Fully Connected layer -> Fully Connected layer without batches - // 3) Convolution layer -> Fully Connected layer with batches - // 4) Fully Connected layer -> Fully Connected layer with batches + output->info(), fc_info, weights_info)); + ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, fc_info); - const ITensor *weights_to_use = weights; + _impl->op = std::make_unique<cpu::CpuFullyConnected>(); + _impl->original_weights = weights; + _impl->is_prepared = false; - // Check if we have a fully connected layer with batches - const bool is_batched_fc_layer = output->info()->dimension(1) > 1; - if(is_batched_fc_layer) - { - _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && (std::equal(input->info()->tensor_shape().cbegin() + 3, - input->info()->tensor_shape().cend(), - output->info()->tensor_shape().cbegin() + 1)); - } - else - { - _is_fc_after_conv = input->info()->num_dimensions() > 1; - } - - // Reshape weights if needed - if(!_are_weights_reshaped) - { - if(_weights_manager && _weights_manager->are_weights_managed(weights)) - { - _reshape_weights_managed_function.configure(weights); - weights_to_use = _weights_manager->acquire(weights, &_reshape_weights_managed_function); - } - else - { - // Reshape the weights - _reshape_weights_function.configure(weights, &_reshape_weights_output); - weights_to_use = &_reshape_weights_output; - } - } + _impl->op->configure(input->info(), weights->info(), (biases != nullptr) ? 
biases->info() : nullptr, output->info(), + fc_info, weights_info); - // Convert weights if needed - if(_is_fc_after_conv && (input->info()->data_layout() != fc_info.weights_trained_layout)) + if (_impl->weights_manager != nullptr) { - if(_weights_manager && _weights_manager->are_weights_managed(weights_to_use)) - { - _convert_weights_managed.configure(weights_to_use, - input->info()->tensor_shape(), - fc_info.weights_trained_layout); - weights_to_use = _weights_manager->acquire(weights, &_convert_weights_managed); - } - else - { - // Convert weights - _convert_weights.configure(weights_to_use, - &_converted_weights_output, - input->info()->tensor_shape(), - fc_info.weights_trained_layout); - - weights_to_use = &_converted_weights_output; - } - _are_weights_converted = false; + _impl->weights_manager->manage(_impl->original_weights); } - if(_is_fc_after_conv) - { - // Fully Connected layer after a Convolution Layer without batches - configure_conv_fc(input, weights_to_use, biases, output, fc_info.activation_info); - } - else - { - // Fully Connected layer after a Fully Connected Layer without batches - configure_fc_fc(input, weights_to_use, biases, output, fc_info.activation_info); - } + _impl->aux_mem_req = _impl->op->workspace(); + _impl->run_pack = {{ACL_SRC_0, input}, {ACL_SRC_1, weights}, {ACL_SRC_2, biases}, {ACL_DST, output}}; + _impl->workspace = + manage_workspace<Tensor>(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->run_pack); - _are_weights_reshaped = _are_weights_reshaped || fc_info.retain_internal_weights; + _impl->dynamic_weights = !weights->info()->are_values_constant() && fc_info.transpose_weights && + !fc_info.are_weights_reshaped && !fc_info.retain_internal_weights; } -Status NEFullyConnectedLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, - FullyConnectedLayerInfo fc_info) +Status NEFullyConnectedLayer::has_opt_impl(arm_compute::WeightFormat &expected_weight_format, + const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const FullyConnectedLayerInfo &fc_info, + const WeightsInfo &weights_info) { - ARM_COMPUTE_UNUSED(fc_info.retain_internal_weights); - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output); - ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 2); - ARM_COMPUTE_RETURN_ERROR_ON(biases != nullptr && biases->num_dimensions() > 1); - - bool weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true; - bool is_fc_after_conv = true; - - const ITensorInfo &flatten_input = TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_flatten_shape(input))); - const ITensorInfo &reshaped_weights = TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_transposed_shape(*weights))); - const ITensorInfo &converted_weights = weights_reshaped ? 
TensorInfo(weights->clone()->set_is_resizable(true).reset_padding()) : TensorInfo(*reshaped_weights.clone()); - - // With the Fully Connected layer we can have 4 different cases: - // 1) Convolution layer -> Fully Connected layer without batches - // 2) Fully Connected layer -> Fully Connected layer without batches - // 3) Convolution layer -> Fully Connected layer with batches - // 4) Fully Connected layer -> Fully Connected layer with batches - - const ITensorInfo *input_to_use = input; - const ITensorInfo *weights_to_use = weights; - - // Check if we have a fully connected layer with batches - const bool is_batched_fc_layer = output->dimension(1) > 1; - - if(is_batched_fc_layer) - { - is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && (std::equal(input->tensor_shape().cbegin() + 3, - input->tensor_shape().cend(), - output->tensor_shape().cbegin() + 1)); - } - else - { - is_fc_after_conv = input->num_dimensions() > 1; - } - - if(!weights_reshaped) - { - // Validate reshape weights kernel - ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayerReshapeWeights::validate(weights, &reshaped_weights)); - weights_to_use = &reshaped_weights; - } - - if(is_fc_after_conv && (input->data_layout() != fc_info.weights_trained_layout)) - { - // Validate convert weights kernel - ARM_COMPUTE_RETURN_ON_ERROR(NEConvertFullyConnectedWeights::validate(weights_to_use, - &converted_weights, - input->tensor_shape(), - fc_info.weights_trained_layout)); - weights_to_use = &converted_weights; - } - - if(is_fc_after_conv) - { - // Fully Connected layer after a Convolution Layer without batches - ARM_COMPUTE_RETURN_ERROR_ON((weights_to_use->dimension(1) != (input->dimension(0) * input->dimension(1) * input->dimension(2)))); - - // Validate flatten kernel - ARM_COMPUTE_RETURN_ON_ERROR(NEFlattenLayerKernel::validate(input, &flatten_input)); - input_to_use = &flatten_input; - } - else - { - // Fully Connected layer after a Fully Connected Layer without batches - ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != weights_to_use->dimension(1)); - } - // Validate matrix multiply kernel - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(input_to_use, weights_to_use, biases, output, fc_info.activation_info)); + return cpu::CpuFullyConnected::has_opt_impl(expected_weight_format, input, weights, biases, output, fc_info, + weights_info); +} - return Status{}; +Status NEFullyConnectedLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + FullyConnectedLayerInfo fc_info, + const WeightsInfo &weights_info) +{ + return cpu::CpuFullyConnected::validate(input, weights, biases, output, fc_info, weights_info); } void NEFullyConnectedLayer::run() { - prepare(); - - MemoryGroupResourceScope scope_mg(_memory_group); - - // Linearize input if it comes from a convolutional layer - if(_is_fc_after_conv) + if (!_impl->dynamic_weights) { - NEScheduler::get().schedule(&_flatten_kernel, Window::DimY); + prepare(); } - // Run matrix multiply - if(_is_quantized_asymmetric) - { - _mm_gemmlowp.run(); - } - else - { - _mm_gemm.run(); - } + MemoryGroupResourceScope scope_mg(_impl->memory_group); + _impl->op->run(_impl->run_pack); } void NEFullyConnectedLayer::prepare() { - if(!_is_prepared) + if (!_impl->is_prepared) { - if(!_weights_manager) - { - ARM_COMPUTE_ERROR_ON(!_original_weights->is_used()); - } - - auto release_unused = [](Tensor * w) - { - if(!w->is_used()) - { - w->allocator()->free(); - } - }; + _impl->op->prepare(_impl->run_pack); - // Pointer to current weights - 
const ITensor *cur_weights = _original_weights; + // Release temporary tensors that are only used in prepare stage + release_temporaries<Tensor>(_impl->aux_mem_req, _impl->workspace); + _impl->is_prepared = true; - // Reshape of the weights (happens only once) - if(!_are_weights_reshaped) + // Handle weights managed infrastructure + if (_impl->weights_manager != nullptr && _impl->weights_manager->are_weights_managed(_impl->original_weights)) { - if(_weights_manager && _weights_manager->are_weights_managed(_original_weights)) + // Ensure that b gets marked as unused (memory released) only after the last function which uses b also finishes its prepare + // This is for cases where multiple functions share the same b (weights) + // Therefore when a function marks original b as unused, we pre-mark it in weights manager, and mark it back to used so that it doesn't get released before its last reference + const ITensor *original_b = _impl->original_weights; + if (!original_b->is_used()) { - cur_weights = _weights_manager->run(cur_weights, &_reshape_weights_managed_function); + _impl->weights_manager->pre_mark_as_unused(original_b); } - else - { - // Reshape of the weights (happens only once) - if(!_are_weights_reshaped) - { - // Run reshape weights kernel and mark weights as unused - _reshape_weights_output.allocator()->allocate(); - _reshape_weights_function.run(); - } - cur_weights->mark_as_unused(); - cur_weights = &_reshape_weights_output; - } - _are_weights_reshaped = true; - } - - // Convert weights if needed (happens only once) - if(!_are_weights_converted) - { - if(_weights_manager && _weights_manager->are_weights_managed(cur_weights)) - { - _weights_manager->run(cur_weights, &_convert_weights_managed); - } - else - { - _converted_weights_output.allocator()->allocate(); - _convert_weights.run(); - cur_weights->mark_as_unused(); - } - - _are_weights_converted = true; - } - - // Release reshaped weights if unused - release_unused(&_reshape_weights_output); - - // Prepare GEMM prepare and release unused weights - if(!_is_quantized_asymmetric) - { - _mm_gemm.prepare(); + _impl->original_weights->mark_as_used(); + _impl->weights_manager->release(_impl->original_weights); } - - // Release converted weights if unused - release_unused(&_reshape_weights_output); - release_unused(&_converted_weights_output); - - _is_prepared = true; } } } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEFuseBatchNormalization.cpp b/src/runtime/NEON/functions/NEFuseBatchNormalization.cpp index 68dc159f75..f5b8b57dac 100644 --- a/src/runtime/NEON/functions/NEFuseBatchNormalization.cpp +++ b/src/runtime/NEON/functions/NEFuseBatchNormalization.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 ARM Limited. + * Copyright (c) 2018-2021 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -29,31 +29,53 @@ #include "arm_compute/core/Types.h" #include "arm_compute/runtime/NEON/NEScheduler.h" +#include "src/common/utils/Log.h" +#include "src/core/NEON/kernels/NEFuseBatchNormalizationKernel.h" + namespace arm_compute { -NEFuseBatchNormalization::NEFuseBatchNormalization() - : _fuse_bn_kernel() +NEFuseBatchNormalization::~NEFuseBatchNormalization() = default; + +NEFuseBatchNormalization::NEFuseBatchNormalization() : _fuse_bn_kernel() { } -void NEFuseBatchNormalization::configure(const ITensor *input_weights, const ITensor *bn_mean, const ITensor *bn_var, - ITensor *fused_weights, ITensor *fused_bias, - const ITensor *input_bias, const ITensor *bn_beta, const ITensor *bn_gamma, - float epsilon, FuseBatchNormalizationType fbn_type) +void NEFuseBatchNormalization::configure(const ITensor *input_weights, + const ITensor *bn_mean, + const ITensor *bn_var, + ITensor *fused_weights, + ITensor *fused_bias, + const ITensor *input_bias, + const ITensor *bn_beta, + const ITensor *bn_gamma, + float epsilon, + FuseBatchNormalizationType fbn_type) { - _fuse_bn_kernel.configure(input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, epsilon, fbn_type); + ARM_COMPUTE_LOG_PARAMS(input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, + epsilon, fbn_type); + + _fuse_bn_kernel = std::make_unique<NEFuseBatchNormalizationKernel>(); + _fuse_bn_kernel->configure(input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, + epsilon, fbn_type); } -Status NEFuseBatchNormalization::validate(const ITensorInfo *input_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var, - const ITensorInfo *fused_weights, const ITensorInfo *fused_bias, - const ITensorInfo *input_bias, const ITensorInfo *bn_beta, const ITensorInfo *bn_gamma, - float epsilon, FuseBatchNormalizationType fbn_type) +Status NEFuseBatchNormalization::validate(const ITensorInfo *input_weights, + const ITensorInfo *bn_mean, + const ITensorInfo *bn_var, + const ITensorInfo *fused_weights, + const ITensorInfo *fused_bias, + const ITensorInfo *input_bias, + const ITensorInfo *bn_beta, + const ITensorInfo *bn_gamma, + float epsilon, + FuseBatchNormalizationType fbn_type) { - return NEFuseBatchNormalizationKernel::validate(input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, epsilon, fbn_type); + return NEFuseBatchNormalizationKernel::validate(input_weights, bn_mean, bn_var, fused_weights, fused_bias, + input_bias, bn_beta, bn_gamma, epsilon, fbn_type); } void NEFuseBatchNormalization::run() { - NEScheduler::get().schedule(&_fuse_bn_kernel, Window::DimY); + NEScheduler::get().schedule(_fuse_bn_kernel.get(), Window::DimY); } } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEGEMM.cpp b/src/runtime/NEON/functions/NEGEMM.cpp index 2bd459a389..934a8250cc 100644 --- a/src/runtime/NEON/functions/NEGEMM.cpp +++ b/src/runtime/NEON/functions/NEGEMM.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 ARM Limited. + * Copyright (c) 2017-2023 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -23,354 +23,140 @@ */ #include "arm_compute/runtime/NEON/functions/NEGEMM.h" -#include "arm_compute/core/CPP/Validate.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/ITensorPack.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" -#include "arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h" -#include "arm_compute/runtime/TensorAllocator.h" +#include "arm_compute/runtime/MemoryGroup.h" +#include "arm_compute/runtime/Tensor.h" -#include <cmath> +#include "src/core/CPP/Validate.h" +#include "src/core/helpers/MemoryHelpers.h" +#include "src/cpu/operators/CpuGemm.h" -using namespace arm_compute::misc::shape_calculator; +using namespace arm_compute::experimental; namespace arm_compute { -NEGEMM::NEGEMM(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager *weights_manager) - : _memory_group(memory_manager), _weights_manager(weights_manager), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _asm_glue(memory_manager, weights_manager), _ma_kernel(), - _alpha_scale_func(nullptr), _add_bias_kernel(), _activation_func(), _tmp_a(), _tmp_b(), _tmp_d(), _original_b(nullptr), _run_vector_matrix_multiplication(false), _run_alpha_scale(false), - _run_addition(false), _run_bias_addition(false), _run_activation(false), _reshape_b_only_on_first_run(false), _is_prepared(false) -{ -} - -void NEGEMM::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, float alpha, float beta, const GEMMInfo &gemm_info) +struct NEGEMM::Impl { - ARM_COMPUTE_ERROR_THROW_ON(NEGEMM::validate(a->info(), b->info(), (c != nullptr) ? c->info() : nullptr, d->info(), alpha, beta, gemm_info)); - - const bool is_c_bias = gemm_info.reshape_b_only_on_first_run(); - bool run_optimised = bool(NEGEMMAssemblyDispatch::validate(a->info(), b->info(), (is_c_bias && c != nullptr) ? c->info() : nullptr, d->info(), gemm_info)); - - // Check if we need to reshape the matrix B only on the first run - _is_prepared = false; - _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run(); - _run_vector_matrix_multiplication = a->info()->dimension(1) < 2; - _original_b = b; - _run_alpha_scale = alpha != 1.f; - _run_bias_addition = c != nullptr && gemm_info.reshape_b_only_on_first_run(); - _run_addition = beta != 0 && c != nullptr && !gemm_info.reshape_b_only_on_first_run(); - _run_activation = gemm_info.activation_info().enabled() && (!run_optimised || (run_optimised && !NEGEMMAssemblyDispatch::is_activation_supported(gemm_info.activation_info()))); - - if(run_optimised) - { - const ITensor *c_to_use = is_c_bias ? 
c : nullptr; - if(MEMInfo::get_policy() == MemoryPolicy::MINIMIZE) - { - GEMMInfo gemm_info_ntb = gemm_info; - gemm_info_ntb.set_pretranpose_B(false); - _asm_glue.configure(a, b, c_to_use, d, gemm_info_ntb); - } - else - { - _asm_glue.configure(a, b, c_to_use, d, gemm_info); - } - ARM_COMPUTE_ERROR_ON(!_asm_glue.is_configured()); - - // Scale product by alpha - if(_run_alpha_scale) - { - _alpha_scale_func.configure(d, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LINEAR, alpha, 0.f)); - } - } - else - { - // Pick output tensor in case bias addition should be performed - ITensor *gemm_output_to_use = d; - if(_run_bias_addition) - { - gemm_output_to_use = &_tmp_d; - _memory_group.manage(&_tmp_d); - } - - // Select between GEMV and GEMM - if(_run_vector_matrix_multiplication) - { - // Configure the matrix multiply kernel - _mm_kernel.configure(a, b, gemm_output_to_use, alpha, false); - } - else - { - TensorShape shape_tmp_a = a->info()->tensor_shape(); - TensorShape shape_tmp_b = b->info()->tensor_shape(); + MemoryGroup memory_group{}; + IWeightsManager *weights_manager{nullptr}; - shape_tmp_a.set(0, a->info()->dimension(0) * 4); - shape_tmp_a.set(1, std::ceil(a->info()->dimension(1) / 4.0f)); + std::unique_ptr<cpu::CpuGemm> op{nullptr}; - const unsigned int transpose_w = 16 / data_size_from_type(b->info()->data_type()); - shape_tmp_b.set(0, b->info()->dimension(1) * transpose_w); - shape_tmp_b.set(1, std::ceil(b->info()->dimension(0) / static_cast<float>(transpose_w))); + const ITensor *original_b{nullptr}; + bool is_prepared{false}; - TensorInfo info_a = a->info()->clone()->set_tensor_shape(shape_tmp_a).set_is_resizable(true); - TensorInfo info_b = b->info()->clone()->set_tensor_shape(shape_tmp_b).set_is_resizable(true); + ITensorPack run_pack{}; + ITensorPack prep_pack{}; + WorkspaceData<Tensor> workspace{}; + experimental::MemoryRequirements aux_mem_req{}; +}; - _tmp_a.allocator()->init(info_a); - _tmp_b.allocator()->init(info_b); - - // Manage intermediate buffers - _memory_group.manage(&_tmp_a); - if(!_reshape_b_only_on_first_run) - { - _memory_group.manage(&_tmp_b); - } - - int m = a->info()->dimension(1); - int n = b->info()->dimension(0); - int k = a->info()->dimension(0); - - // Configure interleave kernel - _interleave_kernel.configure(a, &_tmp_a); - - // Configure transpose kernel - _transpose_kernel.configure(b, &_tmp_b); - - // Configure matrix multiplication kernel - _mm_kernel.configure(&_tmp_a, &_tmp_b, gemm_output_to_use, alpha, true, GEMMReshapeInfo(m, n, k)); - - // Allocate once the all configure methods have been called - _tmp_a.allocator()->allocate(); - if(!_reshape_b_only_on_first_run) - { - _tmp_b.allocator()->allocate(); - } - } - - if(_run_bias_addition) - { - _add_bias_kernel.configure(gemm_output_to_use, c, d, ConvertPolicy::SATURATE); - _tmp_d.allocator()->allocate(); - } - } - - // Configure matrix addition kernel - if(_run_addition) - { - _ma_kernel.configure(c, d, beta); - } - - // Configure activation - const ActivationLayerInfo &activation = gemm_info.activation_info(); - if(_run_activation) - { - _activation_func.configure(d, nullptr, activation); - } +NEGEMM::NEGEMM(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager *weights_manager) + : _impl(std::make_unique<Impl>()) +{ + _impl->memory_group = MemoryGroup(std::move(memory_manager)); + _impl->weights_manager = weights_manager; } -Status NEGEMM::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float 
beta, const GEMMInfo &gemm_info) -{ - ARM_COMPUTE_UNUSED(alpha); - const bool is_c_bias = gemm_info.reshape_b_only_on_first_run(); +NEGEMM::~NEGEMM() = default; - ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(a); - ARM_COMPUTE_RETURN_ERROR_ON_CPU_BF16_UNSUPPORTED(a); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::BFLOAT16, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->dimension(0) != b->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported"); - if(a->data_type() != DataType::BFLOAT16) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, output); - } +void NEGEMM::configure(const ITensor *a, + const ITensor *b, + const ITensor *c, + ITensor *d, + float alpha, + float beta, + const GEMMInfo &gemm_info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, d); + ARM_COMPUTE_ERROR_THROW_ON(cpu::CpuGemm::validate(a->info(), b->info(), (c != nullptr) ? c->info() : nullptr, + d->info(), alpha, beta, gemm_info)); - if(c != nullptr && !is_c_bias) - { - ARM_COMPUTE_RETURN_ERROR_ON(gemm_info.depth_output_gemm3d() != 0); - ARM_COMPUTE_RETURN_ERROR_ON(gemm_info.reinterpret_input_as_3d()); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(c, output); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->dimension(1) != c->dimension(1), "The C matrix must have the same number of rows as the matrix A"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(b->dimension(0) != c->dimension(0), "The C matrix must have the same number of columns as the matrix B"); - } + // Check if we need to reshape the matrix B only on the first run + _impl->is_prepared = false; + _impl->original_b = b; + _impl->op = std::make_unique<cpu::CpuGemm>(); - if(output->total_size() != 0) + // Make the B matrix dynamic values. + auto b_info_to_use = b->info()->clone(); + if (!gemm_info.reshape_b_only_on_first_run()) { - ARM_COMPUTE_RETURN_ERROR_ON(b->dimension(0) != output->dimension(0)); - if(gemm_info.depth_output_gemm3d() != 0) - { - if(gemm_info.reinterpret_input_as_3d()) - { - ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1)); - ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(2) != output->dimension(2)); - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1) * output->dimension(2)); - } - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1)); - } + b_info_to_use->set_are_values_constant(false); } - // Check if we need to run the optimized assembly kernel - const bool run_optimised = bool(NEGEMMAssemblyDispatch::validate(a, b, is_c_bias ? c : nullptr, output, gemm_info)); - - if(!run_optimised) - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.reinterpret_input_as_3d(), "NEGEMM cannot reinterpret the input tensor as 3D"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.depth_output_gemm3d() != 0, "NEGEMM cannot reinterpret the output tensor as 3D"); - - // Check if the first input tensor is a vector. 
- const bool run_vector_matrix_multiplication = a->dimension(1) < 2; - // Check if we need to reshape the matrix A and matrix B - const bool run_interleave_transpose = !run_vector_matrix_multiplication && !(gemm_info.reshape_b_only_on_first_run()); - - // Arguments used by GEMMReshapeInfo - // If we pass the matrix A and matrix B reshaped to NEGEMMMatrixMultiplyKernel, we need to pass m, n, k, mult_transpose1xW_width and mult_interleave4x4_height to NEGEMMReshapeInfo - // in order to know how the matrices have been reshaped - const int m = a->dimension(1); - const int n = b->dimension(0); - const int k = a->dimension(0); - int mult_transpose1xW_width = 1; - int mult_interleave4x4_height = 1; - - const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, gemm_info.depth_output_gemm3d()); - - const ITensorInfo *matrix_a_info = a; - const ITensorInfo *matrix_b_info = b; - - TensorInfo tmp_a_info{}; - TensorInfo tmp_b_info{}; - TensorInfo tmp_output_info = *output->clone(); - - if(run_interleave_transpose) - { - matrix_a_info = &tmp_a_info; - matrix_b_info = &tmp_b_info; - - // Validate interleave kernel - auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_interleaved_shape(*a, mult_interleave4x4_height, gemm_info.reinterpret_input_as_3d()))); - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMInterleave4x4Kernel::validate(a, &tmp_a_info)); - - // Validate transpose kernel - auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_transpose1xW_with_element_size_shape(*b, mult_transpose1xW_width))); - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMTranspose1xWKernel::validate(b, &tmp_b_info)); - } + _impl->op->configure(a->info(), b_info_to_use.get(), (c != nullptr) ? c->info() : nullptr, d->info(), alpha, beta, + gemm_info); - // Validate matrix multiply - auto_init_if_empty(tmp_output_info, matrix_a_info->clone()->set_tensor_shape(compute_mm_shape(*matrix_a_info, *matrix_b_info, run_interleave_transpose, reshape_info))); - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, &tmp_output_info, alpha, run_interleave_transpose, reshape_info)); - - if(c != nullptr && gemm_info.reshape_b_only_on_first_run()) - { - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAdditionKernel::validate(&tmp_output_info, c, output, ConvertPolicy::SATURATE)); - } - } + _impl->aux_mem_req = _impl->op->workspace(); + _impl->run_pack = {{ACL_SRC_0, a}, {ACL_SRC_1, b}, {ACL_SRC_2, c}, {ACL_DST, d}}; + _impl->prep_pack = {{ACL_SRC_1, b}, {ACL_SRC_2, c}}; + _impl->workspace = + manage_workspace<Tensor>(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack); +} - // Validate matrix addition kernel - if(beta != 0 && c != nullptr && !is_c_bias) +Status NEGEMM::validate(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info) +{ + // Make the B matrix dynamic values. 
+ auto b_to_use = b->clone(); + if (!gemm_info.reshape_b_only_on_first_run()) { - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMMatrixAdditionKernel::validate(c, output, beta)); + b_to_use->set_are_values_constant(false); } - // Validate activation - const ActivationLayerInfo &activation = gemm_info.activation_info(); - if(activation.enabled()) - { - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(output, nullptr, activation)); - } + return cpu::CpuGemm::validate(a, b_to_use.get(), c, output, alpha, beta, gemm_info); +} - return Status{}; +Status NEGEMM::has_opt_impl(arm_compute::WeightFormat &expected_weight_format, + const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info) +{ + ARM_COMPUTE_UNUSED(alpha, beta); + return cpu::CpuGemm::has_opt_impl(expected_weight_format, a, b, c, output, gemm_info); } void NEGEMM::run() { prepare(); - MemoryGroupResourceScope scope_mg(_memory_group); - - if(_asm_glue.is_configured()) - { - _asm_glue.run(); - if(_run_alpha_scale) - { - _alpha_scale_func.run(); - } - } - else - { - if(!_run_vector_matrix_multiplication) - { - // Run interleave kernel - NEScheduler::get().schedule(&_interleave_kernel, Window::DimY); - - if(!_reshape_b_only_on_first_run) - { - // Run transpose kernel - NEScheduler::get().schedule(&_transpose_kernel, Window::DimY); - } - } - - NEScheduler::get().schedule(&_mm_kernel, _run_vector_matrix_multiplication ? Window::DimX : Window::DimY); - - // Run bias addition kernel - if(_run_bias_addition) - { - NEScheduler::get().schedule(&_add_bias_kernel, Window::DimY); - } - } - - // Run matrix addition kernel - if(_run_addition) - { - NEScheduler::get().schedule(&_ma_kernel, Window::DimY); - } - - // Run activation function - if(_run_activation) - { - _activation_func.run(); - } + MemoryGroupResourceScope scope_mg(_impl->memory_group); + _impl->op->run(_impl->run_pack); } void NEGEMM::prepare() { - if(!_is_prepared) + if (!_impl->is_prepared) { - const bool original_b_managed_by_weights_manager = _weights_manager && _weights_manager->are_weights_managed(_original_b); - if(_asm_glue.is_configured()) - { - if(!original_b_managed_by_weights_manager) - { - ARM_COMPUTE_ERROR_ON(!_original_b->is_used()); - } + _impl->op->prepare(_impl->prep_pack); + + auto has_reshape = + std::find_if(_impl->aux_mem_req.begin(), _impl->aux_mem_req.end(), + [](const MemoryInfo &m) -> bool { return m.lifetime == MemoryLifetime::Persistent; }); - _asm_glue.prepare(); - if(!original_b_managed_by_weights_manager) - { - _original_b->mark_as_unused(); - } + if (has_reshape != std::end(_impl->aux_mem_req)) + { + _impl->original_b->mark_as_unused(); } - else if(_reshape_b_only_on_first_run && !_run_vector_matrix_multiplication && !_asm_glue.is_configured()) + else { - if(!original_b_managed_by_weights_manager) - { - ARM_COMPUTE_ERROR_ON(!_original_b->is_used()); - } - - _tmp_b.allocator()->allocate(); - NEScheduler::get().schedule(&_transpose_kernel, Window::DimY); - if(!original_b_managed_by_weights_manager) - { - _original_b->mark_as_unused(); - } + _impl->run_pack.add_const_tensor(ACL_SRC_1, _impl->original_b); } - _is_prepared = true; + // Release temporary tensors that are only used in prepare stage + release_temporaries<Tensor>(_impl->aux_mem_req, _impl->workspace); + _impl->is_prepared = true; } } } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp b/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp deleted file mode 100644 
index 24bd7d7a8c..0000000000 --- a/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp +++ /dev/null @@ -1,622 +0,0 @@ -/* - * Copyright (c) 2018-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h" - -#include "arm_compute/core/CPP/Validate.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" -#include "arm_compute/runtime/NEON/functions/NESimpleAssemblyFunction.h" - -#include <arm_neon.h> - -namespace arm_compute -{ -namespace -{ -arm_gemm::Activation map_to_arm_gemm_activation(const ActivationLayerInfo &act) -{ - arm_gemm::Activation gemm_act; - - // Early exit in case lower bound is other than 0, as it's not yet supported - if(act.b() != 0.f) - { - return gemm_act; - } - - switch(act.activation()) - { - case ActivationLayerInfo::ActivationFunction::RELU: - gemm_act.type = arm_gemm::Activation::Type::ReLU; - break; - case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU: - gemm_act.type = arm_gemm::Activation::Type::BoundedReLU; - gemm_act.param1 = act.a(); - gemm_act.param2 = 0.f; - break; - case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU: - gemm_act.type = arm_gemm::Activation::Type::BoundedReLU; - gemm_act.param1 = act.a(); - gemm_act.param2 = act.b(); - break; - default: - gemm_act.type = arm_gemm::Activation::Type::None; - } - - return gemm_act; -} - -template <typename TypeInput, typename TypeOutput> -class FallbackTransform : public ITransformWeights -{ -public: - FallbackTransform() noexcept {}; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - FallbackTransform(const FallbackTransform &) = delete; - /** Default move constructor */ - FallbackTransform(FallbackTransform &&) = default; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - FallbackTransform &operator=(const FallbackTransform &) = delete; - /** Default move assignment operator */ - FallbackTransform &operator=(FallbackTransform &&) = default; - void run() override - { - _output.allocator()->allocate(); - ARM_COMPUTE_ERROR_ON(_output.buffer() == nullptr); - _gemm_kernel_asm->pretranspose_B_array(_output.buffer(), _in1_ptr, _ldb, _multi_stride_b); - _reshape_run = true; - } - - void release() override - { - _output.allocator()->free(); - } - - ITensor *get_weights() override - { - return &_output; - } - - uint32_t uid() override - { - uint32_t id = 
(_B_pretranspose_size | 0x80000000); - return id; - } - - void configure(size_t B_pretranspose_size, unsigned int alignment) - { - _output.allocator()->init(TensorInfo(TensorShape{ (B_pretranspose_size + alignment /* FIXME: remove alignment after COMPMID-1088 */) }, 1, DataType::S8), alignment); - _B_pretranspose_size = B_pretranspose_size; - } - - void set_pretranspose(ITensor *tensor) - { - if(!_reshape_run) - { - _gemm_kernel_asm->set_pretransposed_B_data(tensor->buffer()); - } - } - - void set_args(const int ldb, const TypeInput *in1_ptr, const int multi_stride_b, std::shared_ptr<arm_gemm::GemmCommon<TypeInput, TypeOutput>> gemm_kernel_asm) - { - _ldb = ldb; - _in1_ptr = in1_ptr; - _multi_stride_b = multi_stride_b; - _gemm_kernel_asm = gemm_kernel_asm; - } - -private: - Tensor _output{}; - int _ldb{}; - const TypeInput *_in1_ptr{}; - int _multi_stride_b{}; - size_t _B_pretranspose_size{}; - std::shared_ptr<arm_gemm::GemmCommon<TypeInput, TypeOutput>> _gemm_kernel_asm{ nullptr }; -}; - -/** Fallback in case ACL doesn't have a function */ -template <typename TypeInput, typename TypeOutput, class OutputStage = arm_gemm::Nothing> -class Fallback : public NEGEMMAssemblyDispatch::IFallback -{ -public: - /** Destructor */ - ~Fallback() - { - // Release memory if we have allocated the memory ourselves - if(_pretranspose && !(_weights_manager && _weights_manager->are_weights_managed(_b))) - { - delete _pretranspose; - } - } - - /** Initialise the functions's input and output. - * - * @param[in] a Input tensor containing the Matrix A. - * @param[in] b Input tensor containing the Matrix B. - * @param[in] c Input tensor containing the Matrix C. - * @param[out] d Output tensor to store the result of matrix multiplication. - * @param[in] args Matrix multiplication information. - * @param[in] gemm_info GEMM meta-data - * @param[in] memory_group Memory group to be used by the function. - * @param[in] weights_manager Weights manager to be used by the function. - * @param[in] os Output stage meta-data. - */ - void configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, - arm_gemm::GemmArgs args, const GEMMInfo &gemm_info, - MemoryGroup &memory_group, IWeightsManager *weights_manager, const OutputStage &os = {}); - - /** Set requantization shifts to be used - * - * @param[in] shifts Requantization shifts - * - * @return Pointer to the shift data - */ - /** Set requantization data to be used - * - * - * @param shifts Requantization shifts - * @param multipliers Requantization multipliers - * - * @return A tuple with the pointers to the shift and multiplier data respectively - */ - std::tuple<const int32_t *, const int32_t *> set_requantize_data(const std::vector<int32_t> &shifts, - const std::vector<int32_t> &multipliers); - - // Inherited methods overridden: - void run() override; - void prepare() override; - bool is_configured() const override; - -private: - /** Allocate a workspace tensor. - * - * @param[in] workspace_size Size to allocate. - * @param[in] memory_group Tensor memory group. - * @param[in] alignment Workspace memory alignment. 
- */ - void allocate_workspace(size_t workspace_size, MemoryGroup &memory_group, size_t alignment); - - /** Assembly Gemm kernel */ - std::shared_ptr<arm_gemm::GemmCommon<TypeInput, TypeOutput>> _gemm_kernel_asm{ nullptr }; - /** Optimised NEON kernel */ - std::unique_ptr<INEKernel> _optimised_kernel{ nullptr }; - /** Input A */ - const ITensor *_a - { - nullptr - }; - /** Input B */ - const ITensor *_b - { - nullptr - }; - const ITensor *_c - { - nullptr - }; - /** Output */ - ITensor *_d{ nullptr }; - /** GEMM workspace */ - Tensor _workspace{}; - /** Pre-transpose tensor */ - ITensor *_pretranspose{ nullptr }; - /** Prepared flag */ - bool _is_prepared{ false }; - /** GEMM meta-data */ - GEMMInfo _gemm_info{}; - /** Weights manager */ - IWeightsManager *_weights_manager{ nullptr }; - /** Weights transform object */ - FallbackTransform<TypeInput, TypeOutput> _weights_transform{}; - /** GEMM kernel description */ - arm_gemm::KernelDescription _kernel_info{}; - /** Per channel quantization shifts */ - std::vector<int32_t> _shifts{}; - /** Per channel quantization multipliers */ - std::vector<int32_t> _multipliers{}; -}; - -template <typename TypeInput, typename TypeOutput, class OutputStage> -std::tuple<const int32_t *, const int32_t *> Fallback<TypeInput, TypeOutput, OutputStage>::set_requantize_data(const std::vector<int32_t> &shifts, - const std::vector<int32_t> &multipliers) -{ - _multipliers = multipliers; - _shifts = shifts; - std::transform(_shifts.begin(), _shifts.end(), _shifts.begin(), std::negate<int32_t>()); - return std::make_tuple(_shifts.data(), _multipliers.data()); -} - -template <typename TypeInput, typename TypeOutput, class OutputStage> -void Fallback<TypeInput, TypeOutput, OutputStage>::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, - arm_gemm::GemmArgs args, const GEMMInfo &gemm_info, - MemoryGroup &memory_group, IWeightsManager *weights_manager, const OutputStage &os) -{ - arm_gemm::GemmConfig gemm_cfg; - _kernel_info = arm_gemm::get_gemm_method<TypeInput, TypeOutput, OutputStage>(args, os); - _weights_manager = weights_manager; - if(_kernel_info.method != arm_gemm::GemmMethod::GEMV_BATCHED) - { - gemm_cfg.filter = _kernel_info.name; - args._cfg = &gemm_cfg; - } - _gemm_kernel_asm = arm_gemm::gemm<TypeInput, TypeOutput, OutputStage>(args, os); - if(_gemm_kernel_asm == nullptr) - { - //configuration not supported: Leave function unconfigured: - return; - } - - // arm_compute wrapper for the Gemm object (see above) - std::unique_ptr<NEGEMMAssemblyWrapperKernel<TypeInput, TypeOutput>> acl_gemm_wrapper = support::cpp14::make_unique<NEGEMMAssemblyWrapperKernel<TypeInput, TypeOutput>>(); - ARM_COMPUTE_ERROR_ON(acl_gemm_wrapper == nullptr); - acl_gemm_wrapper->configure(_gemm_kernel_asm.get(), gemm_cfg.filter); - const size_t workspace_size = _gemm_kernel_asm->get_working_size(); - if(workspace_size > 0) - { - // Allocate workspace - const unsigned int alignment = 4096; - allocate_workspace(workspace_size, memory_group, alignment); - } - - //if we disable this code below in brackets then ConvLayer deadlocks when threads > 1 and - //the shapes are In=1x1x1024 Weights=1x1x1024x1001 Biases=1001 Out=1x1x1001 - { - const unsigned int window_size = get_total_window_size(*_gemm_kernel_asm); - if(window_size < static_cast<unsigned int>(args._maxthreads)) - { - _gemm_kernel_asm->set_nthreads(window_size); - } - } - - _optimised_kernel = std::move(acl_gemm_wrapper); - _a = a; - _b = b; - _c = c; - _d = d; - _gemm_info = gemm_info; - // Check for 
pre-transposed support
- if(_gemm_kernel_asm->B_pretranspose_required())
- {
- // Forcing 128-byte alignment (required by 32-bit kernels)
- const unsigned int alignment = 128;
- const size_t B_pretranspose_size = _gemm_kernel_asm->get_B_pretransposed_array_size();
- if(weights_manager && _weights_manager->are_weights_managed(b))
- {
- _weights_transform.configure(B_pretranspose_size, alignment);
- _pretranspose = _weights_manager->acquire(b, &_weights_transform);
- }
- else
- {
- _pretranspose = new Tensor();
- static_cast<Tensor *>(_pretranspose)->allocator()->init(TensorInfo(TensorShape{ (B_pretranspose_size + alignment /* FIXME: remove alignment after COMPMID-1088 */) }, 1, DataType::S8), alignment);
- }
- }
-}
-
-template <typename TypeInput, typename TypeOutput, class OutputStage>
-void Fallback<TypeInput, TypeOutput, OutputStage>::prepare()
-{
- if(!_is_prepared)
- {
- // Set up the matrix bias in the assembly kernel; it's just a pointer to matrix C.
- if(_c && _c->info()->data_type() == DataType::S32)
- {
- _gemm_kernel_asm->set_quantized_bias(reinterpret_cast<const int32_t *>(_c->buffer() + _c->info()->offset_first_element_in_bytes()), 0);
- }
-
- // Pretranspose B if required
- if(_gemm_kernel_asm->B_pretranspose_required())
- {
- const int ldb = _b->info()->strides_in_bytes().y() / sizeof(TypeInput);
- const auto in1_ptr = reinterpret_cast<const TypeInput *>(_b->buffer() + _b->info()->offset_first_element_in_bytes());
- const int multi_stride_b = _b->info()->strides_in_bytes().z() / sizeof(TypeInput);
-
- if(_weights_manager && _weights_manager->are_weights_managed(_b))
- {
- _weights_transform.set_args(ldb, in1_ptr, multi_stride_b, _gemm_kernel_asm);
- _weights_manager->run(_b, &_weights_transform);
-
- // If we didn't run the reshape function, set the pretransposed buffer
- if(!_weights_transform.is_reshape_run())
- {
- _weights_transform.set_pretranspose(_pretranspose);
- }
- }
- else
- {
- static_cast<Tensor *>(_pretranspose)->allocator()->allocate();
- ARM_COMPUTE_ERROR_ON(_pretranspose->buffer() == nullptr);
- _gemm_kernel_asm->pretranspose_B_array(_pretranspose->buffer(), in1_ptr, ldb, multi_stride_b);
- _b->mark_as_unused();
- }
- }
-
- _is_prepared = true;
- }
-}
-
-template <typename TypeInput, typename TypeOutput, class OutputStage>
-void Fallback<TypeInput, TypeOutput, OutputStage>::allocate_workspace(size_t workspace_size, MemoryGroup &memory_group, size_t alignment)
-{
- ARM_COMPUTE_ERROR_ON_MSG(workspace_size == 0, "size cannot be 0");
- _workspace.allocator()->init(TensorInfo(TensorShape{ (workspace_size + alignment /* FIXME: remove alignment after COMPMID-1088 */) }, 1, DataType::S8), alignment);
- memory_group.manage(&_workspace);
- _workspace.allocator()->allocate();
-}
-
-template <typename TypeInput, typename TypeOutput, class OutputStage>
-bool Fallback<TypeInput, TypeOutput, OutputStage>::is_configured() const
-{
- return _optimised_kernel != nullptr;
-}
-
-template <typename TypeInput, typename TypeOutput, class OutputStage>
-void Fallback<TypeInput, TypeOutput, OutputStage>::run()
-{
- const int lda = _a->info()->strides_in_bytes().y() / sizeof(TypeInput);
- int ldb = 0;
- const int ldd = _d->info()->strides_in_bytes().y() / sizeof(TypeOutput);
-
- const size_t a_batch_idx = _gemm_info.reinterpret_input_as_3d() != 0 ? 3 : 2;
- const size_t a_multi_idx = a_batch_idx + 1;
- const size_t d_batch_idx = _gemm_info.depth_output_gemm3d() != 0 ? 3 : 2;
- const size_t d_multi_idx = d_batch_idx + 1;
-
- const int batch_stride_a = _a->info()->strides_in_bytes()[a_batch_idx] / sizeof(TypeInput);
- const int batch_stride_d = _d->info()->strides_in_bytes()[d_batch_idx] / sizeof(TypeOutput);
-
- const int multi_stride_a = _a->info()->strides_in_bytes()[a_multi_idx] / sizeof(TypeInput);
- int multi_stride_b = 0;
- const int multi_stride_d = _d->info()->strides_in_bytes()[d_multi_idx] / sizeof(TypeOutput);
-
- const auto in0_ptr = reinterpret_cast<const TypeInput *>(_a->buffer() + _a->info()->offset_first_element_in_bytes());
- const TypeInput *in1_ptr = nullptr;
- auto out_ptr = reinterpret_cast<TypeOutput *>(_d->buffer() + _d->info()->offset_first_element_in_bytes());
-
- // Check if B is pre-transposed and de-reference if not
- if(!_gemm_kernel_asm->B_is_pretransposed())
- {
- ldb = _b->info()->strides_in_bytes().y() / sizeof(TypeInput);
- multi_stride_b = _b->info()->strides_in_bytes().z() / sizeof(TypeInput);
- in1_ptr = reinterpret_cast<const TypeInput *>(_b->buffer() + _b->info()->offset_first_element_in_bytes());
- }
-
- // Set workspace if needed and reset number of threads as buffer manager gets re-created with max_threads
- if(_workspace.buffer() != nullptr)
- {
- _gemm_kernel_asm->set_working_space(reinterpret_cast<void *>(_workspace.buffer()));
- const unsigned int window_size = get_total_window_size(*_gemm_kernel_asm);
- unsigned int num_threads = NEScheduler::get().num_threads();
- if(window_size < num_threads)
- {
- num_threads = window_size;
- _gemm_kernel_asm->set_nthreads(num_threads);
- }
- }
-
- // Prepare assembly kernel
- prepare();
-
- TypeOutput *bias = nullptr;
- // Set up the matrix bias in the assembly kernel; it's just a pointer to matrix C.
- if(_c && _c->info()->data_type() != DataType::S32)
- {
- bias = reinterpret_cast<TypeOutput *>(_c->buffer() + _c->info()->offset_first_element_in_bytes());
- }
- // Set gemm parameters
- _gemm_kernel_asm->set_arrays(in0_ptr, lda, batch_stride_a, multi_stride_a,
- in1_ptr, ldb, multi_stride_b,
- out_ptr, ldd, batch_stride_d, multi_stride_d,
- bias, 0);
- // Schedule assembly kernel
- IScheduler::Hints scheduling_hint = IScheduler::Hints(Window::DimX);
- if(_kernel_info.method == arm_gemm::GemmMethod::GEMM_INTERLEAVED && _d->info()->data_type() == DataType::F32)
- {
- const int granule_threshold = 200;
- scheduling_hint = IScheduler::Hints(Window::DimX, IScheduler::StrategyHint::DYNAMIC, granule_threshold);
-
- }
- else if(_kernel_info.method == arm_gemm::GemmMethod::GEMM_INTERLEAVED_2D && _d->info()->data_type() == DataType::F32)
- {
- //GEMM_INTERLEAVED supports 2D parallelism, IScheduler::split_dimensions_all signals to parallelise over all window dimensions
- const int granule_threshold = 200;
- scheduling_hint = IScheduler::Hints(IScheduler::split_dimensions_all, IScheduler::StrategyHint::STATIC, granule_threshold);
- }
-
- NEScheduler::get().schedule(_optimised_kernel.get(), scheduling_hint);
-}
-
-template <typename TypeInput, typename TypeOutput>
-void create_arm_gemm(std::unique_ptr<NEGEMMAssemblyDispatch::IFallback> &arm_gemm, MemoryGroup &memory_group,
- const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, arm_gemm::Activation activation, const GEMMInfo &gemm_info,
- IWeightsManager *weights_manager)
-{
- INEGEMMWrapperKernel::Params p = INEGEMMWrapperKernel::extract_parameters(a, b, d, gemm_info);
- const CPUInfo &ci = NEScheduler::get().cpu_info();
- unsigned int num_threads = NEScheduler::get().num_threads();
-
- arm_gemm::GemmArgs args(&ci, p.M,
p.N, p.K, p.batches, p.multis, false, false, activation, num_threads, gemm_info.pretranpose_B()); - - // Create arm_gemm fallback - auto fallback = support::cpp14::make_unique<Fallback<TypeInput, TypeOutput>>(); - fallback->configure(a, b, c, d, args, gemm_info, memory_group, weights_manager); - arm_gemm = std::move(fallback); -} - -template <typename TypeInput, typename TypeOutput> -void create_arm_gemm_quant(std::unique_ptr<NEGEMMAssemblyDispatch::IFallback> &arm_gemm, MemoryGroup &memory_group, - const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, arm_gemm::Activation activation, const GEMMInfo &gemm_info, - IWeightsManager *weights_manager) -{ - INEGEMMWrapperKernel::Params p = INEGEMMWrapperKernel::extract_parameters(a, b, d, gemm_info); - const CPUInfo &ci = NEScheduler::get().cpu_info(); - unsigned int num_threads = NEScheduler::get().num_threads(); - - arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.batches, p.multis, false, false, activation, num_threads, gemm_info.pretranpose_B()); - - // Create arm_gemm fallback - auto fallback = support::cpp14::make_unique<Fallback<TypeInput, TypeOutput, arm_gemm::Requantize32>>(); - - // Configure requantization info - const int32_t a_offset = -a->info()->quantization_info().uniform().offset; - const int32_t b_offset = -b->info()->quantization_info().uniform().offset; - const GEMMLowpOutputStageInfo os_info = gemm_info.gemmlowp_output_stage(); - - arm_gemm::Requantize32 gemm_requant_info{}; - if(os_info.gemmlowp_shifts.size() > 1) - { - const auto requantize_data = fallback->set_requantize_data(os_info.gemmlowp_shifts, os_info.gemmlowp_multipliers); - gemm_requant_info = arm_gemm::Requantize32(nullptr, 0, - a_offset, b_offset, os_info.gemmlowp_offset, - std::get<0>(requantize_data), std::get<1>(requantize_data), - os_info.gemmlowp_min_bound, os_info.gemmlowp_max_bound); - } - else - { - gemm_requant_info = arm_gemm::Requantize32(nullptr, 0, - a_offset, b_offset, os_info.gemmlowp_offset, - -os_info.gemmlowp_shift, os_info.gemmlowp_multiplier, - os_info.gemmlowp_min_bound, os_info.gemmlowp_max_bound); - } - - // Configure fallback - fallback->configure(a, b, c, d, args, gemm_info, memory_group, weights_manager, gemm_requant_info); - arm_gemm = std::move(fallback); -} - -} //namespace - -NEGEMMAssemblyDispatch::NEGEMMAssemblyDispatch(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager *weights_manager) - : _arm_gemm(nullptr), _memory_group(std::move(memory_manager)), _weights_manager(weights_manager) -{ -} - -Status NEGEMMAssemblyDispatch::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d, const GEMMInfo &gemm_info) -{ - ARM_COMPUTE_UNUSED(gemm_info, c); - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(a, b, d); - ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(a); - ARM_COMPUTE_RETURN_ERROR_ON_CPU_BF16_UNSUPPORTED(a); -#ifndef __aarch64__ - ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->element_size() == 1, "8bit integer types only supported for aarch64"); -#endif /* __aarch64__ */ - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S8, - DataType::BFLOAT16, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::S8, - DataType::BFLOAT16, DataType::F16, DataType::F32); - if(is_data_type_quantized_per_channel(b->data_type())) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, 
DataType::QASYMM8_SIGNED, DataType::S8); - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b); - } - ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::F32 && d->data_type() != DataType::F32, "Only F32 output supported for F32 input"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::F16 && d->data_type() != DataType::F16, "Only F16 output supported for F16 input"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::BFLOAT16 && d->data_type() != DataType::F32, "Only F32 output supported for BFLOAT16 input"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::U8 && d->data_type() != DataType::U32, "Only U32 output supported for U8 input"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::S8 && d->data_type() != DataType::S32, "Only S32 output supported for S8 input"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::QASYMM8 && d->data_type() != DataType::QASYMM8, "Only QASYMM8 output supported for QASYMM8 input"); - return Status{}; -} - -bool NEGEMMAssemblyDispatch::is_activation_supported(const ActivationLayerInfo &activation) -{ - arm_gemm::Activation act = map_to_arm_gemm_activation(activation); - return act.type != arm_gemm::Activation::Type::None; -} - -void NEGEMMAssemblyDispatch::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, const GEMMInfo &gemm_info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, d); - arm_gemm::Activation act = map_to_arm_gemm_activation(gemm_info.activation_info()); - - //If we don't support a combination of data types, silently return: it is the caller's responsibility to check if configure() was successful via is_configured() - if(!NEGEMMAssemblyDispatch::validate(a->info(), b->info(), c != nullptr ? c->info() : nullptr, d->info(), gemm_info)) - { - return; - } - - switch(a->info()->data_type()) - { - case DataType::F32: - create_arm_gemm<float, float>(_arm_gemm, _memory_group, a, b, c, d, act, gemm_info, _weights_manager); - break; -#ifdef __aarch64__ - case DataType::U8: - case DataType::QASYMM8: - if(d->info()->data_type() == DataType::S32) - { - create_arm_gemm<uint8_t, uint32_t>(_arm_gemm, _memory_group, a, b, c, d, act, gemm_info, _weights_manager); - } - else - { - create_arm_gemm_quant<uint8_t, uint8_t>(_arm_gemm, _memory_group, a, b, c, d, act, gemm_info, _weights_manager); - } - break; - case DataType::S8: - case DataType::QASYMM8_SIGNED: - if(d->info()->data_type() == DataType::S32) - { - create_arm_gemm<int8_t, int32_t>(_arm_gemm, _memory_group, a, b, c, d, act, gemm_info, _weights_manager); - } - else - { - create_arm_gemm_quant<int8_t, int8_t>(_arm_gemm, _memory_group, a, b, c, d, act, gemm_info, _weights_manager); - } - break; -#endif /* __aarch64__ */ -#if defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16) - case DataType::BFLOAT16: - create_arm_gemm<bfloat16, float>(_arm_gemm, _memory_group, a, b, c, d, act, gemm_info, _weights_manager); - break; -#endif /* defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16) */ -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F16: - create_arm_gemm<float16_t, float16_t>(_arm_gemm, _memory_group, a, b, c, d, act, gemm_info, _weights_manager); - break; -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - default: - break; - } -} - -void NEGEMMAssemblyDispatch::prepare() -{ - ARM_COMPUTE_ERROR_ON(_arm_gemm == nullptr); - _arm_gemm->prepare(); -} - -bool NEGEMMAssemblyDispatch::is_configured() const -{ - return _arm_gemm != nullptr 
&& _arm_gemm->is_configured(); -} - -void NEGEMMAssemblyDispatch::run() -{ - MemoryGroupResourceScope scope_mg(_memory_group); - - ARM_COMPUTE_ERROR_ON(_arm_gemm == nullptr); - _arm_gemm->run(); -} -} //namespace arm_compute diff --git a/src/runtime/NEON/functions/NEGEMMConv2d.cpp b/src/runtime/NEON/functions/NEGEMMConv2d.cpp new file mode 100644 index 0000000000..6cca02eea9 --- /dev/null +++ b/src/runtime/NEON/functions/NEGEMMConv2d.cpp @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2020-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/NEON/functions/NEGEMMConv2d.h" + +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/runtime/Tensor.h" + +#include "src/core/helpers/MemoryHelpers.h" +#include "src/cpu/operators/CpuGemmDirectConv2d.h" + +namespace arm_compute +{ +using OperatorType = cpu::CpuGemmDirectConv2d; +using namespace arm_compute::experimental; + +struct NEGEMMConv2d::Impl +{ + const ITensor *weights{nullptr}; + std::unique_ptr<OperatorType> op{nullptr}; + ITensorPack run_pack{}; + ITensorPack prep_pack{}; + WorkspaceData<Tensor> workspace{}; + MemoryGroup memory_group{}; + bool is_prepared{false}; + experimental::MemoryRequirements aux_mem_req{}; +}; + +NEGEMMConv2d::NEGEMMConv2d(const std::shared_ptr<IMemoryManager> &memory_manager) : _impl(std::make_unique<Impl>()) +{ + _impl->memory_group = MemoryGroup(memory_manager); +} + +NEGEMMConv2d::~NEGEMMConv2d() = default; + +void NEGEMMConv2d::configure( + ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const Conv2dInfo &info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); + + _impl->weights = weights; + _impl->is_prepared = false; + _impl->op = std::make_unique<OperatorType>(); + + _impl->op->configure(input->info(), weights->info(), biases != nullptr ? 
biases->info() : nullptr, output->info(), + info); + + _impl->aux_mem_req = _impl->op->workspace(); + _impl->run_pack = {{TensorType::ACL_SRC_0, input}, {TensorType::ACL_SRC_2, biases}, {TensorType::ACL_DST, output}}; + _impl->prep_pack = {{TensorType::ACL_SRC_1, weights}, {TensorType::ACL_SRC_2, biases}}; + _impl->workspace = + manage_workspace<Tensor>(_impl->op->workspace(), _impl->memory_group, _impl->run_pack, _impl->prep_pack); +} + +Status NEGEMMConv2d::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const Conv2dInfo &info) +{ + return OperatorType::validate(input, weights, biases, output, info); +} + +void NEGEMMConv2d::run() +{ + prepare(); + + MemoryGroupResourceScope scope_mg(_impl->memory_group); + _impl->op->run(_impl->run_pack); +} + +void NEGEMMConv2d::prepare() +{ + if (!_impl->is_prepared) + { + _impl->op->prepare(_impl->prep_pack); + + auto has_reshape = + std::find_if(_impl->aux_mem_req.begin(), _impl->aux_mem_req.end(), + [](const MemoryInfo &m) -> bool { return m.lifetime == MemoryLifetime::Persistent; }); + + if (has_reshape != std::end(_impl->aux_mem_req)) + { + _impl->weights->mark_as_unused(); + } + else + { + _impl->run_pack.add_const_tensor(ACL_SRC_1, _impl->weights); + } + + // Release temporary tensors that are only used in prepare stage + release_temporaries<Tensor>(_impl->aux_mem_req, _impl->workspace); + _impl->is_prepared = true; + } +} +} // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp index a41d23f8d7..c8f65d2fd9 100644 --- a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp +++ b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 ARM Limited. + * Copyright (c) 2017-2022 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -26,571 +26,109 @@ #include "arm_compute/core/Size2D.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "arm_compute/core/utils/quantization/AsymmHelpers.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" +#include "arm_compute/runtime/Tensor.h" -#include <set> -#include <tuple> +#include "src/core/helpers/MemoryHelpers.h" +#include "src/cpu/operators/CpuGemmConv2d.h" -namespace arm_compute -{ -using namespace arm_compute::misc::shape_calculator; - -NEConvolutionLayerReshapeWeights::NEConvolutionLayerReshapeWeights() - : _weights_reshape_kernel() -{ -} - -void NEConvolutionLayerReshapeWeights::configure(const ITensor *weights, const ITensor *biases, ITensor *output) -{ - // Perform validation step - ARM_COMPUTE_ERROR_ON_NULLPTR(weights, output); - ARM_COMPUTE_ERROR_THROW_ON(NEConvolutionLayerReshapeWeights::validate(weights->info(), - (biases != nullptr) ? biases->info() : nullptr, - output->info())); - const bool append_biases = (biases != nullptr) && !is_data_type_quantized_asymmetric(weights->info()->data_type()); - const ITensor *biases_to_use = (append_biases) ? 
biases : nullptr; - - _weights_reshape_kernel.configure(weights, biases_to_use, output); - - output->info()->set_quantization_info(weights->info()->quantization_info()); -} - -Status NEConvolutionLayerReshapeWeights::validate(const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(weights); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, - DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, - DataType::BFLOAT16, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4); - - if(biases != nullptr) - { - const int idx_kernels = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::BATCHES); - ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_asymmetric(weights->data_type())); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases); - ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(idx_kernels)); - ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1); - } - - if((output != nullptr) && (output->total_size() != 0)) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, output); - - NEWeightsReshapeKernel::validate(weights, biases, output); - } - - return Status{}; -} +using namespace arm_compute::experimental; -void NEConvolutionLayerReshapeWeights::run() -{ - NEScheduler::get().schedule(&_weights_reshape_kernel, 3); -} - -NEGEMMConvolutionLayer::NEGEMMConvolutionLayer(const std::shared_ptr<IMemoryManager> &memory_manager, IWeightsManager *weights_manager) - : _memory_group(memory_manager), _weights_manager(weights_manager), _reshape_weights(), _reshape_weights_managed(), _im2col_kernel(), _mm_gemm(memory_manager), _mm_gemmlowp(memory_manager), - _col2im_kernel(), _reshape_layer(), _original_weights(nullptr), _im2col_output(), _weights_reshaped(), _gemm_output(), _tmp_output(), _data_layout(DataLayout::NCHW), _skip_im2col(false), - _skip_col2im(false), _is_quantized(false), _is_prepared(false) +namespace arm_compute { -} - -void NEGEMMConvolutionLayer::configure_mm(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const ActivationLayerInfo &act_info, int gemm_3d_depth) +struct NEGEMMConvolutionLayer::Impl { - ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights); - ARM_COMPUTE_ERROR_THROW_ON(validate_mm(input->info(), weights->info(), biases == nullptr ? nullptr : biases->info(), output == nullptr ? nullptr : output->info(), - act_info, gemm_3d_depth, _skip_im2col)); - - // Create GEMMInfo structure - const GEMMInfo &gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */, - gemm_3d_depth, _skip_im2col /* Reinterpret the input as 3D if im2col is skipped */, - false, GEMMLowpOutputStageInfo(), false, false, act_info); - - // Supported activations in GEMM - const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { ActivationLayerInfo::ActivationFunction::RELU, - ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, - ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU - }; - - if(_is_quantized) - { - // Since we need negative offsets for computing convolution, we need to change QuantizationInfo() - // Extract and negate input and weights offset - const QuantizationInfo iqinfo = input->info()->quantization_info(); - const QuantizationInfo wqinfo = weights->info()->quantization_info(); - const QuantizationInfo oqinfo = (output->info()->total_size() == 0) ? 
iqinfo : output->info()->quantization_info();
- const UniformQuantizationInfo uiqinfo = iqinfo.uniform();
- const UniformQuantizationInfo uoqinfo = oqinfo.uniform();
- const DataType data_type = input->info()->data_type();
-
- input->info()->set_quantization_info(QuantizationInfo(uiqinfo.scale, -uiqinfo.offset));
- if(!is_data_type_quantized_per_channel(weights->info()->data_type()))
- {
- const UniformQuantizationInfo uwqinfo = wqinfo.uniform();
- weights->info()->set_quantization_info(QuantizationInfo(uwqinfo.scale, -uwqinfo.offset));
- }
-
- // Merge activation with output stage
- PixelValue type_min{};
- PixelValue type_max{};
- std::tie(type_min, type_max) = get_min_max(data_type);
- int32_t min_activation = type_min.get<int32_t>();
- int32_t max_activation = type_max.get<int32_t>();
-
- if(supported_acts.count(act_info.activation()) != 0)
- {
- std::tie(min_activation, max_activation) = get_quantized_activation_min_max(act_info, data_type, uoqinfo);
- }
-
- GEMMLowpOutputStageInfo output_info;
- output_info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
- output_info.gemmlowp_offset = uoqinfo.offset;
- output_info.gemmlowp_min_bound = min_activation;
- output_info.gemmlowp_max_bound = max_activation;
- output_info.is_quantized_per_channel = (weights->info()->data_type() == DataType::QSYMM8_PER_CHANNEL);
- quantization::calculate_quantized_multipliers(iqinfo, wqinfo, oqinfo, output_info);
-
- _mm_gemmlowp.configure(input, weights, biases, output, GEMMInfo(false, false, true, gemm_3d_depth, _skip_im2col, false, output_info));
-
- // Revert back QuantizationInfo as input and weights could be used in other convolution layers
- input->info()->set_quantization_info(iqinfo);
- weights->info()->set_quantization_info(wqinfo);
- }
- else
- {
- // Configure matrix multiply function
- _mm_gemm.configure(input, weights, biases, output, 1.0f, 0.0f, gemm_info);
- }
-}
-
-Status NEGEMMConvolutionLayer::validate_mm(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
- const ActivationLayerInfo &act_info, int gemm_3d_depth, bool skip_im2col)
+ const ITensor *weights{nullptr};
+ std::unique_ptr<cpu::CpuGemmConv2d> op{nullptr};
+ ITensorPack run_pack{};
+ MemoryGroup memory_group{};
+ IWeightsManager *weights_manager{nullptr};
+ MemoryRequirements aux_mem_req{};
+ WorkspaceData<Tensor> workspace_tensors{};
+ bool is_prepared{false};
+};
+
+NEGEMMConvolutionLayer::NEGEMMConvolutionLayer(const std::shared_ptr<IMemoryManager> &memory_manager,
+ IWeightsManager *weights_manager)
+ : _impl(std::make_unique<Impl>())
{
- const DataType data_type = input->data_type();
- const bool is_quantized = is_data_type_quantized_asymmetric(data_type);
- const bool is_activation_enabled = act_info.enabled();
-
- // Create GEMMInfo structure
- const GEMMInfo gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */,
- gemm_3d_depth, skip_im2col /* Reinterpret the input as 3D if im2col is skipped */,
- false, GEMMLowpOutputStageInfo(), false, false, act_info);
-
- if(is_quantized)
- {
- // Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
- // Extract and negate input and weights offset
- const QuantizationInfo &iqinfo = input->quantization_info();
- const QuantizationInfo &wqinfo = weights->quantization_info();
- const QuantizationInfo &oqinfo = (output->total_size() == 0) ?
iqinfo : output->quantization_info(); - const UniformQuantizationInfo uoqinfo = oqinfo.uniform(); - - // Merge activation with output stage - PixelValue type_min{}; - PixelValue type_max{}; - std::tie(type_min, type_max) = get_min_max(data_type); - int32_t min_activation = type_min.get<int32_t>(); - int32_t max_activation = type_max.get<int32_t>(); - - const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { ActivationLayerInfo::ActivationFunction::RELU, - ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, - ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU - }; - if(is_activation_enabled && supported_acts.count(act_info.activation()) != 0) - { - std::tie(min_activation, max_activation) = get_quantized_activation_min_max(act_info, data_type, uoqinfo); - } - - GEMMLowpOutputStageInfo output_info; - output_info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT; - output_info.gemmlowp_offset = uoqinfo.offset; - output_info.gemmlowp_min_bound = min_activation; - output_info.gemmlowp_max_bound = max_activation; - output_info.is_quantized_per_channel = (weights->data_type() == DataType::QSYMM8_PER_CHANNEL); - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multipliers(iqinfo, wqinfo, oqinfo, output_info)); - - // Perform validation step on GEMMLowp - std::unique_ptr<ITensorInfo> input_qa = input->clone(); - std::unique_ptr<ITensorInfo> weights_qa = weights->clone(); - input_qa->set_quantization_info(QuantizationInfo(iqinfo.uniform().scale, -iqinfo.uniform().offset)); - weights_qa->set_quantization_info(QuantizationInfo(wqinfo.uniform().scale, -wqinfo.uniform().offset)); - return NEGEMMLowpMatrixMultiplyCore::validate(input_qa.get(), weights_qa.get(), biases, output, GEMMInfo(false, false, true, gemm_3d_depth, skip_im2col, false, output_info)); - } - else - { - // Perform validation step on Matrix multiply function - return NEGEMM::validate(input, weights, nullptr, output, 1.0f, 0.0f, gemm_info); - } + _impl->weights_manager = weights_manager; + _impl->memory_group = MemoryGroup(memory_manager); } - -Status NEGEMMConvolutionLayer::validate_gemm3d(const ITensorInfo *input_info, const ITensorInfo *weights_info, const ActivationLayerInfo &act_info, int gemm_3d_depth, bool skip_im2col) +NEGEMMConvolutionLayer::~NEGEMMConvolutionLayer() = default; + +void NEGEMMConvolutionLayer::configure(const ITensor *input, + const ITensor *weights, + const ITensor *biases, + ITensor *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info, + bool enable_fast_math, + unsigned int num_groups) { - const DataType data_type = input_info->data_type(); - const unsigned int mult_y = skip_im2col ? 1U : gemm_3d_depth; - const unsigned int mult_z = skip_im2col ? 
gemm_3d_depth : 1U; - - // Set dummy tensor shapes for the validation - const TensorInfo dummy_input_info(TensorShape(4U, 4U * mult_y, 1U * mult_z), 1, data_type, input_info->quantization_info()); - const TensorInfo dummy_weights_info(TensorShape(4U, 4U), 1, data_type, weights_info->quantization_info()); - const TensorInfo dummy_output_info(TensorShape(4U, 4U, gemm_3d_depth), 1, data_type, input_info->quantization_info()); + ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); - return validate_mm(&dummy_input_info, &dummy_weights_info, nullptr, &dummy_output_info, act_info, gemm_3d_depth, skip_im2col); + _impl->weights = weights; + _impl->op = std::make_unique<cpu::CpuGemmConv2d>(); + _impl->op->configure(input->info(), weights->info(), (biases != nullptr ? biases->info() : nullptr), output->info(), + conv_info, weights_info, dilation, act_info, enable_fast_math, num_groups); + + _impl->run_pack = {{TensorType::ACL_SRC_0, input}, + {TensorType::ACL_SRC_1, weights}, + {TensorType::ACL_SRC_2, biases}, + {TensorType::ACL_DST, output}}; + _impl->aux_mem_req = _impl->op->workspace(); + _impl->workspace_tensors = + manage_workspace<Tensor>(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->run_pack); } -void NEGEMMConvolutionLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info, - const Size2D &dilation, const ActivationLayerInfo &act_info, unsigned int num_groups) +Status NEGEMMConvolutionLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info, + bool enable_fast_math, + unsigned int num_groups) { - ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); - ARM_COMPUTE_UNUSED(num_groups, weights_info); - ARM_COMPUTE_ERROR_THROW_ON(NEGEMMConvolutionLayer::validate(input->info(), - weights->info(), - biases != nullptr ? 
biases->info() : nullptr, - output->info(), - conv_info, - weights_info, - dilation, - act_info, - num_groups)); - - const DataType data_type = input->info()->data_type(); - const DataLayout data_layout = input->info()->data_layout(); - const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); - const int idx_kernels = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES); - - const unsigned int kernel_width = weights->info()->dimension(idx_width); - const unsigned int kernel_height = weights->info()->dimension(idx_height); - - _is_prepared = weights_info.retain_internal_weights(); - _original_weights = weights; - _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type()); - _data_layout = data_layout; - _skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv_info.stride().first == 1 && conv_info.stride().second == 1); - - const ITensor *gemm_input_to_use = input; - ITensor *gemm_output_to_use = output; - - // Get convolved dimensions - unsigned int conv_w = 0; - unsigned int conv_h = 0; - std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(idx_width), - input->info()->dimension(idx_height), - kernel_width, - kernel_height, - conv_info, - dilation); - - // Check if GEMM3D is supported - if(data_layout == DataLayout::NHWC) - { - _skip_col2im = bool(validate_gemm3d(input->info(), weights->info(), act_info, conv_h, true)); - // If not supported, we need to perform im2col and col2im (or reshape layer) - if(!_skip_col2im) - { - _skip_im2col = false; - } - } - else - { - _skip_col2im = false; - } - - // Get parameters from conv_info - unsigned int stride_x = 0; - unsigned int stride_y = 0; - std::tie(stride_x, stride_y) = conv_info.stride(); - - unsigned int mat_weights_cols = weights->info()->dimension(idx_kernels); - - // _weights_reshaped will be auto configured in the kernel. - // Just append biases and do not transpose 1xW as it will be reshaped in NEGEMM - const ITensor *weights_to_use = weights; - - if(_weights_manager && _weights_manager->are_weights_managed(weights)) - { - _reshape_weights_managed.configure(weights, nullptr); - weights_to_use = _weights_manager->acquire(weights, &_reshape_weights_managed); - } - else - { - _reshape_weights.configure(weights, nullptr, &_weights_reshaped); - weights_to_use = &_weights_reshaped; - } - - // Create tensor to store im2col reshaped inputs - if(!_skip_im2col) - { - _memory_group.manage(&_im2col_output); - - // Configure - _im2col_kernel.configure(input, &_im2col_output, Size2D(kernel_width, kernel_height), conv_info, false, dilation); - - // Update GEMM input - gemm_input_to_use = &_im2col_output; - } - - // Create temporary GEMM output tensor in case we cannot skip col2im - const DataType output_data_type = data_type == DataType::BFLOAT16 ? DataType::F32 : data_type; - if(!_skip_col2im) - { - TensorShape shape_gemm; - - // Calculate GEMM output shape - shape_gemm = _im2col_output.info()->tensor_shape(); - shape_gemm.set(0, mat_weights_cols); - shape_gemm.set(1, conv_w * conv_h); - - // FIXME: input->clone() doesn't work with subtensors for grouped convolutions. 
- TensorInfo info_gemm(shape_gemm, 1, output_data_type); - info_gemm.set_quantization_info(output->info()->quantization_info()).set_data_layout(input->info()->data_layout()); - _gemm_output.allocator()->init(info_gemm); - _memory_group.manage(&_gemm_output); - - // Update GEMM output - gemm_output_to_use = &_gemm_output; - } - - // Configure GEMM - // In case we need to skip col2im, GEMM3D (gemm_3d_depth != 0) must be called in order to avoid reshaping the output matrix - const unsigned int gemm_3d_depth = _skip_col2im ? conv_h : 0; - configure_mm(gemm_input_to_use, weights_to_use, biases, gemm_output_to_use, act_info, gemm_3d_depth); - - if(!_skip_im2col) - { - _im2col_output.allocator()->allocate(); - } - - if(!_skip_col2im) - { - if(_data_layout == DataLayout::NCHW) - { - // Configure col2im - _col2im_kernel.configure(gemm_output_to_use, output, Size2D(conv_w, conv_h)); - } - else - { - // Configure reshape layer - _reshape_layer.configure(gemm_output_to_use, output); - } - } - - if(_is_quantized && !_skip_col2im) - { - _tmp_output.allocator()->allocate(); - } - - if(!_skip_col2im || _is_quantized) - { - _gemm_output.allocator()->allocate(); - } - - ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(idx_width) != conv_w) || (output->info()->dimension(idx_height) != conv_h), - "Output shape does not match the expected one"); + return cpu::CpuGemmConv2d::validate(input, weights, biases, output, conv_info, weights_info, dilation, act_info, + enable_fast_math, num_groups); } -Status NEGEMMConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, unsigned int num_groups) +Status NEGEMMConvolutionLayer::has_opt_impl(arm_compute::WeightFormat &expected_weight_format, + const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info, + const bool enable_fast_math) { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights_info.are_reshaped(), "Weights already reshaped are not supported!"); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::BFLOAT16, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::BFLOAT16, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_groups > 1, "Grouping (num_groups != 1) is not supported on NEON"); - - const DataLayout data_layout = input->data_layout(); - const DataType data_type = input->data_type(); - const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); - const int idx_channel = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); - const int idx_kernels = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES); - - const unsigned int kernel_width = weights->dimension(idx_width); - const unsigned int kernel_height = weights->dimension(idx_height); - - TensorInfo 
im2col_reshaped_info{}; - TensorInfo info_gemm{}; - TensorInfo tmp_info{}; - TensorInfo weights_reshaped_info{}; - const ITensorInfo *gemm_input_to_use = input; - const ITensorInfo *gemm_output_to_use = output; - const ITensorInfo *weights_to_use = weights; - - const bool append_bias = false; - const bool is_quantized = is_data_type_quantized_asymmetric(data_type); - const bool is_bf16 = data_type == DataType::BFLOAT16; - bool skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv_info.stride().first == 1 && conv_info.stride().second == 1); - - // Get convolved dimensions - unsigned int conv_w = 0; - unsigned int conv_h = 0; - - std::tie(conv_w, conv_h) = scaled_dimensions(input->dimension(idx_width), - input->dimension(idx_height), - kernel_width, - kernel_height, - conv_info, - dilation); - - // Check if GEMM3D is supported - bool skip_col2im = false; - if(data_layout == DataLayout::NHWC) - { - skip_col2im = bool(validate_gemm3d(input, weights, act_info, conv_h, true)); - // If not supported, we need to perform im2col and col2im (or reshape layer) - if(!skip_col2im) - { - skip_im2col = false; - } - } - - if(skip_col2im) - { - // If not supported, we need to perform im2col and col2im (or reshape layer) - if(!bool(validate_gemm3d(input, weights, act_info, conv_h, skip_im2col))) - { - skip_im2col = false; - skip_col2im = false; - } - } - - ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_channel) != input->dimension(idx_channel)); - ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4); - - // Validate biases - if(biases != nullptr) - { - if(is_quantized) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32); - } - else if(is_bf16) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::F32); - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases); - } - ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(idx_kernels)); - ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1); - } - - unsigned int mat_weights_cols = weights->dimension(idx_kernels); - unsigned int mat_weights_rows = weights->dimension(idx_width) * weights->dimension(idx_height) * weights->dimension(idx_channel); - - // Output tensor auto inizialization if not yet initialized - ARM_COMPUTE_RETURN_ON_ERROR(NEConvolutionLayerReshapeWeights::validate(weights, nullptr, nullptr)); - weights_reshaped_info = TensorInfo(compute_weights_reshaped_shape(*weights, append_bias), 1, data_type); - weights_reshaped_info.set_quantization_info(weights->quantization_info()); - weights_to_use = &weights_reshaped_info; - - if(!skip_im2col) - { - // Create tensor info for im2col reshaped inputs - // For NEON the batch size is on the fourth dimension - // TODO (giaiod01): Auto-initialize the output shape of im2col COMPMID-1482 - TensorShape shape_im2col = input->tensor_shape(); - shape_im2col.set(0, mat_weights_rows); - shape_im2col.set(1, conv_w * conv_h); - shape_im2col.set(2, 1); - - im2col_reshaped_info = TensorInfo(shape_im2col, 1, data_type); - im2col_reshaped_info.set_quantization_info(input->quantization_info()); - - ARM_COMPUTE_RETURN_ON_ERROR(NEIm2ColKernel::validate(input, &im2col_reshaped_info, Size2D(kernel_width, kernel_height), conv_info, append_bias, dilation)); - gemm_input_to_use = &im2col_reshaped_info; - } - - // Create temporary GEMM output tensor in case we cannot skip col2im - const DataType output_data_type = data_type == DataType::BFLOAT16 ? 
DataType::F32 : data_type; - if(!skip_col2im) - { - TensorShape shape_gemm = gemm_input_to_use->tensor_shape(); - shape_gemm.set(0, mat_weights_cols); - shape_gemm.set(1, conv_w * conv_h); - info_gemm = TensorInfo(shape_gemm, 1, output_data_type); - } - else - { - info_gemm = TensorInfo(output->tensor_shape(), 1, output_data_type); - } - info_gemm.set_quantization_info(output->quantization_info()).set_data_layout(input->data_layout()); - gemm_output_to_use = &info_gemm; - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemm_input_to_use, weights_to_use, biases, gemm_output_to_use, act_info, skip_col2im ? conv_h : 0, skip_im2col)); - - // Validate Col2Im/ReshapeLayer - if(!skip_col2im && (data_layout == DataLayout::NCHW)) - { - ARM_COMPUTE_RETURN_ON_ERROR(NECol2ImKernel::validate(gemm_output_to_use, output, Size2D(conv_w, conv_h))); - } - - return Status{}; + return cpu::CpuGemmConv2d::has_opt_impl(expected_weight_format, src, weights, biases, dst, conv_info, weights_info, + dilation, act_info, enable_fast_math); } void NEGEMMConvolutionLayer::run() { prepare(); - - MemoryGroupResourceScope scope_mg(_memory_group); - - if(!_skip_im2col) - { - // Run input reshaping - unsigned int y_dim = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); - NEScheduler::get().schedule(&_im2col_kernel, y_dim); - } - - // Runs NEGEMM or NEGEMMLowpMatrixMultiplyCore functions - if(_is_quantized) - { - // Run gemmlowp - _mm_gemmlowp.run(); - } - else - { - // Run gemm - _mm_gemm.run(); - } - - // Reshape output matrix - if(!_skip_col2im) - { - if(_data_layout == DataLayout::NCHW) - { - NEScheduler::get().schedule(&_col2im_kernel, Window::DimY); - } - else - { - _reshape_layer.run(); - } - } + MemoryGroupResourceScope scope_mg(_impl->memory_group); + _impl->op->run(_impl->run_pack); } void NEGEMMConvolutionLayer::prepare() { - if(!_is_prepared) + if (!_impl->is_prepared) { - if(_weights_manager && _weights_manager->are_weights_managed(_original_weights)) - { - _weights_manager->run(_original_weights, &_reshape_weights_managed); - } - else - { - // Run weights reshaping and mark original weights tensor as unused - _weights_reshaped.allocator()->allocate(); - _reshape_weights.run(); - _original_weights->mark_as_unused(); - } - - // Prepare GEMM - _is_quantized ? _mm_gemmlowp.prepare() : _mm_gemm.prepare(); - if(!_weights_reshaped.is_used()) - { - _weights_reshaped.allocator()->free(); - } + _impl->op->prepare(_impl->run_pack); - _is_prepared = true; + // Release temporary tensors that are only used in prepare stage + release_temporaries<Tensor>(_impl->aux_mem_req, _impl->workspace_tensors); + _impl->is_prepared = true; } } } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEGEMMInterleave4x4.cpp b/src/runtime/NEON/functions/NEGEMMInterleave4x4.cpp deleted file mode 100644 index 5692beb3f7..0000000000 --- a/src/runtime/NEON/functions/NEGEMMInterleave4x4.cpp +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2017-2020 ARM Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/NEON/functions/NEGEMMInterleave4x4.h" - -#include "arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h" -#include "support/MemorySupport.h" - -namespace arm_compute -{ -void NEGEMMInterleave4x4::configure(const ITensor *input, ITensor *output) -{ - auto k = arm_compute::support::cpp14::make_unique<NEGEMMInterleave4x4Kernel>(); - k->configure(input, output); - _kernel = std::move(k); -} -} // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp deleted file mode 100644 index 087df19e20..0000000000 --- a/src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp +++ /dev/null @@ -1,139 +0,0 @@ -/* - * Copyright (c) 2017-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
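
Editor's note: one mechanical change repeated across every file in this series is the removal of support/MemorySupport.h. With the library now building as C++14 or later, the arm_compute::support::cpp14::make_unique polyfill is redundant and std::make_unique is used directly. A minimal before/after, with MyKernel as a hypothetical placeholder type:

#include <memory>

struct MyKernel
{
    void configure(int window_size) { (void)window_size; } // stand-in body
};

int main()
{
    // Before: auto k = arm_compute::support::cpp14::make_unique<MyKernel>();
    auto k = std::make_unique<MyKernel>(); // after: the standard C++14 facility
    k->configure(0);
    return 0;
}
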
- */ -#include "arm_compute/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h" -#include "arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h" -#include "arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" -#include "arm_compute/runtime/TensorAllocator.h" -#include "support/MemorySupport.h" - -using namespace arm_compute; - -NEGEMMLowpAssemblyMatrixMultiplyCore::NEGEMMLowpAssemblyMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(memory_manager), _asm_glue(memory_manager), _mm_kernel(nullptr), _mtx_a_reshape_kernel(nullptr), _mtx_b_reshape_kernel(nullptr), _tmp_a(), _tmp_b() -{ -} - -void NEGEMMLowpAssemblyMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *output) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::U8, DataType::S8); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U32, DataType::S32); - ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(a, b); - ARM_COMPUTE_ERROR_ON_MSG((a)->info()->dimension(0) != (b)->info()->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B"); - ARM_COMPUTE_ERROR_ON_MSG((a)->info()->dimension(1) != (output)->info()->dimension(1), "The output matrix must have the same number of rows as the matrix A"); - ARM_COMPUTE_ERROR_ON_MSG((b)->info()->dimension(0) != (output)->info()->dimension(0), "The output matrix must have the same number of columns as the matrix B"); - - bool run_optimised = false; - switch(a->info()->data_type()) - { - case DataType::S8: - case DataType::QASYMM8: - case DataType::U8: - { - _asm_glue.configure(a, b, c, output, GEMMInfo(false, false, true)); - run_optimised = _asm_glue.is_configured(); - break; - } - default: - { - ARM_COMPUTE_ERROR("Datatype not supported"); - break; - } - } - if(!run_optimised) - { - // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ] - TensorShape shape_tmp_a = a->info()->tensor_shape(); - shape_tmp_a.set(0, a->info()->dimension(0) * 4); - shape_tmp_a.set(1, std::ceil(a->info()->dimension(1) / 4.f)); - - // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ] - TensorShape shape_tmp_b = b->info()->tensor_shape(); - shape_tmp_b.set(0, b->info()->dimension(1) * 16); - shape_tmp_b.set(1, std::ceil(b->info()->dimension(0) / 16.f)); - - TensorInfo info_a(shape_tmp_a, 1, a->info()->data_type()); - TensorInfo info_b(shape_tmp_b, 1, b->info()->data_type()); - _tmp_a.allocator()->init(info_a); - _tmp_b.allocator()->init(info_b); - _memory_group.manage(&_tmp_a); - _memory_group.manage(&_tmp_b); - - // Configure interleave kernel - { - auto k = arm_compute::support::cpp14::make_unique<NEGEMMInterleave4x4Kernel>(); - k->configure(a, &_tmp_a); - _mtx_a_reshape_kernel = std::move(k); - } - - // Configure transpose kernel - { - auto k = arm_compute::support::cpp14::make_unique<NEGEMMTranspose1xWKernel>(); - k->configure(b, &_tmp_b); - _mtx_b_reshape_kernel = std::move(k); - } - - // Configure matrix multiply kernel - { - auto k = 
arm_compute::support::cpp14::make_unique<NEGEMMLowpMatrixMultiplyKernel>(); - k->configure(&_tmp_a, &_tmp_b, output); - _mm_kernel = std::move(k); - } - - // Allocate tensors - _tmp_a.allocator()->allocate(); - _tmp_b.allocator()->allocate(); - } -} - -void NEGEMMLowpAssemblyMatrixMultiplyCore::run() -{ - MemoryGroupResourceScope scope_mg(_memory_group); - if(_mtx_a_reshape_kernel) - { - NEScheduler::get().schedule(_mtx_a_reshape_kernel.get(), Window::DimY); - } - - if(_mtx_b_reshape_kernel) - { - NEScheduler::get().schedule(_mtx_b_reshape_kernel.get(), Window::DimY); - } - - if(_asm_glue.is_configured()) - { - _asm_glue.run(); - } - else - { - NEScheduler::get().schedule(_mm_kernel.get(), Window::DimY); - } -} diff --git a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp index 993907c29a..44bfc6a51e 100644 --- a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp +++ b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 ARM Limited. + * Copyright (c) 2017-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,564 +23,109 @@ */ #include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" -#include "arm_compute/core/KernelDescriptors.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/runtime/IWeightsManager.h" +#include "arm_compute/runtime/MemoryGroup.h" #include "arm_compute/runtime/NEON/NEScheduler.h" -#include "arm_compute/runtime/TensorAllocator.h" -#include "support/MemorySupport.h" +#include "arm_compute/runtime/Tensor.h" + +#include "src/core/helpers/MemoryHelpers.h" +#include "src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h" + +using namespace arm_compute::experimental; namespace arm_compute { -using namespace arm_compute::misc::shape_calculator; - -NEGEMMLowpMatrixMultiplyCore::NEGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager *weights_manager) - : _memory_group(memory_manager), _weights_manager(weights_manager), _asm_glue(memory_manager, weights_manager), _mm_kernel(), _mtx_a_reshape_kernel(), _mtx_b_reshape_kernel(), - _mtx_a_reduction_kernel(), _mtx_b_reduction_kernel(), _offset_contribution_kernel(), _offset_contribution_output_stage_kernel(), _activation_func(), _convert_to_signed_asymm(), - _convert_from_signed_asymm(), _vector_sum_col(), _vector_sum_row(), _tmp_a(), _tmp_b(), _mm_result_s32(), _signed_a(), _signed_output(), _original_b(nullptr), _a_offset(0), _b_offset(0), - _run_vector_matrix_multiplication(false), _assembly_path(false), _fused_assembly_path(false), _reshape_b_only_on_first_run(false), _is_prepared(false), _fuse_output_stage(false), - _run_activation(false), _flip_signedness(false) +struct NEGEMMLowpMatrixMultiplyCore::Impl { + const ITensor *b{nullptr}; + std::unique_ptr<cpu::CpuGemmLowpMatrixMultiplyCore> op{nullptr}; + ITensorPack run_pack{}; + ITensorPack prep_pack{}; + MemoryGroup memory_group{}; + IWeightsManager *weights_manager{nullptr}; + MemoryRequirements aux_mem_req{}; + WorkspaceData<Tensor> workspace_tensors{}; + bool is_prepared{false}; +}; + +NEGEMMLowpMatrixMultiplyCore::NEGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager, + IWeightsManager 
*weights_manager) + : _impl(std::make_unique<Impl>()) +{ + _impl->weights_manager = weights_manager; + _impl->memory_group = MemoryGroup(memory_manager); } +NEGEMMLowpMatrixMultiplyCore::~NEGEMMLowpMatrixMultiplyCore() = default; -void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *output, const GEMMInfo &gemm_info) +void NEGEMMLowpMatrixMultiplyCore::configure( + const ITensor *a, const ITensor *b, const ITensor *c, ITensor *output, const GEMMInfo &gemm_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output); - ARM_COMPUTE_UNUSED(c); - ARM_COMPUTE_ERROR_THROW_ON(NEGEMMLowpMatrixMultiplyCore::validate(a->info(), b->info(), c != nullptr ? c->info() : nullptr, output->info(), gemm_info)); - - const ITensor *matrix_a = a; - const ITensor *matrix_b = b; - GEMMInfo info = gemm_info; - - // Set internal variables - _a_offset = a->info()->quantization_info().uniform().offset; - _b_offset = b->info()->quantization_info().uniform().offset; - _run_vector_matrix_multiplication = a->info()->dimension(1) < 2; - _reshape_b_only_on_first_run = info.reshape_b_only_on_first_run(); - _is_prepared = false; - _fused_assembly_path = false; - _flip_signedness = is_data_type_quantized_per_channel(b->info()->data_type()) && (a->info()->data_type() == DataType::QASYMM8) && _reshape_b_only_on_first_run; - _original_b = b; - - const ITensor *a_to_use = a; - - // Convert to QASYMM8 -> QASYMM8_SIGNED and back - if(_flip_signedness) - { - const int32_t offset_correction = 128; - const DataType dt = DataType::QASYMM8_SIGNED; - const UniformQuantizationInfo iqinfo = a_to_use->info()->quantization_info().uniform(); - - _signed_a.allocator()->init(a_to_use->info()->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(iqinfo.scale, iqinfo.offset + offset_correction))); - _memory_group.manage(&_signed_a); - _convert_to_signed_asymm.configure(a_to_use, &_signed_a); - a_to_use = &_signed_a; - _a_offset = _signed_a.info()->quantization_info().uniform().offset; - - const UniformQuantizationInfo oqinfo = output->info()->quantization_info().uniform(); - _memory_group.manage(&_signed_output); - _signed_output.allocator()->init(output->info()->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(oqinfo.scale, oqinfo.offset - offset_correction))); - - // Output stage correction - GEMMLowpOutputStageInfo output_stage_corr = info.gemmlowp_output_stage(); - output_stage_corr.gemmlowp_offset = _signed_output.info()->quantization_info().uniform().offset; - output_stage_corr.gemmlowp_min_bound -= offset_correction; - output_stage_corr.gemmlowp_max_bound -= offset_correction; - info.set_gemmlowp_output_stage(output_stage_corr); - - // Update matrix a - matrix_a = &_signed_a; - } - - // If GEMMLowpOutputStage != NONE, fuse the offset contribution with the output stage - if(info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE) - { - _fuse_output_stage = true; - _memory_group.manage(&_mm_result_s32); - TensorInfo info_mm_result_s32(output->info()->tensor_shape(), 1, DataType::S32); - _mm_result_s32.allocator()->init(info_mm_result_s32); - } - -#ifdef __aarch64__ - switch(a->info()->data_type()) - { - case DataType::QASYMM8: - case DataType::QASYMM8_SIGNED: - case DataType::U8: - case DataType::S8: - { - if(is_data_type_quantized_asymmetric(a_to_use->info()->data_type()) && info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) - { - _asm_glue.configure(a_to_use, b, c, output, gemm_info); - _fused_assembly_path = 
_asm_glue.is_configured(); - } - else - { - _asm_glue.configure(a_to_use, b, nullptr, _fuse_output_stage ? &_mm_result_s32 : output, gemm_info); - } - _assembly_path = _asm_glue.is_configured(); - break; - } - default: - { - ARM_COMPUTE_ERROR("Datatype not supported"); - break; - } - } -#endif /* __aarch64__ */ - if(!(_assembly_path || _run_vector_matrix_multiplication)) - { - matrix_a = &_tmp_a; - matrix_b = &_tmp_b; - // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ] - TensorInfo a_info(compute_interleaved_shape(*a_to_use->info()), 1, a_to_use->info()->data_type(), a_to_use->info()->quantization_info()); - // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ] - TensorInfo b_info(compute_transpose1xW_shape(*b->info()), 1, b->info()->data_type(), b->info()->quantization_info()); - _tmp_a.allocator()->init(a_info); - _tmp_b.allocator()->init(b_info); - _memory_group.manage(&_tmp_a); - if(!_reshape_b_only_on_first_run) - { - _memory_group.manage(&_tmp_b); - } - - // Configure interleave kernel - _mtx_a_reshape_kernel.configure(a_to_use, &_tmp_a); - - // Configure transpose kernel - _mtx_b_reshape_kernel.configure(b, &_tmp_b); - } - - if(!_fused_assembly_path) - { - // Build reduction info - const GEMMLowpReductionKernelInfo reduction_info(a_to_use->info()->dimension(0), false, 0, false); - - // Initialize matrix B reduction kernel only if _a_offset is not equal to 0 - if(_a_offset != 0) - { - TensorInfo info_vector_sum_col(compute_reductionA_shape(*b->info()), 1, DataType::S32); - - _vector_sum_col.allocator()->init(info_vector_sum_col); - if(!_reshape_b_only_on_first_run) - { - _memory_group.manage(&_vector_sum_col); - } - - // Configure Matrix B reduction kernel - _mtx_b_reduction_kernel.configure(b, &_vector_sum_col, reduction_info); - } - - // Initialize Matrix A reduction kernel only if _b_offset is not equal to 0 - if(_b_offset != 0) - { - TensorInfo info_vector_sum_row(compute_reductionB_shape(*a_to_use->info()), 1, DataType::S32); - - _vector_sum_row.allocator()->init(info_vector_sum_row); - _memory_group.manage(&_vector_sum_row); - - // Configure matrix A reduction kernel - _mtx_a_reduction_kernel.configure(a_to_use, &_vector_sum_row, reduction_info); - } - - if(_fuse_output_stage) - { - // Configure matrix multiply kernel - if(!_assembly_path) - { - _mm_kernel.configure(matrix_a, matrix_b, &_mm_result_s32); - } - - _offset_contribution_output_stage_kernel.configure(&_mm_result_s32, - _a_offset == 0 ? nullptr : &_vector_sum_col, - _b_offset == 0 ? nullptr : &_vector_sum_row, c, - _flip_signedness ? &_signed_output : output, - a->info()->dimension(0), - _a_offset, _b_offset, info.gemmlowp_output_stage()); - - if(_flip_signedness) - { - _convert_from_signed_asymm.configure(&_signed_output, output); - } - } - else - { - // Configure matrix multiply kernel - if(!_assembly_path) - { - _mm_kernel.configure(matrix_a, matrix_b, output); - } - // Configure offset contribution kernel - _offset_contribution_kernel.configure(output, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? 
nullptr : &_vector_sum_row, a_to_use->info()->dimension(0), _a_offset, _b_offset); - } - - // Configure activation - const ActivationLayerInfo &activation = gemm_info.activation_info(); - _run_activation = activation.enabled() && (!_assembly_path || (_assembly_path && !NEGEMMAssemblyDispatch::is_activation_supported(activation))); - if(_run_activation) - { - _activation_func.configure(output, nullptr, activation); - } - } - - // Allocate tensors - if(!_assembly_path && !_run_vector_matrix_multiplication) - { - _tmp_a.allocator()->allocate(); - if(!_reshape_b_only_on_first_run) - { - _tmp_b.allocator()->allocate(); - } - } - - if(!_fused_assembly_path) - { - if(_a_offset != 0 && !_reshape_b_only_on_first_run) - { - _vector_sum_col.allocator()->allocate(); - } - - if(_b_offset != 0) - { - _vector_sum_row.allocator()->allocate(); - } - } - - if(_fuse_output_stage) - { - _mm_result_s32.allocator()->allocate(); - } - - if(_flip_signedness) - { - _signed_a.allocator()->allocate(); - _signed_output.allocator()->allocate(); - } + // Make the B matrix dynamic values. + auto b_info_to_use = b->info()->clone(); + if (!gemm_info.reshape_b_only_on_first_run()) + { + b_info_to_use->set_are_values_constant(false); + } + + _impl->b = b; + _impl->op = std::make_unique<cpu::CpuGemmLowpMatrixMultiplyCore>(); + _impl->op->configure(a->info(), b_info_to_use.get(), (c != nullptr ? c->info() : nullptr), output->info(), + gemm_info); + _impl->run_pack = {{TensorType::ACL_SRC_0, a}, + {TensorType::ACL_SRC_1, b}, + {TensorType::ACL_SRC_2, c}, + {TensorType::ACL_DST, output}}; + _impl->prep_pack = {{TensorType::ACL_SRC_1, b}, {TensorType::ACL_SRC_2, c}}; + _impl->aux_mem_req = _impl->op->workspace(); + _impl->workspace_tensors = + manage_workspace<Tensor>(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack); } -Status NEGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, const GEMMInfo &gemm_info) +Status NEGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + const GEMMInfo &gemm_info) { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32, DataType::QASYMM8, DataType::QASYMM8_SIGNED); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(c != nullptr && gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::NONE, "Bias addition not supported in NEGEMMLowpMatrixMultiplyCore for output S32"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((a)->dimension(0) != (b)->dimension(1), - "The product AB is defined only if the number of columns in A is equal to the number of rows in B"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported"); - - GEMMInfo info = gemm_info; - const ITensorInfo *matrix_a_info = a; - const ITensorInfo *matrix_b_info = b; - - const ITensorInfo *a_to_use = a; - - TensorInfo tmp_a_info{}; - TensorInfo tmp_b_info{}; - TensorInfo mm_result_s32_info{}; - - int32_t a_offset = a->quantization_info().uniform().offset; - int32_t b_offset = b->quantization_info().uniform().offset; - - bool 
fuse_output_stage = info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE; - if(fuse_output_stage) + // Make the B matrix dynamic values. + auto b_info_to_use = b->clone(); + if (!gemm_info.reshape_b_only_on_first_run()) { - auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(output->tensor_shape()).set_data_type(DataType::S32)); + b_info_to_use->set_are_values_constant(false); } - // Convert QASYMM8->QASYMM8_SIGNED - TensorInfo signed_a{}; - TensorInfo signed_output{}; - bool flip_signedness = is_data_type_quantized_per_channel(b->data_type()) && (a->data_type() == DataType::QASYMM8) && info.reshape_b_only_on_first_run(); - if(flip_signedness) - { - const int32_t offset_correction = 128; - const DataType dt = DataType::QASYMM8_SIGNED; - const UniformQuantizationInfo iqinfo = a_to_use->quantization_info().uniform(); - - signed_a = a_to_use->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(iqinfo.scale, iqinfo.offset + offset_correction)); - ARM_COMPUTE_RETURN_ON_ERROR(NEConvertQuantizedSignednessKernel::validate(a_to_use, &signed_a)); - a_to_use = &signed_a; - a_offset = signed_a.quantization_info().uniform().offset; - - const UniformQuantizationInfo oqinfo = output->quantization_info().uniform(); - signed_output = output->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(oqinfo.scale, oqinfo.offset - offset_correction)); - - // Output stage correction - GEMMLowpOutputStageInfo output_stage_corr = info.gemmlowp_output_stage(); - output_stage_corr.gemmlowp_offset = signed_output.quantization_info().uniform().offset; - output_stage_corr.gemmlowp_min_bound -= offset_correction; - output_stage_corr.gemmlowp_max_bound -= offset_correction; - info.set_gemmlowp_output_stage(output_stage_corr); - - // Update matrix a - matrix_a_info = &signed_a; - } - - // Check if we need to run the optimized assembly kernel - bool run_optimised = false; - bool run_optimised_requantized = false; - if(a_to_use->data_type() == DataType::QASYMM8 && info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) - { - run_optimised = bool(NEGEMMAssemblyDispatch::validate(a_to_use, b, c, output, gemm_info)); - run_optimised_requantized = run_optimised; - - const UniformQuantizationInfo a_qinfo = a_to_use->quantization_info().uniform(); - const QuantizationInfo b_qinfo = b->quantization_info(); - const UniformQuantizationInfo output_qinfo = output->quantization_info().uniform(); - for(auto const s : b_qinfo.scale()) - { - const float fmultipler = a_qinfo.scale * s / output_qinfo.scale; - if(fmultipler > 1.f) - { - run_optimised_requantized = false; - break; - } - } - } - else - { - run_optimised = bool(NEGEMMAssemblyDispatch::validate(a_to_use, b, nullptr, fuse_output_stage ? 
&mm_result_s32_info : output, gemm_info)); - } - - if(run_optimised) - { - ARM_COMPUTE_RETURN_ERROR_ON(b->dimension(0) != output->dimension(0)); - if(info.depth_output_gemm3d() != 0) - { - if(info.reinterpret_input_as_3d()) - { - ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1)); - ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(2) != output->dimension(2)); - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1) * output->dimension(2)); - } - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1)); - } - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.reinterpret_input_as_3d(), "NEGEMM cannot reinterpret the input tensor as 3D"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.depth_output_gemm3d() != 0, "NEGEMM cannot reinterpret the output tensor as 3D"); - - const bool run_vector_matrix_multiplication = a->dimension(1) < 2; - if(!run_vector_matrix_multiplication) - { - matrix_a_info = &tmp_a_info; - matrix_b_info = &tmp_b_info; - - // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ] - TensorShape shape_tmp_a = a->tensor_shape(); - shape_tmp_a.set(0, a->dimension(0) * 4); - shape_tmp_a.set(1, std::ceil(a->dimension(1) / 4.f)); - - // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ] - TensorShape shape_tmp_b = b->tensor_shape(); - shape_tmp_b.set(0, b->dimension(1) * 16); - shape_tmp_b.set(1, std::ceil(b->dimension(0) / 16.f)); - - // Validate interleave kernel - auto_init_if_empty(tmp_a_info, a_to_use->clone()->set_tensor_shape(shape_tmp_a)); - auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(shape_tmp_b)); - - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMInterleave4x4Kernel::validate(a_to_use, &tmp_a_info)); - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMTranspose1xWKernel::validate(b, &tmp_b_info)); - } - } - - if(!run_optimised_requantized) - { - TensorInfo info_vector_sum_col{}; - TensorInfo info_vector_sum_row{}; - - const GEMMLowpReductionKernelInfo reduction_info(a_to_use->dimension(0), false, 0, false); - - // Validate matrix B reduction kernel only if _a_offset is not equal to 0 - if(a_offset != 0) - { - info_vector_sum_col = TensorInfo(compute_reductionA_shape(*b), 1, DataType::S32); - - // Configure Matrix B reduction kernel - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixBReductionKernel::validate(b, &info_vector_sum_col, reduction_info)); - } - - // Validate Matrix A reduction kernel only if _b_offset is not equal to 0 - if(b_offset != 0) - { - info_vector_sum_row = TensorInfo(compute_reductionB_shape(*a), 1, DataType::S32); - - // Configure matrix A reduction kernel - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(a_to_use, &info_vector_sum_row, reduction_info)); - } - - if(fuse_output_stage) - { - if(!run_optimised) - { - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info)); - } - - // Validate offset contribution kernel - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOffsetContributionOutputStageKernel::validate(&mm_result_s32_info, - a_offset == 0 ? nullptr : &info_vector_sum_col, - b_offset == 0 ? nullptr : &info_vector_sum_row, - c, - flip_signedness ? 
&signed_output : output, - a_offset, b_offset, - info.gemmlowp_output_stage())); - } - else - { - if(!run_optimised) - { - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, output)); - } - // Validate offset contribution kernel - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOffsetContributionKernel::validate(output, - a_offset == 0 ? nullptr : &info_vector_sum_col, - b_offset == 0 ? nullptr : &info_vector_sum_row, - a_offset, b_offset)); - } - } - - // Validate activation - const ActivationLayerInfo &activation = gemm_info.activation_info(); - if(activation.enabled()) - { - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(output, nullptr, activation)); - } - - return Status{}; + return cpu::CpuGemmLowpMatrixMultiplyCore::validate(a, b_info_to_use.get(), c, output, gemm_info); } void NEGEMMLowpMatrixMultiplyCore::run() { prepare(); - - MemoryGroupResourceScope scope_mg(_memory_group); - - // Convert QASYMM8->QASYMM8_SIGNED - if(_flip_signedness) - { - NEScheduler::get().schedule(&_convert_to_signed_asymm, Window::DimY); - } - - // Run GEMM - if(_asm_glue.is_configured()) - { - _asm_glue.run(); - } - else - { - if(!_run_vector_matrix_multiplication) - { - // Run interleave kernel - NEScheduler::get().schedule(&_mtx_a_reshape_kernel, Window::DimY); - - if(!_reshape_b_only_on_first_run) - { - // Run transpose kernel - NEScheduler::get().schedule(&_mtx_b_reshape_kernel, Window::DimY); - } - } - NEScheduler::get().schedule(&_mm_kernel, Window::DimY); - } - - if(!_fused_assembly_path) - { - // Run matrix A reduction kernel only if _b_offset is not equal to 0 - if(_b_offset != 0) - { - NEScheduler::get().schedule(&_mtx_a_reduction_kernel, Window::DimX); - } - - // Run matrix B reduction kernel only if _a_offset is not equal to 0 - if(_a_offset != 0 && !_reshape_b_only_on_first_run) - { - NEScheduler::get().schedule(&_mtx_b_reduction_kernel, Window::DimX); - } - - if(_fuse_output_stage) - { - // Run offset contribution kernel - NEScheduler::get().schedule(&_offset_contribution_output_stage_kernel, Window::DimY); - } - else - { - // Run offset contribution kernel - NEScheduler::get().schedule(&_offset_contribution_kernel, Window::DimY); - } - } - - // Convert QASYMM8_SIGNED->QASYMM8 - if(_flip_signedness) - { - NEScheduler::get().schedule(&_convert_from_signed_asymm, Window::DimY); - } - - // Run fused activation unless already run in the fused assembly - if(_run_activation && !_fused_assembly_path) - { - _activation_func.run(); - } + MemoryGroupResourceScope scope_mg(_impl->memory_group); + _impl->op->run(_impl->run_pack); } void NEGEMMLowpMatrixMultiplyCore::prepare() { - if(!_is_prepared) + if (!_impl->is_prepared) { - const bool original_b_managed_by_weights_manager = _weights_manager && _weights_manager->are_weights_managed(_original_b); - // Run assembly reshape - if(_asm_glue.is_configured()) - { - if(!original_b_managed_by_weights_manager) - { - ARM_COMPUTE_ERROR_ON(!_original_b->is_used()); - } + _impl->op->prepare(_impl->prep_pack); - _asm_glue.prepare(); - if(!original_b_managed_by_weights_manager) - { - _original_b->mark_as_unused(); - } - } - // Run non-assembly reshape - else if(_reshape_b_only_on_first_run && !_run_vector_matrix_multiplication && !_asm_glue.is_configured()) - { - if(!original_b_managed_by_weights_manager) - { - ARM_COMPUTE_ERROR_ON(!_original_b->is_used()); - } - - // Run reshape kernel and mark original weights tensor as unused - _tmp_b.allocator()->allocate(); - NEScheduler::get().schedule(&_mtx_b_reshape_kernel, 
Window::DimY); - if(!original_b_managed_by_weights_manager) - { - _original_b->mark_as_unused(); - } - } + auto has_reshape = + std::find_if(_impl->aux_mem_req.begin(), _impl->aux_mem_req.end(), + [](const MemoryInfo &m) -> bool { return m.lifetime == MemoryLifetime::Persistent; }); - // Run matrix B reduction kernel only if _a_offset is not equal to 0 - if(!_fused_assembly_path && _a_offset != 0 && _reshape_b_only_on_first_run) + if (has_reshape != std::end(_impl->aux_mem_req)) { - _vector_sum_col.allocator()->allocate(); - NEScheduler::get().schedule(&_mtx_b_reduction_kernel, Window::DimX); + _impl->b->mark_as_unused(); } - _is_prepared = true; + // Release temporary tensors that are only used in prepare stage + release_temporaries<Tensor>(_impl->aux_mem_req, _impl->workspace_tensors); + _impl->is_prepared = true; } } } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp b/src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp index 43ca7b3fbb..8178003b5e 100644 --- a/src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp +++ b/src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 ARM Limited. + * Copyright (c) 2017-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,179 +24,55 @@ #include "arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h" #include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ScaleKernel.h" -#include "arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h" -#include "arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h" -#include "arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h" #include "arm_compute/core/Validate.h" -#include "support/MemorySupport.h" -namespace arm_compute -{ -void NEGEMMLowpQuantizeDownInt32ToUint8Scale::configure(const ITensor *input, const ITensor *bias, ITensor *output, int result_offset, int result_mult_int, int result_shift, int min, int max) -{ - GEMMLowpOutputStageInfo info = GEMMLowpOutputStageInfo(); - info.gemmlowp_offset = result_offset; - info.gemmlowp_multiplier = result_mult_int; - info.gemmlowp_shift = result_shift; - info.gemmlowp_min_bound = min; - info.gemmlowp_max_bound = max; - - auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpQuantizeDownInt32ScaleKernel>(); - k->configure(input, bias, output, &info); - _kernel = std::move(k); -} - -Status NEGEMMLowpQuantizeDownInt32ToUint8Scale::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min, int max) -{ - GEMMLowpOutputStageInfo info = GEMMLowpOutputStageInfo(); - info.gemmlowp_min_bound = min; - info.gemmlowp_max_bound = max; +#include "src/cpu/operators/CpuGemmLowpOutputStage.h" - return NEGEMMLowpQuantizeDownInt32ScaleKernel::validate(input, bias, output, &info); -} - -void NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::configure(const ITensor *input, const ITensor *bias, ITensor *output, int result_fixedpoint_multiplier, int result_shift, - int result_offset_after_shift, int min, int max) -{ - auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel>(); - k->configure(input, bias, output, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, min, max); - _kernel = std::move(k); -} - -Status NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::validate(const ITensorInfo *input, const ITensorInfo *bias, 
const ITensorInfo *output, int min, int max) -{ - return NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::validate(input, bias, output, min, max); -} - -void NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint::configure(const ITensor *input, const ITensor *bias, ITensor *output, int result_fixedpoint_multiplier, int result_shift, - int result_offset_after_shift, int min, int max) +namespace arm_compute { - auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel>(); - k->configure(input, bias, output, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, min, max); - _kernel = std::move(k); -} - -Status NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min, int max) +struct NEGEMMLowpOutputStage::Impl { - return NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::validate(input, bias, output, min, max); -} - -void NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint::configure(const ITensor *input, const ITensor *bias, ITensor *output, int result_fixedpoint_multiplier, int result_shift, int min, int max) + const ITensor *src{nullptr}; + const ITensor *bias{nullptr}; + ITensor *dst{nullptr}; + ITensorPack run_pack{}; + std::unique_ptr<cpu::CpuGemmLowpOutputStage> op{nullptr}; +}; + +NEGEMMLowpOutputStage::NEGEMMLowpOutputStage() : _impl(std::make_unique<Impl>()) { - auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel>(); - k->configure(input, bias, output, result_fixedpoint_multiplier, result_shift, min, max); - _kernel = std::move(k); } +NEGEMMLowpOutputStage::~NEGEMMLowpOutputStage() = default; -Status NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min, int max) -{ - return NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::validate(input, bias, output, min, max); -} - -void NEGEMMLowpOutputStage::configure(const ITensor *input, const ITensor *bias, ITensor *output, const GEMMLowpOutputStageInfo &info) +void NEGEMMLowpOutputStage::configure(const ITensor *input, + const ITensor *bias, + ITensor *output, + const GEMMLowpOutputStageInfo &info) { // Perform validate step ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_ERROR_THROW_ON(NEGEMMLowpOutputStage::validate(input->info(), bias != nullptr ? 
bias->info() : nullptr, output->info(), info)); - - switch(info.type) - { - case GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT: - { - switch(info.output_data_type) - { - case DataType::QASYMM8: - { - auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel>(); - k->configure(input, bias, output, info.gemmlowp_multiplier, info.gemmlowp_shift, info.gemmlowp_offset, info.gemmlowp_min_bound, info.gemmlowp_max_bound); - _kernel = std::move(k); - break; - } - case DataType::QASYMM8_SIGNED: - { - auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel>(); - k->configure(input, bias, output, info.gemmlowp_multiplier, info.gemmlowp_shift, info.gemmlowp_offset, info.gemmlowp_min_bound, info.gemmlowp_max_bound); - _kernel = std::move(k); - break; - } - case DataType::QSYMM16: - { - auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel>(); - k->configure(input, bias, output, info.gemmlowp_multiplier, info.gemmlowp_shift, info.gemmlowp_min_bound, info.gemmlowp_max_bound); - _kernel = std::move(k); - break; - } - default: - { - ARM_COMPUTE_ERROR("Unsupported output data type."); - break; - } - } - break; - } - case GEMMLowpOutputStageType::QUANTIZE_DOWN: - { - switch(info.output_data_type) - { - case DataType::QASYMM8: - case DataType::QASYMM8_SIGNED: - { - auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpQuantizeDownInt32ScaleKernel>(); - k->configure(input, bias, output, &info); - _kernel = std::move(k); - break; - } - default: - { - ARM_COMPUTE_ERROR("Unsupported output data type."); - break; - } - } - break; - } - default: - ARM_COMPUTE_ERROR("Unsupported GEMMLowpOutputStage type."); - } + ARM_COMPUTE_ERROR_THROW_ON( + NEGEMMLowpOutputStage::validate(input->info(), bias != nullptr ? bias->info() : nullptr, output->info(), info)); + _impl->src = input; + _impl->bias = bias; + _impl->dst = output; + _impl->op = std::make_unique<cpu::CpuGemmLowpOutputStage>(); + _impl->op->configure(input->info(), (bias == nullptr) ? 
nullptr : bias->info(), output->info(), info); + + _impl->run_pack = { + {TensorType::ACL_SRC, _impl->src}, {TensorType::ACL_BIAS, _impl->bias}, {TensorType::ACL_DST, _impl->dst}}; } -Status NEGEMMLowpOutputStage::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo &info) +Status NEGEMMLowpOutputStage::validate(const ITensorInfo *input, + const ITensorInfo *bias, + const ITensorInfo *output, + const GEMMLowpOutputStageInfo &info) { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() == DataType::UNKNOWN, "NEGEMMLowpQuantizeDownScaleByFixedPoint cannot be used with UNKNOWN output data type."); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM16); - - ARM_COMPUTE_RETURN_ERROR_ON((info.type != GEMMLowpOutputStageType::QUANTIZE_DOWN) && (info.type != GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)); + return cpu::CpuGemmLowpOutputStage::validate(input, bias, output, info); +} - switch(info.type) - { - case GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT: - { - switch(output->data_type()) - { - case DataType::QASYMM8: - return NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::validate(input, bias, output, info.gemmlowp_min_bound, info.gemmlowp_max_bound); - case DataType::QASYMM8_SIGNED: - return NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::validate(input, bias, output, info.gemmlowp_min_bound, info.gemmlowp_max_bound); - case DataType::QSYMM16: - return NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::validate(input, bias, output, info.gemmlowp_min_bound, info.gemmlowp_max_bound); - default: - return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Unsupported output data type."); - } - } - case GEMMLowpOutputStageType::QUANTIZE_DOWN: - { - switch(output->data_type()) - { - case DataType::QASYMM8: - case DataType::QASYMM8_SIGNED: - return NEGEMMLowpQuantizeDownInt32ScaleKernel::validate(input, bias, output, &info); - default: - return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Unsupported output data type."); - } - } - default: - return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Unsupported GEMMLowpOutputStage type."); - } +void NEGEMMLowpOutputStage::run() +{ + _impl->op->run(_impl->run_pack); } } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEGEMMTranspose1xW.cpp b/src/runtime/NEON/functions/NEGEMMTranspose1xW.cpp deleted file mode 100644 index c7a50a8932..0000000000 --- a/src/runtime/NEON/functions/NEGEMMTranspose1xW.cpp +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Copyright (c) 2017-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
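
Editor's note: the rewritten runtime functions in this series (NEGEMMLowpMatrixMultiplyCore and NEGEMMLowpOutputStage above, NEGEMMConvolutionLayer earlier) all share one shape: a hidden Impl struct owns a stateless cpu:: operator plus an ITensorPack that binds ACL_SRC/ACL_BIAS/ACL_DST role tags to concrete tensors, and run() simply forwards the pack. A hedged sketch of that wiring with simplified stand-in types (none of these are the real ACL classes):

#include <cassert>
#include <map>
#include <memory>

enum class Role { Src, Bias, Dst };        // stands in for TensorType::ACL_*
using TensorPack = std::map<Role, void *>; // stands in for ITensorPack

struct CpuOpStub // stands in for e.g. cpu::CpuGemmLowpOutputStage
{
    void run(const TensorPack &pack)
    {
        assert(pack.count(Role::Src) != 0 && pack.count(Role::Dst) != 0);
        // kernel dispatch would happen here: read Src (+ optional Bias), write Dst
    }
};

struct FunctionFacade // stands in for the NE* runtime function
{
    struct Impl
    {
        std::unique_ptr<CpuOpStub> op;
        TensorPack                 run_pack;
    };
    std::unique_ptr<Impl> impl = std::make_unique<Impl>();

    void configure(void *src, void *bias, void *dst)
    {
        impl->op       = std::make_unique<CpuOpStub>();
        impl->run_pack = {{Role::Src, src}, {Role::Bias, bias}, {Role::Dst, dst}};
    }
    void run() { impl->op->run(impl->run_pack); } // no per-call state, just forwarding
};

The payoff of the pattern is visible above: the function keeps no kernels or intermediate tensors of its own, so the header loses all private members except the Impl pointer.
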
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/NEON/functions/NEGEMMTranspose1xW.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" -#include "support/MemorySupport.h" - -namespace arm_compute -{ -void NEGEMMTranspose1xW::configure(const ITensor *input, ITensor *output) -{ - auto k = arm_compute::support::cpp14::make_unique<NEGEMMTranspose1xWKernel>(); - k->configure(input, output); - _kernel = std::move(k); -} -Status NEGEMMTranspose1xW::validate(const ITensorInfo *input, const ITensorInfo *output) -{ - return NEGEMMTranspose1xWKernel::validate(input, output); -} -} // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEGather.cpp b/src/runtime/NEON/functions/NEGather.cpp index cad42a3417..62b8cfa48b 100644 --- a/src/runtime/NEON/functions/NEGather.cpp +++ b/src/runtime/NEON/functions/NEGather.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 ARM Limited. + * Copyright (c) 2019-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,8 +23,8 @@ */ #include "arm_compute/runtime/NEON/functions/NEGather.h" -#include "arm_compute/core/NEON/kernels/NEGatherKernel.h" -#include "support/MemorySupport.h" +#include "src/common/utils/Log.h" +#include "src/core/NEON/kernels/NEGatherKernel.h" #include <utility> @@ -32,7 +32,8 @@ namespace arm_compute { void NEGather::configure(const ITensor *input, const ITensor *indices, ITensor *output, int axis) { - auto k = arm_compute::support::cpp14::make_unique<NEGatherKernel>(); + ARM_COMPUTE_LOG_PARAMS(input, indices, output, axis); + auto k = std::make_unique<NEGatherKernel>(); k->configure(input, indices, output, axis); _kernel = std::move(k); } diff --git a/src/runtime/NEON/functions/NEGaussian3x3.cpp b/src/runtime/NEON/functions/NEGaussian3x3.cpp deleted file mode 100644 index 399d19ddde..0000000000 --- a/src/runtime/NEON/functions/NEGaussian3x3.cpp +++ /dev/null @@ -1,40 +0,0 @@ -/* - * Copyright (c) 2016-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
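
Editor's note: for context on what the deleted wrapper did, the "transpose 1xW" packing rearranges B so GEMM reads it with unit stride; the packed shape [b_height * 16, ceil(b_width / 16.0f)] is quoted for 8-bit data in the deleted NEGEMMLowpAssemblyMatrixMultiplyCore above. The indexing below is a reconstruction under that shape, not library code:

#include <cstddef>
#include <cstdint>
#include <vector>

// Reconstruction (assumption): packed row j concatenates, for every source
// row i, the W-wide chunk of B's columns [j*W, j*W + W); columns past the
// matrix edge are zero padding. W = 16 matches the 8-bit case above.
std::vector<uint8_t> transpose_1xW(const std::vector<uint8_t> &b,
                                   size_t h, size_t w, size_t W = 16)
{
    const size_t blocks = (w + W - 1) / W; // ceil(w / W) packed rows
    std::vector<uint8_t> out(blocks * h * W, 0);
    for (size_t j = 0; j < blocks; ++j)
        for (size_t i = 0; i < h; ++i)
            for (size_t k = 0; k < W && j * W + k < w; ++k)
                out[j * (h * W) + i * W + k] = b[i * w + j * W + k];
    return out;
}
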
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/NEON/functions/NEGaussian3x3.h" - -#include "arm_compute/core/NEON/kernels/NEGaussian3x3Kernel.h" -#include "arm_compute/core/PixelValue.h" -#include "support/MemorySupport.h" - -#include <utility> - -using namespace arm_compute; - -void NEGaussian3x3::configure(ITensor *input, ITensor *output, BorderMode border_mode, uint8_t constant_border_value) -{ - auto k = arm_compute::support::cpp14::make_unique<NEGaussian3x3Kernel>(); - k->configure(input, output, border_mode == BorderMode::UNDEFINED); - _kernel = std::move(k); - _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); -} diff --git a/src/runtime/NEON/functions/NEGaussian5x5.cpp b/src/runtime/NEON/functions/NEGaussian5x5.cpp deleted file mode 100644 index 3c7411e2de..0000000000 --- a/src/runtime/NEON/functions/NEGaussian5x5.cpp +++ /dev/null @@ -1,66 +0,0 @@ -/* - * Copyright (c) 2016-2019 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
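
Editor's note: the deleted NEGaussian3x3::configure above forwards only border_mode == BorderMode::UNDEFINED to the kernel and leaves the rest to the border handler. A hedged reconstruction of the three border semantics (illustrative only, not the ACL kernels):

#include <cstdint>

enum class BorderMode { UNDEFINED, CONSTANT, REPLICATE }; // mirrors the ACL enum used above

// Hedged reconstruction: how a stencil kernel would source an out-of-image
// pixel under each mode. UNDEFINED means the caller promises never to read
// such pixels (the kernel shrinks its valid region instead), which is why
// configure() only forwards a boolean for it.
int sample(const uint8_t *img, int w, int h, int x, int y,
           BorderMode mode, uint8_t constant_value)
{
    if (x >= 0 && x < w && y >= 0 && y < h)
        return img[y * w + x];
    switch (mode)
    {
        case BorderMode::CONSTANT:
            return constant_value;
        case BorderMode::REPLICATE:
        {
            const int cx = x < 0 ? 0 : (x >= w ? w - 1 : x); // clamp to nearest valid pixel
            const int cy = y < 0 ? 0 : (y >= h ? h - 1 : y);
            return img[cy * w + cx];
        }
        default: // UNDEFINED: result is unspecified by contract
            return -1;
    }
}
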
- */ -#include "arm_compute/runtime/NEON/functions/NEGaussian5x5.h" - -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/kernels/NEGaussian5x5Kernel.h" -#include "arm_compute/core/PixelValue.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" -#include "arm_compute/runtime/TensorAllocator.h" - -using namespace arm_compute; - -NEGaussian5x5::NEGaussian5x5(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _kernel_hor(), _kernel_vert(), _tmp(), _border_handler() -{ -} - -void NEGaussian5x5::configure(ITensor *input, ITensor *output, BorderMode border_mode, uint8_t constant_border_value) -{ - // Init temporary buffer - TensorInfo tensor_info(input->info()->tensor_shape(), 1, DataType::S16); - _tmp.allocator()->init(tensor_info); - - // Manage intermediate buffers - _memory_group.manage(&_tmp); - - // Create and configure kernels for the two passes - _kernel_hor.configure(input, &_tmp, border_mode == BorderMode::UNDEFINED); - _kernel_vert.configure(&_tmp, output, border_mode == BorderMode::UNDEFINED); - - _tmp.allocator()->allocate(); - - _border_handler.configure(input, _kernel_hor.border_size(), border_mode, PixelValue(constant_border_value)); -} - -void NEGaussian5x5::run() -{ - NEScheduler::get().schedule(&_border_handler, Window::DimZ); - - MemoryGroupResourceScope scope_mg(_memory_group); - - NEScheduler::get().schedule(&_kernel_hor, Window::DimY); - NEScheduler::get().schedule(&_kernel_vert, Window::DimY); -} diff --git a/src/runtime/NEON/functions/NEGaussianPyramid.cpp b/src/runtime/NEON/functions/NEGaussianPyramid.cpp deleted file mode 100644 index d08bf1e282..0000000000 --- a/src/runtime/NEON/functions/NEGaussianPyramid.cpp +++ /dev/null @@ -1,193 +0,0 @@ -/* - * Copyright (c) 2016-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
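
Editor's note: NEGaussian5x5 above runs a horizontal then a vertical pass through an S16 temporary because the 5x5 Gaussian kernel is separable into [1 4 6 4 1] applied along each axis; the same 1+4+6+4+1 = 16 tap sum reappears as the c*2 + c*8 + c*6 border constant in the pyramid code below. A hedged scalar sketch of the two-pass scheme, ignoring borders (the real NEON kernels handle them via the border handler):

#include <cstdint>
#include <vector>

// Sketch, not the NEON kernels: the horizontal pass keeps the un-normalised
// 16x sum in S16 (255 * 16 fits), the vertical pass divides the combined
// 256x sum once at the end. Border rows/columns are left untouched.
void gaussian5x5(const std::vector<uint8_t> &src, std::vector<uint8_t> &dst, int w, int h)
{
    static const int taps[5] = {1, 4, 6, 4, 1};
    std::vector<int16_t> tmp(static_cast<size_t>(w) * h, 0); // S16 intermediate, as above
    for (int y = 0; y < h; ++y)                              // horizontal pass
        for (int x = 2; x + 2 < w; ++x)
        {
            int acc = 0;
            for (int k = -2; k <= 2; ++k)
                acc += taps[k + 2] * src[y * w + x + k];
            tmp[y * w + x] = static_cast<int16_t>(acc);
        }
    dst.assign(static_cast<size_t>(w) * h, 0);
    for (int y = 2; y + 2 < h; ++y)                          // vertical pass
        for (int x = 0; x < w; ++x)
        {
            int acc = 0;
            for (int k = -2; k <= 2; ++k)
                acc += taps[k + 2] * tmp[(y + k) * w + x];
            dst[y * w + x] = static_cast<uint8_t>(acc / 256);
        }
}
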
- */ -#include "arm_compute/runtime/NEON/functions/NEGaussianPyramid.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/kernels/NEGaussianPyramidKernel.h" -#include "arm_compute/core/NEON/kernels/NEScaleKernel.h" -#include "arm_compute/core/PixelValue.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" -#include "arm_compute/runtime/NEON/functions/NEGaussian5x5.h" -#include "arm_compute/runtime/Pyramid.h" -#include "arm_compute/runtime/Tensor.h" -#include "arm_compute/runtime/TensorAllocator.h" - -#include <cstddef> - -using namespace arm_compute; - -NEGaussianPyramid::NEGaussianPyramid() - : _input(nullptr), _pyramid(nullptr), _tmp() -{ -} - -NEGaussianPyramidHalf::NEGaussianPyramidHalf() // NOLINT - : _horizontal_border_handler(), - _vertical_border_handler(), - _horizontal_reduction(), - _vertical_reduction() -{ -} - -void NEGaussianPyramidHalf::configure(const ITensor *input, IPyramid *pyramid, BorderMode border_mode, uint8_t constant_border_value) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON(nullptr == pyramid); - ARM_COMPUTE_ERROR_ON(input->info()->num_dimensions() != pyramid->get_pyramid_level(0)->info()->num_dimensions()); - ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != pyramid->info()->width()); - ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) != pyramid->info()->height()); - ARM_COMPUTE_ERROR_ON(SCALE_PYRAMID_HALF != pyramid->info()->scale()); - - // Constant value to use for vertical fill border when the border mode is CONSTANT - const uint16_t pixel_value_u16 = static_cast<uint16_t>(constant_border_value) * 2 + static_cast<uint16_t>(constant_border_value) * 8 + static_cast<uint16_t>(constant_border_value) * 6; - - /* Get number of pyramid levels */ - const size_t num_levels = pyramid->info()->num_levels(); - const size_t num_stages = num_levels - 1; - - _input = input; - _pyramid = pyramid; - - if(num_levels > 1) - { - // Apply half scale to the X dimension of the tensor shape - TensorShape tensor_shape = pyramid->info()->tensor_shape(); - tensor_shape.set(0, (pyramid->info()->width() + 1) * SCALE_PYRAMID_HALF); - - PyramidInfo pyramid_info(num_levels - 1, SCALE_PYRAMID_HALF, tensor_shape, Format::S16); - _tmp.init(pyramid_info); - - _horizontal_reduction.clear(); - _vertical_reduction.clear(); - _horizontal_border_handler.clear(); - _vertical_border_handler.clear(); - - _horizontal_reduction.resize(num_stages); - _vertical_reduction.resize(num_stages); - _horizontal_border_handler.resize(num_stages); - _vertical_border_handler.resize(num_stages); - - for(size_t i = 0; i < num_stages; ++i) - { - /* Configure horizontal kernel */ - _horizontal_reduction[i].configure(_pyramid->get_pyramid_level(i), _tmp.get_pyramid_level(i)); - - /* Configure vertical kernel */ - _vertical_reduction[i].configure(_tmp.get_pyramid_level(i), _pyramid->get_pyramid_level(i + 1)); - - /* Configure border */ - _horizontal_border_handler[i].configure(_pyramid->get_pyramid_level(i), _horizontal_reduction[i].border_size(), border_mode, PixelValue(constant_border_value)); - - /* Configure border */ - _vertical_border_handler[i].configure(_tmp.get_pyramid_level(i), _vertical_reduction[i].border_size(), border_mode, PixelValue(pixel_value_u16)); - } - - _tmp.allocate(); - } -} - -void NEGaussianPyramidHalf::run() -{ - ARM_COMPUTE_ERROR_ON_MSG(_pyramid == nullptr, "Unconfigured function"); - - /* Get 
number of pyramid levels */ - const unsigned int num_levels = _pyramid->info()->num_levels(); - - /* The first level of the pyramid has the input image */ - _pyramid->get_pyramid_level(0)->copy_from(*_input); - - for(unsigned int i = 0; i < num_levels - 1; ++i) - { - NEScheduler::get().schedule(&_horizontal_border_handler[i], Window::DimZ); - NEScheduler::get().schedule(&_horizontal_reduction[i], Window::DimY); - NEScheduler::get().schedule(&_vertical_border_handler[i], Window::DimZ); - NEScheduler::get().schedule(&_vertical_reduction[i], Window::DimY); - } -} - -NEGaussianPyramidOrb::NEGaussianPyramidOrb() // NOLINT - : _gaus5x5(), - _scale_nearest() -{ -} - -void NEGaussianPyramidOrb::configure(const ITensor *input, IPyramid *pyramid, BorderMode border_mode, uint8_t constant_border_value) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON(nullptr == pyramid); - ARM_COMPUTE_ERROR_ON(input->info()->num_dimensions() != pyramid->get_pyramid_level(0)->info()->num_dimensions()); - ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != pyramid->info()->width()); - ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) != pyramid->info()->height()); - ARM_COMPUTE_ERROR_ON(SCALE_PYRAMID_ORB != pyramid->info()->scale()); - - /* Get number of pyramid levels */ - const size_t num_levels = pyramid->info()->num_levels(); - const size_t num_stages = num_levels - 1; - - _input = input; - _pyramid = pyramid; - - _gaus5x5.clear(); - _scale_nearest.clear(); - - _gaus5x5.resize(num_stages); - _scale_nearest.resize(num_stages); - - if(num_levels > 1) - { - PyramidInfo pyramid_info(num_levels - 1, SCALE_PYRAMID_ORB, pyramid->info()->tensor_shape(), Format::U8); - _tmp.init(pyramid_info); - - for(size_t i = 0; i < num_levels - 1; ++i) - { - /* Configure gaussian 5x5 */ - _gaus5x5[i].configure(_pyramid->get_pyramid_level(i), _tmp.get_pyramid_level(i), border_mode, constant_border_value); - - /* Configure scale */ - _scale_nearest[i].configure(_tmp.get_pyramid_level(i), _pyramid->get_pyramid_level(i + 1), InterpolationPolicy::NEAREST_NEIGHBOR, BorderMode::UNDEFINED); - } - - _tmp.allocate(); - } -} - -void NEGaussianPyramidOrb::run() -{ - ARM_COMPUTE_ERROR_ON_MSG(_pyramid == nullptr, "Unconfigured function"); - - /* Get number of pyramid levels */ - const size_t num_levels = _pyramid->info()->num_levels(); - - /* The first level of the pyramid has the input image */ - _pyramid->get_pyramid_level(0)->copy_from(*_input); - - for(unsigned int i = 0; i < num_levels - 1; ++i) - { - _gaus5x5[i].run(); - _scale_nearest[i].run(); - } -} diff --git a/src/runtime/NEON/functions/NEGenerateProposalsLayer.cpp b/src/runtime/NEON/functions/NEGenerateProposalsLayer.cpp index 82880bac85..1022b4153e 100644 --- a/src/runtime/NEON/functions/NEGenerateProposalsLayer.cpp +++ b/src/runtime/NEON/functions/NEGenerateProposalsLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 ARM Limited. + * Copyright (c) 2019-2021 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -26,17 +26,23 @@ #include "arm_compute/core/Types.h" #include "arm_compute/runtime/NEON/NEScheduler.h" +#include "src/common/utils/Log.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/NEON/kernels/NEFillBorderKernel.h" +#include "src/core/NEON/kernels/NEGenerateProposalsLayerKernel.h" +#include "src/core/NEON/kernels/NEPadLayerKernel.h" + namespace arm_compute { NEGenerateProposalsLayer::NEGenerateProposalsLayer(std::shared_ptr<IMemoryManager> memory_manager) : _memory_group(memory_manager), - _permute_deltas_kernel(), - _flatten_deltas_kernel(), - _permute_scores_kernel(), - _flatten_scores_kernel(), - _compute_anchors_kernel(), - _bounding_box_kernel(), - _pad_kernel(), + _permute_deltas(), + _flatten_deltas(), + _permute_scores(), + _flatten_scores(), + _compute_anchors(nullptr), + _bounding_box(), + _pad(), _dequantize_anchors(), _dequantize_deltas(), _quantize_all_proposals(), @@ -61,46 +67,63 @@ NEGenerateProposalsLayer::NEGenerateProposalsLayer(std::shared_ptr<IMemoryManage { } -void NEGenerateProposalsLayer::configure(const ITensor *scores, const ITensor *deltas, const ITensor *anchors, ITensor *proposals, ITensor *scores_out, ITensor *num_valid_proposals, +NEGenerateProposalsLayer::~NEGenerateProposalsLayer() = default; + +void NEGenerateProposalsLayer::configure(const ITensor *scores, + const ITensor *deltas, + const ITensor *anchors, + ITensor *proposals, + ITensor *scores_out, + ITensor *num_valid_proposals, const GenerateProposalsInfo &info) { ARM_COMPUTE_ERROR_ON_NULLPTR(scores, deltas, anchors, proposals, scores_out, num_valid_proposals); - ARM_COMPUTE_ERROR_THROW_ON(NEGenerateProposalsLayer::validate(scores->info(), deltas->info(), anchors->info(), proposals->info(), scores_out->info(), num_valid_proposals->info(), info)); + ARM_COMPUTE_ERROR_THROW_ON(NEGenerateProposalsLayer::validate(scores->info(), deltas->info(), anchors->info(), + proposals->info(), scores_out->info(), + num_valid_proposals->info(), info)); + ARM_COMPUTE_LOG_PARAMS(scores, deltas, anchors, proposals, scores_out, num_valid_proposals, info); _is_nhwc = scores->info()->data_layout() == DataLayout::NHWC; const DataType scores_data_type = scores->info()->data_type(); _is_qasymm8 = scores_data_type == DataType::QASYMM8; - const int num_anchors = scores->info()->dimension(get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::CHANNEL)); - const int feat_width = scores->info()->dimension(get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::WIDTH)); - const int feat_height = scores->info()->dimension(get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::HEIGHT)); - const int total_num_anchors = num_anchors * feat_width * feat_height; - const int pre_nms_topN = info.pre_nms_topN(); - const int post_nms_topN = info.post_nms_topN(); - const size_t values_per_roi = info.values_per_roi(); + const int num_anchors = scores->info()->dimension( + get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::CHANNEL)); + const int feat_width = scores->info()->dimension( + get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::WIDTH)); + const int feat_height = scores->info()->dimension( + get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::HEIGHT)); + const int total_num_anchors = num_anchors * feat_width * feat_height; + const int pre_nms_topN = info.pre_nms_topN(); + const int 
post_nms_topN = info.post_nms_topN(); + const size_t values_per_roi = info.values_per_roi(); const QuantizationInfo scores_qinfo = scores->info()->quantization_info(); const DataType rois_data_type = (_is_qasymm8) ? DataType::QASYMM16 : scores_data_type; - const QuantizationInfo rois_qinfo = (_is_qasymm8) ? QuantizationInfo(0.125f, 0) : scores->info()->quantization_info(); + const QuantizationInfo rois_qinfo = + (_is_qasymm8) ? QuantizationInfo(0.125f, 0) : scores->info()->quantization_info(); // Compute all the anchors _memory_group.manage(&_all_anchors); - _compute_anchors_kernel.configure(anchors, &_all_anchors, ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale())); + _compute_anchors = std::make_unique<NEComputeAllAnchorsKernel>(); + _compute_anchors->configure(anchors, &_all_anchors, + ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale())); const TensorShape flatten_shape_deltas(values_per_roi, total_num_anchors); - _deltas_flattened.allocator()->init(TensorInfo(flatten_shape_deltas, 1, scores_data_type, deltas->info()->quantization_info())); + _deltas_flattened.allocator()->init( + TensorInfo(flatten_shape_deltas, 1, scores_data_type, deltas->info()->quantization_info())); // Permute and reshape deltas _memory_group.manage(&_deltas_flattened); - if(!_is_nhwc) + if (!_is_nhwc) { _memory_group.manage(&_deltas_permuted); - _permute_deltas_kernel.configure(deltas, &_deltas_permuted, PermutationVector{ 2, 0, 1 }); - _flatten_deltas_kernel.configure(&_deltas_permuted, &_deltas_flattened); + _permute_deltas.configure(deltas, &_deltas_permuted, PermutationVector{2, 0, 1}); + _flatten_deltas.configure(&_deltas_permuted, &_deltas_flattened); _deltas_permuted.allocator()->allocate(); } else { - _flatten_deltas_kernel.configure(deltas, &_deltas_flattened); + _flatten_deltas.configure(deltas, &_deltas_flattened); } const TensorShape flatten_shape_scores(1, total_num_anchors); @@ -108,21 +131,21 @@ void NEGenerateProposalsLayer::configure(const ITensor *scores, const ITensor *d // Permute and reshape scores _memory_group.manage(&_scores_flattened); - if(!_is_nhwc) + if (!_is_nhwc) { _memory_group.manage(&_scores_permuted); - _permute_scores_kernel.configure(scores, &_scores_permuted, PermutationVector{ 2, 0, 1 }); - _flatten_scores_kernel.configure(&_scores_permuted, &_scores_flattened); + _permute_scores.configure(scores, &_scores_permuted, PermutationVector{2, 0, 1}); + _flatten_scores.configure(&_scores_permuted, &_scores_flattened); _scores_permuted.allocator()->allocate(); } else { - _flatten_scores_kernel.configure(scores, &_scores_flattened); + _flatten_scores.configure(scores, &_scores_flattened); } Tensor *anchors_to_use = &_all_anchors; Tensor *deltas_to_use = &_deltas_flattened; - if(_is_qasymm8) + if (_is_qasymm8) { _all_anchors_f32.allocator()->init(TensorInfo(_all_anchors.info()->tensor_shape(), 1, DataType::F32)); _deltas_flattened_f32.allocator()->init(TensorInfo(_deltas_flattened.info()->tensor_shape(), 1, DataType::F32)); @@ -140,16 +163,17 @@ void NEGenerateProposalsLayer::configure(const ITensor *scores, const ITensor *d // Bounding box transform _memory_group.manage(&_all_proposals); BoundingBoxTransformInfo bbox_info(info.im_width(), info.im_height(), 1.f); - _bounding_box_kernel.configure(anchors_to_use, &_all_proposals, deltas_to_use, bbox_info); + _bounding_box.configure(anchors_to_use, &_all_proposals, deltas_to_use, bbox_info); deltas_to_use->allocator()->allocate(); anchors_to_use->allocator()->allocate(); _all_proposals_to_use = 
&_all_proposals; - if(_is_qasymm8) + if (_is_qasymm8) { _memory_group.manage(&_all_proposals_quantized); // Requantize all_proposals to QASYMM16 with 0.125 scale and 0 offset - _all_proposals_quantized.allocator()->init(TensorInfo(_all_proposals.info()->tensor_shape(), 1, DataType::QASYMM16, QuantizationInfo(0.125f, 0))); + _all_proposals_quantized.allocator()->init( + TensorInfo(_all_proposals.info()->tensor_shape(), 1, DataType::QASYMM16, QuantizationInfo(0.125f, 0))); _quantize_all_proposals.configure(&_all_proposals, &_all_proposals_quantized); _all_proposals.allocator()->allocate(); _all_proposals_to_use = &_all_proposals_quantized; @@ -165,7 +189,8 @@ void NEGenerateProposalsLayer::configure(const ITensor *scores, const ITensor *d // Note that NMS needs outputs preinitialized. auto_init_if_empty(*scores_out->info(), TensorShape(scores_nms_size), 1, scores_data_type, scores_qinfo); - auto_init_if_empty(*_proposals_4_roi_values.info(), TensorShape(values_per_roi, scores_nms_size), 1, rois_data_type, rois_qinfo); + auto_init_if_empty(*_proposals_4_roi_values.info(), TensorShape(values_per_roi, scores_nms_size), 1, rois_data_type, + rois_qinfo); auto_init_if_empty(*num_valid_proposals->info(), TensorShape(1), 1, DataType::U32); // Initialize temporaries (unused) outputs @@ -178,17 +203,12 @@ void NEGenerateProposalsLayer::configure(const ITensor *scores, const ITensor *d _memory_group.manage(&_proposals_4_roi_values); - const BoxNMSLimitInfo box_nms_info(0.0f, info.nms_thres(), scores_nms_size, false, NMSType::LINEAR, 0.5f, 0.001f, true, min_size_scaled, info.im_width(), info.im_height()); - _cpp_nms.configure(&_scores_flattened /*scores_in*/, - _all_proposals_to_use /*boxes_in,*/, - nullptr /* batch_splits_in*/, - scores_out /* scores_out*/, - &_proposals_4_roi_values /*boxes_out*/, - &_classes_nms_unused /*classes*/, - nullptr /*batch_splits_out*/, - &_keeps_nms_unused /*keeps*/, - num_valid_proposals /* keeps_size*/, - box_nms_info); + const BoxNMSLimitInfo box_nms_info(0.0f, info.nms_thres(), scores_nms_size, false, NMSType::LINEAR, 0.5f, 0.001f, + true, min_size_scaled, info.im_width(), info.im_height()); + _cpp_nms.configure(&_scores_flattened /*scores_in*/, _all_proposals_to_use /*boxes_in,*/, + nullptr /* batch_splits_in*/, scores_out /* scores_out*/, &_proposals_4_roi_values /*boxes_out*/, + &_classes_nms_unused /*classes*/, nullptr /*batch_splits_out*/, &_keeps_nms_unused /*keeps*/, + num_valid_proposals /* keeps_size*/, box_nms_info); _keeps_nms_unused.allocator()->allocate(); _classes_nms_unused.allocator()->allocate(); @@ -196,12 +216,17 @@ void NEGenerateProposalsLayer::configure(const ITensor *scores, const ITensor *d _scores_flattened.allocator()->allocate(); // Add the first column that represents the batch id. 
This will be all zeros, as we don't support multiple images - _pad_kernel.configure(&_proposals_4_roi_values, proposals, PaddingList{ { 1, 0 } }); + _pad.configure(&_proposals_4_roi_values, proposals, PaddingList{{1, 0}}); _proposals_4_roi_values.allocator()->allocate(); } -Status NEGenerateProposalsLayer::validate(const ITensorInfo *scores, const ITensorInfo *deltas, const ITensorInfo *anchors, const ITensorInfo *proposals, const ITensorInfo *scores_out, - const ITensorInfo *num_valid_proposals, const GenerateProposalsInfo &info) +Status NEGenerateProposalsLayer::validate(const ITensorInfo *scores, + const ITensorInfo *deltas, + const ITensorInfo *anchors, + const ITensorInfo *proposals, + const ITensorInfo *scores_out, + const ITensorInfo *num_valid_proposals, + const GenerateProposalsInfo &info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(scores, deltas, anchors, proposals, scores_out, num_valid_proposals); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(scores, 1, DataType::QASYMM8, DataType::F16, DataType::F32); @@ -209,9 +234,12 @@ Status NEGenerateProposalsLayer::validate(const ITensorInfo *scores, const ITens ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(scores, deltas); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(scores, deltas); - const int num_anchors = scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::CHANNEL)); - const int feat_width = scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::WIDTH)); - const int feat_height = scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::HEIGHT)); + const int num_anchors = + scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::CHANNEL)); + const int feat_width = + scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::WIDTH)); + const int feat_height = + scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::HEIGHT)); const int num_images = scores->dimension(3); const int total_num_anchors = num_anchors * feat_width * feat_height; const int values_per_roi = info.values_per_roi(); @@ -220,76 +248,100 @@ Status NEGenerateProposalsLayer::validate(const ITensorInfo *scores, const ITens ARM_COMPUTE_RETURN_ERROR_ON(num_images > 1); - if(is_qasymm8) + if (is_qasymm8) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(anchors, 1, DataType::QSYMM16); const UniformQuantizationInfo anchors_qinfo = anchors->quantization_info().uniform(); ARM_COMPUTE_RETURN_ERROR_ON(anchors_qinfo.scale != 0.125f); } - TensorInfo all_anchors_info(anchors->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true)); - ARM_COMPUTE_RETURN_ON_ERROR(NEComputeAllAnchorsKernel::validate(anchors, &all_anchors_info, ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale()))); - - TensorInfo deltas_permuted_info = deltas->clone()->set_tensor_shape(TensorShape(values_per_roi * num_anchors, feat_width, feat_height)).set_is_resizable(true); - TensorInfo scores_permuted_info = scores->clone()->set_tensor_shape(TensorShape(num_anchors, feat_width, feat_height)).set_is_resizable(true); - if(scores->data_layout() == DataLayout::NHWC) + TensorInfo all_anchors_info( + anchors->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true)); + ARM_COMPUTE_RETURN_ON_ERROR(NEComputeAllAnchorsKernel::validate( + anchors, &all_anchors_info, 
ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale()))); + + TensorInfo deltas_permuted_info = + deltas->clone() + ->set_tensor_shape(TensorShape(values_per_roi * num_anchors, feat_width, feat_height)) + .set_is_resizable(true); + TensorInfo scores_permuted_info = + scores->clone()->set_tensor_shape(TensorShape(num_anchors, feat_width, feat_height)).set_is_resizable(true); + if (scores->data_layout() == DataLayout::NHWC) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(deltas, &deltas_permuted_info); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(scores, &scores_permuted_info); } else { - ARM_COMPUTE_RETURN_ON_ERROR(NEPermuteKernel::validate(deltas, &deltas_permuted_info, PermutationVector{ 2, 0, 1 })); - ARM_COMPUTE_RETURN_ON_ERROR(NEPermuteKernel::validate(scores, &scores_permuted_info, PermutationVector{ 2, 0, 1 })); + ARM_COMPUTE_RETURN_ON_ERROR(NEPermute::validate(deltas, &deltas_permuted_info, PermutationVector{2, 0, 1})); + ARM_COMPUTE_RETURN_ON_ERROR(NEPermute::validate(scores, &scores_permuted_info, PermutationVector{2, 0, 1})); } - TensorInfo deltas_flattened_info(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true)); - ARM_COMPUTE_RETURN_ON_ERROR(NEReshapeLayerKernel::validate(&deltas_permuted_info, &deltas_flattened_info)); + TensorInfo deltas_flattened_info( + deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true)); + ARM_COMPUTE_RETURN_ON_ERROR(NEReshapeLayer::validate(&deltas_permuted_info, &deltas_flattened_info)); - TensorInfo scores_flattened_info(scores->clone()->set_tensor_shape(TensorShape(1, total_num_anchors)).set_is_resizable(true)); - TensorInfo proposals_4_roi_values(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true)); + TensorInfo scores_flattened_info( + scores->clone()->set_tensor_shape(TensorShape(1, total_num_anchors)).set_is_resizable(true)); + TensorInfo proposals_4_roi_values( + deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true)); - ARM_COMPUTE_RETURN_ON_ERROR(NEReshapeLayerKernel::validate(&scores_permuted_info, &scores_flattened_info)); + ARM_COMPUTE_RETURN_ON_ERROR(NEReshapeLayer::validate(&scores_permuted_info, &scores_flattened_info)); TensorInfo *proposals_4_roi_values_to_use = &proposals_4_roi_values; - TensorInfo proposals_4_roi_values_quantized(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true)); - proposals_4_roi_values_quantized.set_data_type(DataType::QASYMM16).set_quantization_info(QuantizationInfo(0.125f, 0)); - if(is_qasymm8) + TensorInfo proposals_4_roi_values_quantized( + deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true)); + proposals_4_roi_values_quantized.set_data_type(DataType::QASYMM16) + .set_quantization_info(QuantizationInfo(0.125f, 0)); + if (is_qasymm8) { - TensorInfo all_anchors_f32_info(anchors->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true).set_data_type(DataType::F32)); - ARM_COMPUTE_RETURN_ON_ERROR(NEDequantizationLayerKernel::validate(&all_anchors_info, &all_anchors_f32_info)); - - TensorInfo deltas_flattened_f32_info(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true).set_data_type(DataType::F32)); - ARM_COMPUTE_RETURN_ON_ERROR(NEDequantizationLayerKernel::validate(&deltas_flattened_info, 
&deltas_flattened_f32_info)); - - TensorInfo proposals_4_roi_values_f32(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true).set_data_type(DataType::F32)); - ARM_COMPUTE_RETURN_ON_ERROR(NEBoundingBoxTransformKernel::validate(&all_anchors_f32_info, &proposals_4_roi_values_f32, &deltas_flattened_f32_info, - BoundingBoxTransformInfo(info.im_width(), info.im_height(), 1.f))); - - ARM_COMPUTE_RETURN_ON_ERROR(NEQuantizationLayerKernel::validate(&proposals_4_roi_values_f32, &proposals_4_roi_values_quantized)); + TensorInfo all_anchors_f32_info(anchors->clone() + ->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)) + .set_is_resizable(true) + .set_data_type(DataType::F32)); + ARM_COMPUTE_RETURN_ON_ERROR(NEDequantizationLayer::validate(&all_anchors_info, &all_anchors_f32_info)); + + TensorInfo deltas_flattened_f32_info(deltas->clone() + ->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)) + .set_is_resizable(true) + .set_data_type(DataType::F32)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEDequantizationLayer::validate(&deltas_flattened_info, &deltas_flattened_f32_info)); + + TensorInfo proposals_4_roi_values_f32(deltas->clone() + ->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)) + .set_is_resizable(true) + .set_data_type(DataType::F32)); + ARM_COMPUTE_RETURN_ON_ERROR(NEBoundingBoxTransform::validate( + &all_anchors_f32_info, &proposals_4_roi_values_f32, &deltas_flattened_f32_info, + BoundingBoxTransformInfo(info.im_width(), info.im_height(), 1.f))); + + ARM_COMPUTE_RETURN_ON_ERROR( + NEQuantizationLayer::validate(&proposals_4_roi_values_f32, &proposals_4_roi_values_quantized)); proposals_4_roi_values_to_use = &proposals_4_roi_values_quantized; } else { - ARM_COMPUTE_RETURN_ON_ERROR(NEBoundingBoxTransformKernel::validate(&all_anchors_info, &proposals_4_roi_values, &deltas_flattened_info, - BoundingBoxTransformInfo(info.im_width(), info.im_height(), 1.f))); + ARM_COMPUTE_RETURN_ON_ERROR( + NEBoundingBoxTransform::validate(&all_anchors_info, &proposals_4_roi_values, &deltas_flattened_info, + BoundingBoxTransformInfo(info.im_width(), info.im_height(), 1.f))); } - ARM_COMPUTE_RETURN_ON_ERROR(NEPadLayerKernel::validate(proposals_4_roi_values_to_use, proposals, PaddingList{ { 1, 0 } })); + ARM_COMPUTE_RETURN_ON_ERROR(NEPadLayer::validate(proposals_4_roi_values_to_use, proposals, PaddingList{{1, 0}})); - if(num_valid_proposals->total_size() > 0) + if (num_valid_proposals->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON(num_valid_proposals->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON(num_valid_proposals->dimension(0) > 1); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(num_valid_proposals, 1, DataType::U32); } - if(proposals->total_size() > 0) + if (proposals->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON(proposals->num_dimensions() > 2); ARM_COMPUTE_RETURN_ERROR_ON(proposals->dimension(0) != size_t(values_per_roi) + 1); ARM_COMPUTE_RETURN_ERROR_ON(proposals->dimension(1) != size_t(total_num_anchors)); - if(is_qasymm8) + if (is_qasymm8) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(proposals, 1, DataType::QASYMM16); const UniformQuantizationInfo proposals_qinfo = proposals->quantization_info().uniform(); @@ -302,7 +354,7 @@ Status NEGenerateProposalsLayer::validate(const ITensorInfo *scores, const ITens } } - if(scores_out->total_size() > 0) + if (scores_out->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON(scores_out->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON(scores_out->dimension(0) != 
size_t(total_num_anchors));
@@ -318,36 +370,36 @@ void NEGenerateProposalsLayer::run()
     MemoryGroupResourceScope scope_mg(_memory_group);
 
     // Compute all the anchors
-    NEScheduler::get().schedule(&_compute_anchors_kernel, Window::DimY);
+    NEScheduler::get().schedule(_compute_anchors.get(), Window::DimY);
 
     // Transpose and reshape the inputs
-    if(!_is_nhwc)
+    if (!_is_nhwc)
     {
-        NEScheduler::get().schedule(&_permute_deltas_kernel, Window::DimY);
-        NEScheduler::get().schedule(&_permute_scores_kernel, Window::DimY);
+        _permute_deltas.run();
+        _permute_scores.run();
     }
-    NEScheduler::get().schedule(&_flatten_deltas_kernel, Window::DimY);
-    NEScheduler::get().schedule(&_flatten_scores_kernel, Window::DimY);
+    _flatten_deltas.run();
+    _flatten_scores.run();
 
-    if(_is_qasymm8)
+    if (_is_qasymm8)
     {
-        NEScheduler::get().schedule(&_dequantize_anchors, Window::DimY);
-        NEScheduler::get().schedule(&_dequantize_deltas, Window::DimY);
+        _dequantize_anchors.run();
+        _dequantize_deltas.run();
     }
 
     // Build the boxes
-    NEScheduler::get().schedule(&_bounding_box_kernel, Window::DimY);
+    _bounding_box.run();
 
-    if(_is_qasymm8)
+    if (_is_qasymm8)
     {
-        NEScheduler::get().schedule(&_quantize_all_proposals, Window::DimY);
+        _quantize_all_proposals.run();
     }
 
     // Non maxima suppression
     _cpp_nms.run();
 
     // Add dummy batch indexes
-    NEScheduler::get().schedule(&_pad_kernel, Window::DimY);
+    _pad.run();
 }
 } // namespace arm_compute
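Note: the NEGenerateProposalsLayer diff above shows the pattern this changeset applies across the runtime: members that used to be raw kernels scheduled through NEScheduler become runtime functions that are simply run(), and a kernel that must remain a kernel is heap-allocated behind a std::unique_ptr, with the destructor defaulted out of line so the header can hold the kernel as an incomplete type. A minimal sketch of that ownership pattern with stand-in types (not the real Compute Library classes):

    #include <memory>

    struct IKernelStub                    // stand-in for an ICPPKernel-like interface
    {
        virtual ~IKernelStub() = default;
        virtual void execute()  = 0;
    };
    struct ComputeAnchorsKernelStub : IKernelStub
    {
        void execute() override {}        // real kernel work would go here
    };

    class GenerateProposalsStub
    {
    public:
        GenerateProposalsStub() : _compute_anchors(nullptr) {}
        ~GenerateProposalsStub() = default; // defaulted out of line in the real code, so the
                                            // header only needs a forward declaration of the kernel
        void configure()
        {
            _compute_anchors = std::make_unique<ComputeAnchorsKernelStub>();
        }
        void run()
        {
            _compute_anchors->execute();    // dispatched via NEScheduler in the real code
        }
    private:
        std::unique_ptr<IKernelStub> _compute_anchors;
    };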
- */ -#include "arm_compute/runtime/NEON/functions/NEHOGDescriptor.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/HOGInfo.h" -#include "arm_compute/core/Size2D.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" - -using namespace arm_compute; - -NEHOGDescriptor::NEHOGDescriptor(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _gradient(), _orient_bin(), _block_norm(), _mag(), _phase(), _hog_space() -{ -} - -void NEHOGDescriptor::configure(ITensor *input, ITensor *output, const IHOG *hog, BorderMode border_mode, uint8_t constant_border_value) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON(nullptr == output); - ARM_COMPUTE_ERROR_ON(nullptr == hog); - - const HOGInfo *hog_info = hog->info(); - const size_t width = input->info()->dimension(Window::DimX); - const size_t height = input->info()->dimension(Window::DimY); - const size_t num_bins = hog_info->num_bins(); - - Size2D cell_size = hog_info->cell_size(); - - // Calculate number of cells along the x and y directions for the hog_space - const size_t num_cells_x = width / cell_size.width; - const size_t num_cells_y = height / cell_size.height; - - // TensorShape of the input image - const TensorShape &shape_img = input->info()->tensor_shape(); - - // TensorShape of the hog space - TensorShape shape_hog_space = input->info()->tensor_shape(); - shape_hog_space.set(Window::DimX, num_cells_x); - shape_hog_space.set(Window::DimY, num_cells_y); - - // Allocate memory for magnitude, phase and hog space - TensorInfo info_mag(shape_img, Format::S16); - _mag.allocator()->init(info_mag); - - TensorInfo info_phase(shape_img, Format::U8); - _phase.allocator()->init(info_phase); - - TensorInfo info_space(shape_hog_space, num_bins, DataType::F32); - _hog_space.allocator()->init(info_space); - - // Manage intermediate buffers - _memory_group.manage(&_mag); - _memory_group.manage(&_phase); - - // Initialise gradient kernel - _gradient.configure(input, &_mag, &_phase, hog_info->phase_type(), border_mode, constant_border_value); - - // Manage intermediate buffers - _memory_group.manage(&_hog_space); - - // Initialise orientation binning kernel - _orient_bin.configure(&_mag, &_phase, &_hog_space, hog->info()); - - // Initialize HOG norm kernel - _block_norm.configure(&_hog_space, output, hog->info()); - - // Allocate intermediate tensors - _mag.allocator()->allocate(); - _phase.allocator()->allocate(); - _hog_space.allocator()->allocate(); -} - -void NEHOGDescriptor::run() -{ - MemoryGroupResourceScope scope_mg(_memory_group); - - // Run gradient - _gradient.run(); - - // Run orientation binning kernel - NEScheduler::get().schedule(&_orient_bin, Window::DimY); - - // Run block normalization kernel - NEScheduler::get().schedule(&_block_norm, Window::DimY); -} diff --git a/src/runtime/NEON/functions/NEHOGDetector.cpp b/src/runtime/NEON/functions/NEHOGDetector.cpp deleted file mode 100644 index 95d7aae931..0000000000 --- a/src/runtime/NEON/functions/NEHOGDetector.cpp +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2016-2020 ARM Limited. 
diff --git a/src/runtime/NEON/functions/NEHOGDetector.cpp b/src/runtime/NEON/functions/NEHOGDetector.cpp
deleted file mode 100644
index 95d7aae931..0000000000
--- a/src/runtime/NEON/functions/NEHOGDetector.cpp
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/NEON/functions/NEHOGDetector.h"
-
-#include "arm_compute/core/NEON/kernels/NEHOGDetectorKernel.h"
-#include "support/MemorySupport.h"
-
-using namespace arm_compute;
-
-void NEHOGDetector::configure(const ITensor *input, const IHOG *hog, IDetectionWindowArray *detection_windows, const Size2D &detection_window_stride, float threshold, size_t idx_class)
-{
-    auto k = arm_compute::support::cpp14::make_unique<NEHOGDetectorKernel>();
-    k->configure(input, hog, detection_windows, detection_window_stride, threshold, idx_class);
-    _kernel = std::move(k);
-}
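Note: the detector deleted above wraps a single kernel that scores sliding windows of the HOG descriptor space against a trained model and emits detections past a threshold. A simplified stand-in for that per-window scoring, assuming a linear model stored as weights plus a trailing bias; names and layout are illustrative only:

    #include <cstddef>
    #include <vector>

    struct Detection { int window_index; float score; };

    // Score each window's flattened descriptor against a linear model
    // (weights followed by one bias term) and keep windows above threshold.
    static void detect_windows(const std::vector<float> &descriptors, // window_count * descriptor_size values
                               std::size_t window_count, std::size_t descriptor_size,
                               const std::vector<float> &model,        // descriptor_size weights + bias
                               float threshold, std::vector<Detection> &out)
    {
        for (std::size_t w = 0; w < window_count; ++w)
        {
            float score = model[descriptor_size]; // bias
            for (std::size_t i = 0; i < descriptor_size; ++i)
                score += model[i] * descriptors[w * descriptor_size + i];
            if (score > threshold)
                out.push_back({static_cast<int>(w), score});
        }
    }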
- */ -#include "arm_compute/runtime/NEON/functions/NEHOGGradient.h" - -#include "arm_compute/core/NEON/kernels/NEMagnitudePhaseKernel.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" -#include "support/MemorySupport.h" - -using namespace arm_compute; - -NEHOGGradient::NEHOGGradient(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT - : _memory_group(std::move(memory_manager)), - _derivative(), - _mag_phase(nullptr), - _gx(), - _gy() -{ -} - -void NEHOGGradient::configure(ITensor *input, ITensor *output_magnitude, ITensor *output_phase, PhaseType phase_type, BorderMode border_mode, uint8_t constant_border_value) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_magnitude, 1, DataType::S16); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_phase, 1, DataType::U8); - - const TensorShape &shape_img = input->info()->tensor_shape(); - - // Allocate image memory - TensorInfo info(shape_img, Format::S16); - _gx.allocator()->init(info); - _gy.allocator()->init(info); - - // Manage intermediate buffers - _memory_group.manage(&_gx); - _memory_group.manage(&_gy); - - // Initialise derivate kernel - _derivative.configure(input, &_gx, &_gy, border_mode, constant_border_value); - - // Initialise magnitude/phase kernel - if(PhaseType::UNSIGNED == phase_type) - { - auto k = arm_compute::support::cpp14::make_unique<NEMagnitudePhaseKernel<MagnitudeType::L2NORM, PhaseType::UNSIGNED>>(); - k->configure(&_gx, &_gy, output_magnitude, output_phase); - _mag_phase = std::move(k); - } - else - { - auto k = arm_compute::support::cpp14::make_unique<NEMagnitudePhaseKernel<MagnitudeType::L2NORM, PhaseType::SIGNED>>(); - k->configure(&_gx, &_gy, output_magnitude, output_phase); - _mag_phase = std::move(k); - } - - // Allocate intermediate tensors - _gx.allocator()->allocate(); - _gy.allocator()->allocate(); -} - -void NEHOGGradient::run() -{ - MemoryGroupResourceScope scope_mg(_memory_group); - - // Run derivative - _derivative.run(); - - // Run magnitude/phase kernel - NEScheduler::get().schedule(_mag_phase.get(), Window::DimY); -} diff --git a/src/runtime/NEON/functions/NEHOGMultiDetection.cpp b/src/runtime/NEON/functions/NEHOGMultiDetection.cpp deleted file mode 100644 index 572e427b6e..0000000000 --- a/src/runtime/NEON/functions/NEHOGMultiDetection.cpp +++ /dev/null @@ -1,264 +0,0 @@ -/* - * Copyright (c) 2016-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
diff --git a/src/runtime/NEON/functions/NEHOGMultiDetection.cpp b/src/runtime/NEON/functions/NEHOGMultiDetection.cpp
deleted file mode 100644
index 572e427b6e..0000000000
--- a/src/runtime/NEON/functions/NEHOGMultiDetection.cpp
+++ /dev/null
@@ -1,264 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/NEON/functions/NEHOGMultiDetection.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "arm_compute/runtime/Tensor.h"
-
-using namespace arm_compute;
-
-NEHOGMultiDetection::NEHOGMultiDetection(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
-    : _memory_group(std::move(memory_manager)),
-      _gradient_kernel(),
-      _orient_bin_kernel(),
-      _block_norm_kernel(),
-      _hog_detect_kernel(),
-      _non_maxima_kernel(),
-      _hog_space(),
-      _hog_norm_space(),
-      _detection_windows(),
-      _mag(),
-      _phase(),
-      _non_maxima_suppression(false),
-      _num_orient_bin_kernel(0),
-      _num_block_norm_kernel(0),
-      _num_hog_detect_kernel(0)
-{
-}
-
-void NEHOGMultiDetection::configure(ITensor *input, const IMultiHOG *multi_hog, IDetectionWindowArray *detection_windows, const ISize2DArray *detection_window_strides, BorderMode border_mode,
-                                    uint8_t constant_border_value, float threshold, bool non_maxima_suppression, float min_distance)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_INVALID_MULTI_HOG(multi_hog);
-    ARM_COMPUTE_ERROR_ON(nullptr == detection_windows);
-    ARM_COMPUTE_ERROR_ON(detection_window_strides->num_values() != multi_hog->num_models());
-
-    const size_t       width      = input->info()->dimension(Window::DimX);
-    const size_t       height     = input->info()->dimension(Window::DimY);
-    const TensorShape &shape_img  = input->info()->tensor_shape();
-    const size_t       num_models = multi_hog->num_models();
-    PhaseType          phase_type = multi_hog->model(0)->info()->phase_type();
-
-    size_t prev_num_bins     = multi_hog->model(0)->info()->num_bins();
-    Size2D prev_cell_size    = multi_hog->model(0)->info()->cell_size();
-    Size2D prev_block_size   = multi_hog->model(0)->info()->block_size();
-    Size2D prev_block_stride = multi_hog->model(0)->info()->block_stride();
-
-    /* Check if NEHOGOrientationBinningKernel and NEHOGBlockNormalizationKernel kernels can be skipped for a specific HOG data-object
-     *
-     * 1) NEHOGOrientationBinningKernel and NEHOGBlockNormalizationKernel are skipped if the cell size and the number of bins don't change.
-     *    Since "multi_hog" is sorted, it is enough to check the HOG descriptors at level "ith" and level "(i-1)th"
-     * 2) NEHOGBlockNormalizationKernel is skipped if the cell size, the number of bins and block size do not change.
- * Since "multi_hog" is sorted,it is enough to check the HOG descriptors at level "ith" and level "(i-1)th - * - * @note Since the orientation binning and block normalization kernels can be skipped, we need to keep track of the input to process for each kernel - * with "input_orient_bin", "input_hog_detect" and "input_block_norm" - */ - std::vector<size_t> input_orient_bin; - std::vector<size_t> input_hog_detect; - std::vector<std::pair<size_t, size_t>> input_block_norm; - - input_orient_bin.push_back(0); - input_hog_detect.push_back(0); - input_block_norm.emplace_back(0, 0); - - for(size_t i = 1; i < num_models; ++i) - { - size_t cur_num_bins = multi_hog->model(i)->info()->num_bins(); - Size2D cur_cell_size = multi_hog->model(i)->info()->cell_size(); - Size2D cur_block_size = multi_hog->model(i)->info()->block_size(); - Size2D cur_block_stride = multi_hog->model(i)->info()->block_stride(); - - if((cur_num_bins != prev_num_bins) || (cur_cell_size.width != prev_cell_size.width) || (cur_cell_size.height != prev_cell_size.height)) - { - prev_num_bins = cur_num_bins; - prev_cell_size = cur_cell_size; - prev_block_size = cur_block_size; - prev_block_stride = cur_block_stride; - - // Compute orientation binning and block normalization kernels. Update input to process - input_orient_bin.push_back(i); - input_block_norm.emplace_back(i, input_orient_bin.size() - 1); - } - else if((cur_block_size.width != prev_block_size.width) || (cur_block_size.height != prev_block_size.height) || (cur_block_stride.width != prev_block_stride.width) - || (cur_block_stride.height != prev_block_stride.height)) - { - prev_block_size = cur_block_size; - prev_block_stride = cur_block_stride; - - // Compute block normalization kernel. Update input to process - input_block_norm.emplace_back(i, input_orient_bin.size() - 1); - } - - // Update input to process for hog detector kernel - input_hog_detect.push_back(input_block_norm.size() - 1); - } - - _detection_windows = detection_windows; - _non_maxima_suppression = non_maxima_suppression; - _num_orient_bin_kernel = input_orient_bin.size(); // Number of NEHOGOrientationBinningKernel kernels to compute - _num_block_norm_kernel = input_block_norm.size(); // Number of NEHOGBlockNormalizationKernel kernels to compute - _num_hog_detect_kernel = input_hog_detect.size(); // Number of NEHOGDetector functions to compute - - _orient_bin_kernel.clear(); - _block_norm_kernel.clear(); - _hog_detect_kernel.clear(); - _hog_space.clear(); - _hog_norm_space.clear(); - - _orient_bin_kernel.resize(_num_orient_bin_kernel); - _block_norm_kernel.resize(_num_block_norm_kernel); - _hog_detect_kernel.resize(_num_hog_detect_kernel); - _hog_space.resize(_num_orient_bin_kernel); - _hog_norm_space.resize(_num_block_norm_kernel); - _non_maxima_kernel = CPPDetectionWindowNonMaximaSuppressionKernel(); - - // Allocate tensors for magnitude and phase - TensorInfo info_mag(shape_img, Format::S16); - _mag.allocator()->init(info_mag); - - TensorInfo info_phase(shape_img, Format::U8); - _phase.allocator()->init(info_phase); - - // Manage intermediate buffers - _memory_group.manage(&_mag); - _memory_group.manage(&_phase); - - // Initialise gradient kernel - _gradient_kernel.configure(input, &_mag, &_phase, phase_type, border_mode, constant_border_value); - - // Configure NETensor for the HOG space and orientation binning kernel - for(size_t i = 0; i < _num_orient_bin_kernel; ++i) - { - const size_t idx_multi_hog = input_orient_bin[i]; - - // Get the corresponding cell size and number of bins - const Size2D &cell 
= multi_hog->model(idx_multi_hog)->info()->cell_size(); - const size_t num_bins = multi_hog->model(idx_multi_hog)->info()->num_bins(); - - // Calculate number of cells along the x and y directions for the hog_space - const size_t num_cells_x = width / cell.width; - const size_t num_cells_y = height / cell.height; - - // TensorShape of hog space - TensorShape shape_hog_space = input->info()->tensor_shape(); - shape_hog_space.set(Window::DimX, num_cells_x); - shape_hog_space.set(Window::DimY, num_cells_y); - - // Allocate HOG space - TensorInfo info_space(shape_hog_space, num_bins, DataType::F32); - _hog_space[i].allocator()->init(info_space); - - // Manage intermediate buffers - _memory_group.manage(&_hog_space[i]); - - // Initialise orientation binning kernel - _orient_bin_kernel[i].configure(&_mag, &_phase, &_hog_space[i], multi_hog->model(idx_multi_hog)->info()); - } - - // Allocate intermediate tensors - _mag.allocator()->allocate(); - _phase.allocator()->allocate(); - - // Configure NETensor for the normalized HOG space and block normalization kernel - for(size_t i = 0; i < _num_block_norm_kernel; ++i) - { - const size_t idx_multi_hog = input_block_norm[i].first; - const size_t idx_orient_bin = input_block_norm[i].second; - - // Allocate normalized HOG space - TensorInfo tensor_info(*(multi_hog->model(idx_multi_hog)->info()), width, height); - _hog_norm_space[i].allocator()->init(tensor_info); - - // Manage intermediate buffers - _memory_group.manage(&_hog_norm_space[i]); - - // Initialize block normalization kernel - _block_norm_kernel[i].configure(&_hog_space[idx_orient_bin], &_hog_norm_space[i], multi_hog->model(idx_multi_hog)->info()); - } - - // Allocate intermediate tensors - for(size_t i = 0; i < _num_orient_bin_kernel; ++i) - { - _hog_space[i].allocator()->allocate(); - } - - // Configure HOG detector kernel - for(size_t i = 0; i < _num_hog_detect_kernel; ++i) - { - const size_t idx_block_norm = input_hog_detect[i]; - - _hog_detect_kernel[i].configure(&_hog_norm_space[idx_block_norm], multi_hog->model(i), detection_windows, detection_window_strides->at(i), threshold, i); - } - - // Configure non maxima suppression kernel - _non_maxima_kernel.configure(_detection_windows, min_distance); - - // Allocate intermediate tensors - for(size_t i = 0; i < _num_block_norm_kernel; ++i) - { - _hog_norm_space[i].allocator()->allocate(); - } -} - -void NEHOGMultiDetection::run() -{ - ARM_COMPUTE_ERROR_ON_MSG(_detection_windows == nullptr, "Unconfigured function"); - - MemoryGroupResourceScope scope_mg(_memory_group); - - // Reset detection window - _detection_windows->clear(); - - // Run gradient - _gradient_kernel.run(); - - // Run orientation binning kernel - for(auto &kernel : _orient_bin_kernel) - { - NEScheduler::get().schedule(&kernel, Window::DimY); - } - - // Run block normalization kernel - for(auto &kernel : _block_norm_kernel) - { - NEScheduler::get().schedule(&kernel, Window::DimY); - } - - // Run HOG detector kernel - for(auto &kernel : _hog_detect_kernel) - { - kernel.run(); - } - - // Run non-maxima suppression kernel if enabled - if(_non_maxima_suppression) - { - NEScheduler::get().schedule(&_non_maxima_kernel, Window::DimY); - } -} diff --git a/src/runtime/NEON/functions/NEHarrisCorners.cpp b/src/runtime/NEON/functions/NEHarrisCorners.cpp deleted file mode 100644 index bf1e27114c..0000000000 --- a/src/runtime/NEON/functions/NEHarrisCorners.cpp +++ /dev/null @@ -1,210 +0,0 @@ -/* - * Copyright (c) 2016-2020 ARM Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/NEON/functions/NEHarrisCorners.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h" -#include "arm_compute/core/NEON/kernels/NEHarrisCornersKernel.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/runtime/Array.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" -#include "arm_compute/runtime/NEON/functions/NESobel3x3.h" -#include "arm_compute/runtime/NEON/functions/NESobel5x5.h" -#include "arm_compute/runtime/NEON/functions/NESobel7x7.h" -#include "arm_compute/runtime/TensorAllocator.h" -#include "support/MemorySupport.h" - -#include <cmath> -#include <utility> - -using namespace arm_compute; - -NEHarrisCorners::NEHarrisCorners(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT - : _memory_group(std::move(memory_manager)), - _sobel(), - _harris_score(), - _non_max_suppr(), - _candidates(), - _sort_euclidean(), - _border_gx(), - _border_gy(), - _gx(), - _gy(), - _score(), - _nonmax(), - _corners_list(), - _num_corner_candidates(0) -{ -} - -void NEHarrisCorners::configure(IImage *input, float threshold, float min_dist, - float sensitivity, int32_t gradient_size, int32_t block_size, KeyPointArray *corners, - BorderMode border_mode, uint8_t constant_border_value) -{ - ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON(!(block_size == 3 || block_size == 5 || block_size == 7)); - - const TensorShape shape = input->info()->tensor_shape(); - TensorInfo tensor_info_gxgy; - - if(gradient_size < 7) - { - tensor_info_gxgy.init(shape, Format::S16); - } - else - { - tensor_info_gxgy.init(shape, Format::S32); - } - - _gx.allocator()->init(tensor_info_gxgy); - _gy.allocator()->init(tensor_info_gxgy); - - // Manage intermediate buffers - _memory_group.manage(&_gx); - _memory_group.manage(&_gy); - - TensorInfo tensor_info_score(shape, Format::F32); - _score.allocator()->init(tensor_info_score); - _nonmax.allocator()->init(tensor_info_score); - - _corners_list.resize(shape.x() * shape.y()); - - // Set/init Sobel kernel accordingly with gradient_size - switch(gradient_size) - { - case 3: - { - auto k = arm_compute::support::cpp14::make_unique<NESobel3x3>(); - k->configure(input, &_gx, &_gy, border_mode, constant_border_value); - _sobel = std::move(k); - break; - 
}
-        case 5:
-        {
-            auto k = arm_compute::support::cpp14::make_unique<NESobel5x5>();
-            k->configure(input, &_gx, &_gy, border_mode, constant_border_value);
-            _sobel = std::move(k);
-            break;
-        }
-        case 7:
-        {
-            auto k = arm_compute::support::cpp14::make_unique<NESobel7x7>();
-            k->configure(input, &_gx, &_gy, border_mode, constant_border_value);
-            _sobel = std::move(k);
-            break;
-        }
-        default:
-            ARM_COMPUTE_ERROR("Gradient size not implemented");
-    }
-
-    // Normalization factor
-    const float norm_factor = 1.0f / (255.0f * pow(4.0f, gradient_size / 2) * block_size);
-
-    // Manage intermediate buffers
-    _memory_group.manage(&_score);
-
-    // Set/init Harris Score kernel according to block_size
-    switch(block_size)
-    {
-        case 3:
-        {
-            auto k = arm_compute::support::cpp14::make_unique<NEHarrisScoreKernel<3>>();
-            k->configure(&_gx, &_gy, &_score, norm_factor, threshold, sensitivity, border_mode == BorderMode::UNDEFINED);
-            _harris_score = std::move(k);
-        }
-        break;
-        case 5:
-        {
-            auto k = arm_compute::support::cpp14::make_unique<NEHarrisScoreKernel<5>>();
-            k->configure(&_gx, &_gy, &_score, norm_factor, threshold, sensitivity, border_mode == BorderMode::UNDEFINED);
-            _harris_score = std::move(k);
-        }
-        break;
-        case 7:
-        {
-            auto k = arm_compute::support::cpp14::make_unique<NEHarrisScoreKernel<7>>();
-            k->configure(&_gx, &_gy, &_score, norm_factor, threshold, sensitivity, border_mode == BorderMode::UNDEFINED);
-            _harris_score = std::move(k);
-        }
-        default:
-            break;
-    }
-
-    // Configure border filling before harris score
-    _border_gx.configure(&_gx, _harris_score->border_size(), border_mode, constant_border_value);
-    _border_gy.configure(&_gy, _harris_score->border_size(), border_mode, constant_border_value);
-
-    // Allocate once all the configure methods have been called
-    _gx.allocator()->allocate();
-    _gy.allocator()->allocate();
-
-    // Manage intermediate buffers
-    _memory_group.manage(&_nonmax);
-
-    // Init non-maxima suppression function
-    _non_max_suppr.configure(&_score, &_nonmax, border_mode);
-
-    // Allocate once all the configure methods have been called
-    _score.allocator()->allocate();
-
-    // Init corner candidates kernel
-    _candidates.configure(&_nonmax, _corners_list.data(), &_num_corner_candidates);
-
-    // Allocate once all the configure methods have been called
-    _nonmax.allocator()->allocate();
-
-    // Init euclidean distance
-    _sort_euclidean.configure(_corners_list.data(), corners, &_num_corner_candidates, min_dist);
-}
-
-void NEHarrisCorners::run()
-{
-    ARM_COMPUTE_ERROR_ON_MSG(_sobel == nullptr, "Unconfigured function");
-
-    MemoryGroupResourceScope scope_mg(_memory_group);
-
-    // Reset the number of corner candidates to 0
-    _num_corner_candidates = 0;
-
-    // Run Sobel kernel
-    _sobel->run();
-
-    // Fill border before harris score kernel
-    NEScheduler::get().schedule(&_border_gx, Window::DimZ);
-    NEScheduler::get().schedule(&_border_gy, Window::DimZ);
-
-    // Run harris score kernel
-    NEScheduler::get().schedule(_harris_score.get(), Window::DimY);
-
-    // Run non-maxima suppression
-    _non_max_suppr.run();
-
-    // Run corner candidate kernel
-    NEScheduler::get().schedule(&_candidates, Window::DimY);
-
-    // Run sort & euclidean distance
-    NEScheduler::get().schedule(&_sort_euclidean, Window::DimY);
-}
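Note: the pipeline deleted above runs Sobel (size 3/5/7), a Harris score kernel, non-maxima suppression, corner-candidate extraction, and a sort/Euclidean-distance pass. The score kernel's core is the standard Harris response over a block_size x block_size neighbourhood, R = det(M) - k * trace(M)^2, where M is the structure tensor accumulated from the Sobel gradients and k is the sensitivity parameter passed to configure(). A single-pixel sketch, assuming the structure-tensor sums are already accumulated:

    // Harris corner response from structure-tensor sums over a window:
    //   M = | sum(gx*gx)  sum(gx*gy) |
    //       | sum(gx*gy)  sum(gy*gy) |
    static float harris_response(float sum_gx2, float sum_gy2, float sum_gxgy,
                                 float sensitivity /* the 'k' parameter */)
    {
        const float det   = sum_gx2 * sum_gy2 - sum_gxgy * sum_gxgy;
        const float trace = sum_gx2 + sum_gy2;
        return det - sensitivity * trace * trace; // large positive -> corner
    }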
diff --git a/src/runtime/NEON/functions/NEHistogram.cpp b/src/runtime/NEON/functions/NEHistogram.cpp
deleted file mode 100644
index 6a672ed915..0000000000
--- a/src/runtime/NEON/functions/NEHistogram.cpp
+++ /dev/null
@@ -1,57 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/NEON/functions/NEHistogram.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/IDistribution1D.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-
-using namespace arm_compute;
-
-NEHistogram::NEHistogram()
-    : _histogram_kernel(), _local_hist(), _window_lut(window_lut_default_size), _local_hist_size(0)
-{
-}
-
-void NEHistogram::configure(const IImage *input, IDistribution1D *output)
-{
-    ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
-    ARM_COMPUTE_ERROR_ON(nullptr == output);
-
-    // Allocate space for the threads' local histograms
-    _local_hist_size = output->num_bins() * NEScheduler::get().num_threads();
-    _local_hist.resize(_local_hist_size);
-
-    // Configure kernel
-    _histogram_kernel.configure(input, output, _local_hist.data(), _window_lut.data());
-}
-
-void NEHistogram::run()
-{
-    // Calculate histogram of input.
-    NEScheduler::get().schedule(&_histogram_kernel, Window::DimY);
-}
diff --git a/src/runtime/NEON/functions/NEInstanceNormalizationLayer.cpp b/src/runtime/NEON/functions/NEInstanceNormalizationLayer.cpp
index d7cb7de627..78218cbdee 100644
--- a/src/runtime/NEON/functions/NEInstanceNormalizationLayer.cpp
+++ b/src/runtime/NEON/functions/NEInstanceNormalizationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
* * SPDX-License-Identifier: MIT * @@ -24,23 +24,40 @@ #include "arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayer.h" #include "arm_compute/core/Helpers.h" +#include "arm_compute/core/KernelDescriptors.h" #include "arm_compute/runtime/NEON/NEScheduler.h" +#include "src/common/utils/Log.h" +#include "src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.h" + namespace arm_compute { +NEInstanceNormalizationLayer::~NEInstanceNormalizationLayer() = default; + NEInstanceNormalizationLayer::NEInstanceNormalizationLayer(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _normalization_kernel(), _is_nchw(false), _permute_input(), _permute_output(), _permuted_input(), _permuted_output() + : _memory_group(std::move(memory_manager)), + _normalization_kernel(), + _is_nchw(false), + _permute_input(), + _permute_output(), + _permuted_input(), + _permuted_output() { } void NEInstanceNormalizationLayer::configure(ITensor *input, ITensor *output, float gamma, float beta, float epsilon) { - const DataLayout data_layout = input->info()->data_layout(); + ARM_COMPUTE_LOG_PARAMS(input, output, gamma, beta, epsilon); + + const DataLayout data_layout = input->info()->data_layout(); + const auto kernel_descriptor = InstanceNormalizationLayerKernelInfo{gamma, beta, epsilon, true}; // Configure Kernels _is_nchw = data_layout == DataLayout::NCHW; - if(!_is_nchw) + _normalization_kernel = std::make_unique<NEInstanceNormalizationLayerKernel>(); + + if (!_is_nchw) { _memory_group.manage(&_permuted_input); _memory_group.manage(&_permuted_output); @@ -49,7 +66,7 @@ void NEInstanceNormalizationLayer::configure(ITensor *input, ITensor *output, fl _permute_input.configure(input, &_permuted_input, PermutationVector(1U, 2U, 0U)); _permuted_input.info()->set_data_layout(DataLayout::NCHW); - _normalization_kernel.configure(&_permuted_input, &_permuted_output, gamma, beta, epsilon); + _normalization_kernel->configure(&_permuted_input, &_permuted_output, kernel_descriptor); _permuted_output.info()->set_data_layout(DataLayout::NCHW); _permute_output.configure(&_permuted_output, output != nullptr ? 
output : input, PermutationVector(2U, 0U, 1U)); @@ -58,13 +75,16 @@ void NEInstanceNormalizationLayer::configure(ITensor *input, ITensor *output, fl } else { - _normalization_kernel.configure(input, output, gamma, beta, epsilon); + _normalization_kernel->configure(input, output, kernel_descriptor); } } -Status NEInstanceNormalizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, float gamma, float beta, float epsilon) +Status NEInstanceNormalizationLayer::validate( + const ITensorInfo *input, const ITensorInfo *output, float gamma, float beta, float epsilon) { - return NEInstanceNormalizationLayerKernel::validate(&input->clone()->set_data_layout(DataLayout::NCHW), &output->clone()->set_data_layout(DataLayout::NCHW), gamma, beta, epsilon); + return NEInstanceNormalizationLayerKernel::validate( + &input->clone()->set_data_layout(DataLayout::NCHW), &output->clone()->set_data_layout(DataLayout::NCHW), + InstanceNormalizationLayerKernelInfo{gamma, beta, epsilon, true}); } void NEInstanceNormalizationLayer::run() @@ -72,15 +92,15 @@ void NEInstanceNormalizationLayer::run() MemoryGroupResourceScope scope_mg(_memory_group); // Permute input - if(!_is_nchw) + if (!_is_nchw) { _permute_input.run(); } - NEScheduler::get().schedule(&_normalization_kernel, Window::DimZ); + NEScheduler::get().schedule(_normalization_kernel.get(), Window::DimZ); // Permute output - if(!_is_nchw) + if (!_is_nchw) { _permute_output.run(); } diff --git a/src/runtime/NEON/functions/NEIntegralImage.cpp b/src/runtime/NEON/functions/NEIntegralImage.cpp deleted file mode 100644 index 845f3b0936..0000000000 --- a/src/runtime/NEON/functions/NEIntegralImage.cpp +++ /dev/null @@ -1,40 +0,0 @@ -/* - * Copyright (c) 2016-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "arm_compute/runtime/NEON/functions/NEIntegralImage.h" - -#include "arm_compute/core/NEON/kernels/NEIntegralImageKernel.h" -#include "arm_compute/core/Types.h" -#include "support/MemorySupport.h" - -#include <utility> - -using namespace arm_compute; - -void NEIntegralImage::configure(const ITensor *input, ITensor *output) -{ - auto k = arm_compute::support::cpp14::make_unique<NEIntegralImageKernel>(); - k->configure(input, output); - _kernel = std::move(k); - _border_handler.configure(output, _kernel->border_size(), BorderMode::CONSTANT, PixelValue()); -} diff --git a/src/runtime/NEON/functions/NEL2NormalizeLayer.cpp b/src/runtime/NEON/functions/NEL2NormalizeLayer.cpp index 88ffdbfd08..b7f6203efd 100644 --- a/src/runtime/NEON/functions/NEL2NormalizeLayer.cpp +++ b/src/runtime/NEON/functions/NEL2NormalizeLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2017-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -26,12 +26,17 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/runtime/NEON/NEScheduler.h" +#include "src/common/utils/Log.h" +#include "src/core/NEON/kernels/NEL2NormalizeLayerKernel.h" +#include "src/core/NEON/kernels/NEReductionOperationKernel.h" + namespace arm_compute { namespace { constexpr int max_input_tensor_dim = 3; } // namespace +NEL2NormalizeLayer::~NEL2NormalizeLayer() = default; NEL2NormalizeLayer::NEL2NormalizeLayer(std::shared_ptr<IMemoryManager> memory_manager) : _memory_group(std::move(memory_manager)), _reduce_func(), _normalize_kernel(), _sumsq() @@ -40,13 +45,16 @@ NEL2NormalizeLayer::NEL2NormalizeLayer(std::shared_ptr<IMemoryManager> memory_ma void NEL2NormalizeLayer::configure(ITensor *input, ITensor *output, int axis, float epsilon) { + ARM_COMPUTE_LOG_PARAMS(input, output, axis, epsilon); + // Manage intermediate buffers _memory_group.manage(&_sumsq); // Configure Kernels const uint32_t actual_axis = wrap_around(axis, max_input_tensor_dim); _reduce_func.configure(input, &_sumsq, actual_axis, ReductionOperation::SUM_SQUARE); - _normalize_kernel.configure(input, &_sumsq, output, axis, epsilon); + _normalize_kernel = std::make_unique<NEL2NormalizeLayerKernel>(); + _normalize_kernel->configure(input, &_sumsq, output, axis, epsilon); // Allocate intermediate tensors _sumsq.allocator()->allocate(); @@ -62,7 +70,8 @@ Status NEL2NormalizeLayer::validate(const ITensorInfo *input, const ITensorInfo sum_sq.set_tensor_shape(shape); const uint32_t actual_axis = wrap_around(axis, max_input_tensor_dim); - ARM_COMPUTE_RETURN_ON_ERROR(NEReductionOperation::validate(input, &sum_sq, actual_axis, ReductionOperation::SUM_SQUARE)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEReductionOperation::validate(input, &sum_sq, actual_axis, ReductionOperation::SUM_SQUARE)); // Reduce shape on axis shape.set(actual_axis, 1); @@ -78,6 +87,6 @@ void NEL2NormalizeLayer::run() MemoryGroupResourceScope scope_mg(_memory_group); _reduce_func.run(); - NEScheduler::get().schedule(&_normalize_kernel, Window::DimY); + NEScheduler::get().schedule(_normalize_kernel.get(), Window::DimY); } } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NELSTMLayer.cpp b/src/runtime/NEON/functions/NELSTMLayer.cpp index f9d445fe71..1a08cdeb06 100644 --- a/src/runtime/NEON/functions/NELSTMLayer.cpp +++ b/src/runtime/NEON/functions/NELSTMLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 ARM Limited. + * Copyright (c) 2018-2022 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -24,47 +24,138 @@ #include "arm_compute/runtime/NEON/functions/NELSTMLayer.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/InfoHelpers.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/common/LSTMParams.h" +#include "src/common/utils/Log.h" + namespace arm_compute { using namespace arm_compute::misc::shape_calculator; using namespace arm_compute::utils::info_helpers; +NELSTMLayer::~NELSTMLayer() = default; + NELSTMLayer::NELSTMLayer(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _fully_connected_input_gate(), _accum_input_gate1(), _subtract_input_gate(), _pixelwise_mul_input_gate(), _activation_input_gate(), - _fully_connected_forget_gate(), _accum_forget_gate1(), _pixelwise_mul_forget_gate(), _activation_forget_gate(), _fully_connected_cell_state(), _gemm_cell_state1(), _transpose_cell_state(), - _accum_cell_state1(), _accum_cell_state2(), _pixelwise_mul_cell_state1(), _activation_cell_state(), _cell_clip(), _pixelwise_mul_cell_state2(), _fully_connected_output(), - _pixelwise_mul_output_state1(), _accum_output1(), _activation_output(), _activation_output_state(), _pixelwise_mul_output_state2(), _fully_connected_output_state(), _projection_clip(), - _copy_cell_state(), _copy_output(), _concat_scratch_buffer(), _concat_inputs_forget_gate(), _concat_weights_forget_gate(), _concat_weights_input_gate(), _concat_weights_output(), - _mean_std_norm_input_gate(), _pixelwise_mul_input_gate_coeff(), _accum_input_gate_bias(), _mean_std_norm_forget_gate(), _pixelwise_mul_forget_gate_coeff(), _accum_forget_gate_bias(), - _mean_std_norm_cell_gate(), _pixelwise_mul_cell_gate_coeff(), _accum_cell_gate_bias(), _mean_std_norm_output_gate(), _pixelwise_mul_output_gate_coeff(), _accum_output_gate_bias(), _input_gate_out1(), - _input_gate_out2(), _input_gate_out3(), _input_gate_out4(), _forget_gate_out1(), _forget_gate_out2(), _forget_gate_out3(), _forget_gate_out4(), _forget_gate_out5(), _forget_gate_out6(), - _cell_state_out1(), _cell_state_out2(), _cell_state_out3(), _cell_state_out4(), _cell_state_out5(), _output1(), _output2(), _output3(), _output4(), _cell_state_activation(), _output_state1(), _ones(), - _input_layer_norm_out1(), _input_layer_norm_out2(), _forget_layer_norm_out1(), _forget_layer_norm_out2(), _cell_layer_norm_out1(), _cell_layer_norm_out2(), _output_layer_norm_out1(), - _output_layer_norm_out2(), _run_peephole_opt(false), _run_cifg_opt(false), _perform_cell_clipping(false), _has_projection_weights(false), _perform_projection_clipping(false), _is_prepared(false), + : _memory_group(std::move(memory_manager)), + _fully_connected_input_gate(), + _accum_input_gate1(), + _subtract_input_gate(), + _pixelwise_mul_input_gate(), + _activation_input_gate(), + _fully_connected_forget_gate(), + _accum_forget_gate1(), + _pixelwise_mul_forget_gate(), + _activation_forget_gate(), + _fully_connected_cell_state(), + _gemm_cell_state1(), + _transpose_cell_state(), + _accum_cell_state1(), + _accum_cell_state2(), + _pixelwise_mul_cell_state1(), + _activation_cell_state(), + _cell_clip(), + _pixelwise_mul_cell_state2(), + _fully_connected_output(), + _pixelwise_mul_output_state1(), + _accum_output1(), + _activation_output(), + _activation_output_state(), + _pixelwise_mul_output_state2(), + 
_fully_connected_output_state(), + _projection_clip(), + _copy_cell_state(), + _copy_output(), + _concat_scratch_buffer(), + _concat_inputs_forget_gate(), + _concat_weights_forget_gate(), + _concat_weights_input_gate(), + _concat_weights_output(), + _mean_std_norm_input_gate(), + _pixelwise_mul_input_gate_coeff(), + _accum_input_gate_bias(), + _mean_std_norm_forget_gate(), + _pixelwise_mul_forget_gate_coeff(), + _accum_forget_gate_bias(), + _mean_std_norm_cell_gate(), + _pixelwise_mul_cell_gate_coeff(), + _accum_cell_gate_bias(), + _mean_std_norm_output_gate(), + _pixelwise_mul_output_gate_coeff(), + _accum_output_gate_bias(), + _input_gate_out1(), + _input_gate_out2(), + _input_gate_out3(), + _input_gate_out4(), + _forget_gate_out1(), + _forget_gate_out2(), + _forget_gate_out3(), + _forget_gate_out4(), + _forget_gate_out5(), + _forget_gate_out6(), + _cell_state_out1(), + _cell_state_out2(), + _cell_state_out3(), + _cell_state_out4(), + _cell_state_out5(), + _output1(), + _output2(), + _output3(), + _output4(), + _cell_state_activation(), + _output_state1(), + _ones(), + _input_layer_norm_out1(), + _input_layer_norm_out2(), + _forget_layer_norm_out1(), + _forget_layer_norm_out2(), + _cell_layer_norm_out1(), + _cell_layer_norm_out2(), + _output_layer_norm_out1(), + _output_layer_norm_out2(), + _run_peephole_opt(false), + _run_cifg_opt(false), + _perform_cell_clipping(false), + _has_projection_weights(false), + _perform_projection_clipping(false), + _is_prepared(false), _is_layer_norm_lstm(false) { } -void NELSTMLayer::configure(const ITensor *input, - const ITensor *input_to_forget_weights, const ITensor *input_to_cell_weights, const ITensor *input_to_output_weights, - const ITensor *recurrent_to_forget_weights, const ITensor *recurrent_to_cell_weights, const ITensor *recurrent_to_output_weights, - const ITensor *forget_gate_bias, const ITensor *cell_bias, const ITensor *output_gate_bias, - const ITensor *output_state_in, const ITensor *cell_state_in, - ITensor *scratch_buffer, ITensor *output_state_out, ITensor *cell_state_out, ITensor *output, - const LSTMParams<ITensor> &lstm_params, const ActivationLayerInfo &activation_info, float cell_threshold, float projection_threshold) +void NELSTMLayer::configure(const ITensor *input, + const ITensor *input_to_forget_weights, + const ITensor *input_to_cell_weights, + const ITensor *input_to_output_weights, + const ITensor *recurrent_to_forget_weights, + const ITensor *recurrent_to_cell_weights, + const ITensor *recurrent_to_output_weights, + const ITensor *forget_gate_bias, + const ITensor *cell_bias, + const ITensor *output_gate_bias, + const ITensor *output_state_in, + const ITensor *cell_state_in, + ITensor *scratch_buffer, + ITensor *output_state_out, + ITensor *cell_state_out, + ITensor *output, + const LSTMParams<ITensor> &lstm_params, + const ActivationLayerInfo &activation_info, + float cell_threshold, + float projection_threshold) { - ARM_COMPUTE_ERROR_ON_NULLPTR(input, - input_to_forget_weights, input_to_cell_weights, input_to_output_weights, + ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, - forget_gate_bias, cell_bias, output_gate_bias, - output_state_in, cell_state_in, + forget_gate_bias, cell_bias, output_gate_bias, output_state_in, cell_state_in, scratch_buffer, output_state_out, cell_state_out, output); + ARM_COMPUTE_LOG_PARAMS(input, input_to_forget_weights, input_to_cell_weights, 
input_to_output_weights, + recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, + forget_gate_bias, cell_bias, output_gate_bias, output_state_in, cell_state_in, + scratch_buffer, output_state_out, cell_state_out, output, lstm_params, activation_info, + cell_threshold, projection_threshold); _is_layer_norm_lstm = lstm_params.use_layer_norm(); @@ -73,13 +164,12 @@ void NELSTMLayer::configure(const ITensor *input, build_lstm_params_tensor_info(lstm_params, &lstm_params_info); // Validate - ARM_COMPUTE_ERROR_THROW_ON(NELSTMLayer::validate(input->info(), input_to_forget_weights->info(), - input_to_cell_weights->info(), input_to_output_weights->info(), - recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(), - forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), - output_state_in->info(), cell_state_in->info(), - scratch_buffer->info(), output_state_out->info(), cell_state_out->info(), output->info(), - lstm_params_info, activation_info, cell_threshold, projection_threshold)); + ARM_COMPUTE_ERROR_THROW_ON(NELSTMLayer::validate( + input->info(), input_to_forget_weights->info(), input_to_cell_weights->info(), input_to_output_weights->info(), + recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(), + forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), output_state_in->info(), + cell_state_in->info(), scratch_buffer->info(), output_state_out->info(), cell_state_out->info(), output->info(), + lstm_params_info, activation_info, cell_threshold, projection_threshold)); const TensorShape cell_state_shape = cell_state_in->info()->tensor_shape(); @@ -106,20 +196,23 @@ void NELSTMLayer::configure(const ITensor *input, _concat_weights_forget_gate.configure(weights_vector, &_forget_gate_out6, Window::DimX); _memory_group.manage(&_forget_gate_out5); - _fully_connected_forget_gate.configure(&_forget_gate_out2, &_forget_gate_out6, (_is_layer_norm_lstm) ? nullptr : forget_gate_bias, &_forget_gate_out5); + _fully_connected_forget_gate.configure(&_forget_gate_out2, &_forget_gate_out6, + (_is_layer_norm_lstm) ? 
nullptr : forget_gate_bias, &_forget_gate_out5); _memory_group.manage(&_forget_gate_out1); _memory_group.manage(&_forget_gate_out3); _forget_gate_out6.allocator()->allocate(); Tensor *forget_gate_out = &_forget_gate_out5; - if(lstm_params.has_peephole_opt()) + if (lstm_params.has_peephole_opt()) { _forget_gate_out4.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _run_peephole_opt = true; _memory_group.manage(&_forget_gate_out4); - _pixelwise_mul_forget_gate.configure(cell_state_in, lstm_params.cell_to_forget_weights(), &_forget_gate_out4, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); - _accum_forget_gate1.configure(&_forget_gate_out5, &_forget_gate_out4, &_forget_gate_out3, ConvertPolicy::SATURATE); + _pixelwise_mul_forget_gate.configure(cell_state_in, lstm_params.cell_to_forget_weights(), &_forget_gate_out4, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _accum_forget_gate1.configure(&_forget_gate_out5, &_forget_gate_out4, &_forget_gate_out3, + ConvertPolicy::SATURATE); _forget_gate_out4.allocator()->allocate(); _forget_gate_out5.allocator()->allocate(); forget_gate_out = &_forget_gate_out3; @@ -128,21 +221,25 @@ void NELSTMLayer::configure(const ITensor *input, { _forget_gate_out3.allocator()->allocate(); } - if(_is_layer_norm_lstm) + if (_is_layer_norm_lstm) { _forget_layer_norm_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _forget_layer_norm_out2.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _memory_group.manage(&_forget_layer_norm_out1); _memory_group.manage(&_forget_layer_norm_out2); _mean_std_norm_forget_gate.configure(forget_gate_out); - _pixelwise_mul_forget_gate_coeff.configure(forget_gate_out, lstm_params.forget_layer_norm_weights(), &_forget_layer_norm_out1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _pixelwise_mul_forget_gate_coeff.configure(forget_gate_out, lstm_params.forget_layer_norm_weights(), + &_forget_layer_norm_out1, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); // forget_gate_out is going to be reassigned, so allocate the tensor that it was assigned to before forget_gate_out->allocator()->allocate(); - _accum_forget_gate_bias.configure(&_forget_layer_norm_out1, forget_gate_bias, &_forget_layer_norm_out2, ConvertPolicy::SATURATE); + _accum_forget_gate_bias.configure(&_forget_layer_norm_out1, forget_gate_bias, &_forget_layer_norm_out2, + ConvertPolicy::SATURATE); _forget_layer_norm_out1.allocator()->allocate(); forget_gate_out = &_forget_layer_norm_out2; } - _activation_forget_gate.configure(forget_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _activation_forget_gate.configure(forget_gate_out, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); // Configure block that calculates the input gate // input_gate = Activation(input * input_to_input_weights + output_state * recurrent_to_input_weights + PixelWiseMul(cell_state, cell_to_input_weights) + input_gate_bias), without CIFG @@ -151,7 +248,7 @@ void NELSTMLayer::configure(const ITensor *input, // input_gate = Activation((input,output_state) * (input_to_input_weights,recurrent_to_input_weights) + PixelWiseMul(cell_state, cell_to_input_weights) + input_gate_bias), without CIFG _input_gate_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); Tensor *input_gate_out = &_input_gate_out1; - if(lstm_params.has_cifg_opt()) + if (lstm_params.has_cifg_opt()) { 
_memory_group.manage(&_input_gate_out1); _ones.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); @@ -173,15 +270,19 @@ void NELSTMLayer::configure(const ITensor *input, _memory_group.manage(&_input_gate_out1); _memory_group.manage(&_input_gate_out4); - _fully_connected_input_gate.configure(&_forget_gate_out2, &_input_gate_out2, (_is_layer_norm_lstm) ? nullptr : lstm_params.input_gate_bias(), &_input_gate_out3); + _fully_connected_input_gate.configure(&_forget_gate_out2, &_input_gate_out2, + (_is_layer_norm_lstm) ? nullptr : lstm_params.input_gate_bias(), + &_input_gate_out3); _input_gate_out2.allocator()->allocate(); input_gate_out = &_input_gate_out3; - if(_run_peephole_opt) + if (_run_peephole_opt) { _memory_group.manage(&_input_gate_out4); - _pixelwise_mul_input_gate.configure(cell_state_in, lstm_params.cell_to_input_weights(), &_input_gate_out4, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); - _accum_input_gate1.configure(&_input_gate_out3, &_input_gate_out4, &_input_gate_out1, ConvertPolicy::SATURATE); + _pixelwise_mul_input_gate.configure(cell_state_in, lstm_params.cell_to_input_weights(), &_input_gate_out4, + 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _accum_input_gate1.configure(&_input_gate_out3, &_input_gate_out4, &_input_gate_out1, + ConvertPolicy::SATURATE); _input_gate_out3.allocator()->allocate(); _input_gate_out4.allocator()->allocate(); input_gate_out = &_input_gate_out1; @@ -191,21 +292,25 @@ void NELSTMLayer::configure(const ITensor *input, _input_gate_out1.allocator()->allocate(); } - if(_is_layer_norm_lstm) + if (_is_layer_norm_lstm) { _input_layer_norm_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _input_layer_norm_out2.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _memory_group.manage(&_input_layer_norm_out1); _memory_group.manage(&_input_layer_norm_out2); _mean_std_norm_input_gate.configure(input_gate_out); - _pixelwise_mul_input_gate_coeff.configure(input_gate_out, lstm_params.input_layer_norm_weights(), &_input_layer_norm_out1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _pixelwise_mul_input_gate_coeff.configure(input_gate_out, lstm_params.input_layer_norm_weights(), + &_input_layer_norm_out1, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); // input_gate_out is going to be reassigned, so allocate the tensor that it was assigned to before input_gate_out->allocator()->allocate(); - _accum_input_gate_bias.configure(&_input_layer_norm_out1, lstm_params.input_gate_bias(), &_input_layer_norm_out2, ConvertPolicy::SATURATE); + _accum_input_gate_bias.configure(&_input_layer_norm_out1, lstm_params.input_gate_bias(), + &_input_layer_norm_out2, ConvertPolicy::SATURATE); _input_layer_norm_out1.allocator()->allocate(); input_gate_out = &_input_layer_norm_out2; } - _activation_input_gate.configure(input_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _activation_input_gate.configure(input_gate_out, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); } // Configure block that calculates the cell state @@ -218,7 +323,8 @@ void NELSTMLayer::configure(const ITensor *input, _cell_state_out5.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _memory_group.manage(&_cell_state_out1); - _fully_connected_cell_state.configure(input, input_to_cell_weights, (_is_layer_norm_lstm) ? 
nullptr : cell_bias, &_cell_state_out1); + _fully_connected_cell_state.configure(input, input_to_cell_weights, (_is_layer_norm_lstm) ? nullptr : cell_bias, + &_cell_state_out1); _memory_group.manage(&_cell_state_out2); _transpose_cell_state.configure(recurrent_to_cell_weights, &_cell_state_out2); _memory_group.manage(&_cell_state_out3); @@ -227,33 +333,40 @@ void NELSTMLayer::configure(const ITensor *input, _memory_group.manage(&_cell_state_out4); _accum_cell_state1.configure(&_cell_state_out1, &_cell_state_out3, &_cell_state_out4, ConvertPolicy::SATURATE); Tensor *cell_state_out_ptr = &_cell_state_out4; - if(_is_layer_norm_lstm) + if (_is_layer_norm_lstm) { _cell_layer_norm_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _cell_layer_norm_out2.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _memory_group.manage(&_cell_layer_norm_out1); _memory_group.manage(&_cell_layer_norm_out2); _mean_std_norm_cell_gate.configure(cell_state_out_ptr); - _pixelwise_mul_cell_gate_coeff.configure(cell_state_out_ptr, lstm_params.cell_layer_norm_weights(), &_cell_layer_norm_out1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _pixelwise_mul_cell_gate_coeff.configure(cell_state_out_ptr, lstm_params.cell_layer_norm_weights(), + &_cell_layer_norm_out1, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); // cell_state_out_ptr is going to be reassigned, so allocate the tensor that it was assigned to before cell_state_out_ptr->allocator()->allocate(); - _accum_cell_gate_bias.configure(&_cell_layer_norm_out1, cell_bias, &_cell_layer_norm_out2, ConvertPolicy::SATURATE); + _accum_cell_gate_bias.configure(&_cell_layer_norm_out1, cell_bias, &_cell_layer_norm_out2, + ConvertPolicy::SATURATE); _cell_layer_norm_out1.allocator()->allocate(); cell_state_out_ptr = &_cell_layer_norm_out2; } _activation_cell_state.configure(cell_state_out_ptr, nullptr, activation_info); _memory_group.manage(&_cell_state_out5); - _pixelwise_mul_cell_state1.configure(cell_state_out_ptr, input_gate_out, &_cell_state_out5, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _pixelwise_mul_cell_state1.configure(cell_state_out_ptr, input_gate_out, &_cell_state_out5, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); cell_state_out_ptr->allocator()->allocate(); - _pixelwise_mul_cell_state2.configure(forget_gate_out, cell_state_in, &_cell_state_out3, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _pixelwise_mul_cell_state2.configure(forget_gate_out, cell_state_in, &_cell_state_out3, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); _accum_cell_state2.configure(&_cell_state_out5, &_cell_state_out3, &_cell_state_out1, ConvertPolicy::SATURATE); _cell_state_out3.allocator()->allocate(); _cell_state_out5.allocator()->allocate(); // Perform clipping - if(cell_threshold != 0.f) + if (cell_threshold != 0.f) { _perform_cell_clipping = true; - _cell_clip.configure(&_cell_state_out1, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -cell_threshold, cell_threshold)); + _cell_clip.configure(&_cell_state_out1, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, + cell_threshold, -cell_threshold)); } // Configure block that calculates the output @@ -271,18 +384,20 @@ void NELSTMLayer::configure(const ITensor *input, _memory_group.manage(&_output1); _memory_group.manage(&_output4); - _fully_connected_output.configure(&_forget_gate_out2, &_output2, (_is_layer_norm_lstm) ? 
nullptr : output_gate_bias, &_output4); + _fully_connected_output.configure(&_forget_gate_out2, &_output2, (_is_layer_norm_lstm) ? nullptr : output_gate_bias, + &_output4); _output2.allocator()->allocate(); _forget_gate_out2.allocator()->allocate(); Tensor *output_gate_out = &_output4; - if(lstm_params.has_peephole_opt()) + if (lstm_params.has_peephole_opt()) { _output3.allocator()->init(TensorInfo(_cell_state_out1.info()->tensor_shape(), 1, input->info()->data_type())); _memory_group.manage(&_output3); - _pixelwise_mul_output_state1.configure(&_cell_state_out1, lstm_params.cell_to_output_weights(), &_output3, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _pixelwise_mul_output_state1.configure(&_cell_state_out1, lstm_params.cell_to_output_weights(), &_output3, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); _accum_output1.configure(&_output4, &_output3, &_output1, ConvertPolicy::SATURATE); _output4.allocator()->allocate(); output_gate_out = &_output1; @@ -294,21 +409,25 @@ void NELSTMLayer::configure(const ITensor *input, { _output1.allocator()->allocate(); } - if(_is_layer_norm_lstm) + if (_is_layer_norm_lstm) { _output_layer_norm_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _output_layer_norm_out2.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _memory_group.manage(&_output_layer_norm_out1); _memory_group.manage(&_output_layer_norm_out2); _mean_std_norm_output_gate.configure(output_gate_out); - _pixelwise_mul_output_gate_coeff.configure(output_gate_out, lstm_params.output_layer_norm_weights(), &_output_layer_norm_out1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _pixelwise_mul_output_gate_coeff.configure(output_gate_out, lstm_params.output_layer_norm_weights(), + &_output_layer_norm_out1, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); // output_gate_out is going to be reassigned, so allocate the tensor that it was assigned to before output_gate_out->allocator()->allocate(); - _accum_output_gate_bias.configure(&_output_layer_norm_out1, output_gate_bias, &_output_layer_norm_out2, ConvertPolicy::SATURATE); + _accum_output_gate_bias.configure(&_output_layer_norm_out1, output_gate_bias, &_output_layer_norm_out2, + ConvertPolicy::SATURATE); _output_layer_norm_out1.allocator()->allocate(); output_gate_out = &_output_layer_norm_out2; } - _activation_output.configure(output_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _activation_output.configure(output_gate_out, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); // Configure block that calculates the output state /** lstm_res = PixelwiseMul(output, Activation(cell_state)) @@ -325,20 +444,24 @@ void NELSTMLayer::configure(const ITensor *input, _memory_group.manage(&_cell_state_activation); _activation_output_state.configure(&_cell_state_out1, &_cell_state_activation, activation_info); - _pixelwise_mul_output_state2.configure(&_cell_state_activation, output_gate_out, output_state_out_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _pixelwise_mul_output_state2.configure(&_cell_state_activation, output_gate_out, output_state_out_tmp, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); _cell_state_activation.allocator()->allocate(); output_gate_out->allocator()->allocate(); - if(lstm_params.has_projection()) + if (lstm_params.has_projection()) { _has_projection_weights = true; - _fully_connected_output_state.configure(output_state_out_tmp, 
lstm_params.projection_weights(), lstm_params.projection_bias(), output_state_out); + _fully_connected_output_state.configure(output_state_out_tmp, lstm_params.projection_weights(), + lstm_params.projection_bias(), output_state_out); _output_state1.allocator()->allocate(); // Perform clipping - if(projection_threshold != 0.f) + if (projection_threshold != 0.f) { _perform_projection_clipping = true; - _projection_clip.configure(output_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -projection_threshold, projection_threshold)); + _projection_clip.configure(output_state_out, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, + -projection_threshold, projection_threshold)); } } @@ -347,8 +470,8 @@ void NELSTMLayer::configure(const ITensor *input, _copy_output.configure(output_state_out, output); // Vector for holding the tensors to store in scratch buffer - std::vector<ITensor *> scratch_inputs; - if(!lstm_params.has_cifg_opt()) + std::vector<const ITensor *> scratch_inputs; + if (!lstm_params.has_cifg_opt()) { scratch_inputs.emplace_back(input_gate_out); } @@ -362,29 +485,38 @@ void NELSTMLayer::configure(const ITensor *input, output_gate_out->allocator()->allocate(); } -Status NELSTMLayer::validate(const ITensorInfo *input, - const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights, - const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights, - const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias, - const ITensorInfo *output_state_in, const ITensorInfo *cell_state_in, - const ITensorInfo *scratch_buffer, const ITensorInfo *output_state_out, const ITensorInfo *cell_state_out, const ITensorInfo *output, - const LSTMParams<ITensorInfo> &lstm_params, const ActivationLayerInfo &activation_info, float cell_threshold, float projection_threshold) +Status NELSTMLayer::validate(const ITensorInfo *input, + const ITensorInfo *input_to_forget_weights, + const ITensorInfo *input_to_cell_weights, + const ITensorInfo *input_to_output_weights, + const ITensorInfo *recurrent_to_forget_weights, + const ITensorInfo *recurrent_to_cell_weights, + const ITensorInfo *recurrent_to_output_weights, + const ITensorInfo *forget_gate_bias, + const ITensorInfo *cell_bias, + const ITensorInfo *output_gate_bias, + const ITensorInfo *output_state_in, + const ITensorInfo *cell_state_in, + const ITensorInfo *scratch_buffer, + const ITensorInfo *output_state_out, + const ITensorInfo *cell_state_out, + const ITensorInfo *output, + const LSTMParams<ITensorInfo> &lstm_params, + const ActivationLayerInfo &activation_info, + float cell_threshold, + float projection_threshold) { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, - input_to_forget_weights, input_to_cell_weights, input_to_output_weights, - recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, - forget_gate_bias, cell_bias, output_gate_bias, - output_state_in, cell_state_in, - scratch_buffer, output_state_out, cell_state_out, output); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR( + input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights, + recurrent_to_cell_weights, recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, + output_state_in, cell_state_in, scratch_buffer, output_state_out, 
cell_state_out, output); // Check data types ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, - input_to_forget_weights, input_to_cell_weights, input_to_output_weights, - recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, - forget_gate_bias, cell_bias, output_gate_bias, - output_state_in, cell_state_in, - scratch_buffer, output_state_out, cell_state_out, output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES( + input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights, + recurrent_to_cell_weights, recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, + output_state_in, cell_state_in, scratch_buffer, output_state_out, cell_state_out, output); // Check dimensions ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 2); @@ -403,16 +535,16 @@ Status NELSTMLayer::validate(const ITensorInfo *input, ARM_COMPUTE_RETURN_ERROR_ON(output_state_out->num_dimensions() > 2); ARM_COMPUTE_RETURN_ERROR_ON(cell_state_out->num_dimensions() > 2); ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 2); - ARM_COMPUTE_RETURN_ERROR_ON(cell_bias->dimension(0) * 4 != scratch_buffer->dimension(0) - && cell_bias->dimension(0) * 3 != scratch_buffer->dimension(0)); + ARM_COMPUTE_RETURN_ERROR_ON(cell_bias->dimension(0) * 4 != scratch_buffer->dimension(0) && + cell_bias->dimension(0) * 3 != scratch_buffer->dimension(0)); const unsigned int num_batches = input->dimension(1); const unsigned int num_cells = input_to_output_weights->dimension(1); - if(lstm_params.use_layer_norm()) + if (lstm_params.use_layer_norm()) { // If CIFG is used, input layer normalization weights tensor is omitted - if(lstm_params.has_cifg_opt()) + if (lstm_params.has_cifg_opt()) { ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_layer_norm_weights() != nullptr); } @@ -424,8 +556,12 @@ Status NELSTMLayer::validate(const ITensorInfo *input, ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, lstm_params.input_layer_norm_weights()); } - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.forget_layer_norm_weights(), lstm_params.cell_layer_norm_weights(), lstm_params.output_layer_norm_weights()); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, lstm_params.forget_layer_norm_weights(), lstm_params.cell_layer_norm_weights(), lstm_params.output_layer_norm_weights()); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.forget_layer_norm_weights(), + lstm_params.cell_layer_norm_weights(), + lstm_params.output_layer_norm_weights()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, lstm_params.forget_layer_norm_weights(), + lstm_params.cell_layer_norm_weights(), + lstm_params.output_layer_norm_weights()); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.forget_layer_norm_weights()->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_layer_norm_weights()->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.output_layer_norm_weights()->num_dimensions() > 1); @@ -435,7 +571,7 @@ Status NELSTMLayer::validate(const ITensorInfo *input, } // Check peephole optimization - if(lstm_params.has_peephole_opt()) + if (lstm_params.has_peephole_opt()) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_output_weights(), lstm_params.cell_to_forget_weights()); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_forget_weights()->num_dimensions() > 1); @@ -455,33 +591,39 @@ Status NELSTMLayer::validate(const ITensorInfo 
*input, std::vector<const ITensorInfo *> inputs_vector; inputs_vector.emplace_back(input); inputs_vector.emplace_back(output_state_in); - const TensorShape concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, 0); + const TensorShape concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, 0); TensorInfo forget_gate_concat = TensorInfo(concat_shape, 1, input->data_type()); ARM_COMPUTE_RETURN_ON_ERROR(NEConcatenateLayer::validate(inputs_vector, &forget_gate_concat, Window::DimX)); // Validate forget gate - ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(input, input_to_forget_weights, (lstm_params.use_layer_norm()) ? nullptr : forget_gate_bias, &forget_gate)); + ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate( + input, input_to_forget_weights, (lstm_params.use_layer_norm()) ? nullptr : forget_gate_bias, &forget_gate)); - if(lstm_params.has_peephole_opt()) + if (lstm_params.has_peephole_opt()) { - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &forget_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &forget_gate, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEArithmeticAddition::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE)); } - if(lstm_params.use_layer_norm()) + if (lstm_params.use_layer_norm()) { ARM_COMPUTE_RETURN_ON_ERROR(NEMeanStdDevNormalizationLayer::validate(&forget_gate)); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(&forget_gate, lstm_params.forget_layer_norm_weights(), &forget_gate, 1, ConvertPolicy::SATURATE, - RoundingPolicy::TO_ZERO)); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&forget_gate, forget_gate_bias, &forget_gate, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEPixelWiseMultiplication::validate(&forget_gate, lstm_params.forget_layer_norm_weights(), &forget_gate, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEArithmeticAddition::validate(&forget_gate, forget_gate_bias, &forget_gate, ConvertPolicy::SATURATE)); } - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayerKernel::validate(&forget_gate, &forget_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); + ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate( + &forget_gate, &forget_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); // Validate input gate - if(!lstm_params.has_cifg_opt()) + if (!lstm_params.has_cifg_opt()) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.input_to_input_weights(), - lstm_params.recurrent_to_input_weights(), - lstm_params.input_gate_bias()); + lstm_params.recurrent_to_input_weights(), lstm_params.input_gate_bias()); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_to_input_weights()->num_dimensions() > 2); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.recurrent_to_input_weights()->num_dimensions() > 2); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_gate_bias()->num_dimensions() > 1); @@ -489,98 +631,130 @@ Status NELSTMLayer::validate(const ITensorInfo *input, std::vector<const ITensorInfo *> lstm_weights; 
lstm_weights.emplace_back(lstm_params.input_to_input_weights()); lstm_weights.emplace_back(lstm_params.recurrent_to_input_weights()); - TensorShape lstm_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(lstm_weights, 0); - TensorInfo lstm_gate_concat = TensorInfo(lstm_weights_concat_shape, 1, input->data_type()); + TensorShape lstm_weights_concat_shape = + arm_compute::misc::shape_calculator::calculate_concatenate_shape(lstm_weights, 0); + TensorInfo lstm_gate_concat = TensorInfo(lstm_weights_concat_shape, 1, input->data_type()); ARM_COMPUTE_RETURN_ON_ERROR(NEConcatenateLayer::validate(lstm_weights, &lstm_gate_concat, Window::DimX)); - ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(input, lstm_params.input_to_input_weights(), (lstm_params.use_layer_norm()) ? nullptr : lstm_params.input_gate_bias(), &input_gate)); + ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate( + input, lstm_params.input_to_input_weights(), + (lstm_params.use_layer_norm()) ? nullptr : lstm_params.input_gate_bias(), &input_gate)); - if(lstm_params.has_peephole_opt()) + if (lstm_params.has_peephole_opt()) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_input_weights()); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_input_weights()->num_dimensions() > 1); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(cell_state_in, lstm_params.cell_to_input_weights(), &input_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&input_gate, &input_gate, &input_gate, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_input_weights(), &input_gate, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEArithmeticAddition::validate(&input_gate, &input_gate, &input_gate, ConvertPolicy::SATURATE)); } - if(lstm_params.use_layer_norm()) + if (lstm_params.use_layer_norm()) { ARM_COMPUTE_RETURN_ON_ERROR(NEMeanStdDevNormalizationLayer::validate(&input_gate)); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(&input_gate, lstm_params.input_layer_norm_weights(), &input_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&input_gate, lstm_params.input_gate_bias(), &input_gate, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEPixelWiseMultiplication::validate(&input_gate, lstm_params.input_layer_norm_weights(), &input_gate, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&input_gate, lstm_params.input_gate_bias(), + &input_gate, ConvertPolicy::SATURATE)); } - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayerKernel::validate(&input_gate, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); + ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate( + &input_gate, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); } else { - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticSubtractionKernel::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEArithmeticSubtraction::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE)); } // Validate cell state - ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(input, input_to_cell_weights, (lstm_params.use_layer_norm()) ? 
nullptr : cell_bias, &cell_state_tmp)); - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMM::validate(output_state_in, &units_out_transposed_info, nullptr, &cell_state_tmp, 1.f, 0.f, GEMMInfo())); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&cell_state_tmp, &cell_state_tmp, &cell_state_tmp, ConvertPolicy::SATURATE)); - if(lstm_params.use_layer_norm()) + ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate( + input, input_to_cell_weights, (lstm_params.use_layer_norm()) ? nullptr : cell_bias, &cell_state_tmp)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEGEMM::validate(output_state_in, &units_out_transposed_info, nullptr, &cell_state_tmp, 1.f, 0.f, GEMMInfo())); + ARM_COMPUTE_RETURN_ON_ERROR( + NEArithmeticAddition::validate(&cell_state_tmp, &cell_state_tmp, &cell_state_tmp, ConvertPolicy::SATURATE)); + if (lstm_params.use_layer_norm()) { ARM_COMPUTE_RETURN_ON_ERROR(NEMeanStdDevNormalizationLayer::validate(&cell_state_tmp)); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(&cell_state_tmp, lstm_params.cell_layer_norm_weights(), &cell_state_tmp, 1, ConvertPolicy::SATURATE, - RoundingPolicy::TO_ZERO)); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&cell_state_tmp, cell_bias, &cell_state_tmp, ConvertPolicy::SATURATE)); - } - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayerKernel::validate(&cell_state_tmp, nullptr, activation_info)); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(&cell_state_tmp, &input_gate, &cell_state_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(&cell_state_tmp, &forget_gate, &cell_state_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&cell_state_tmp, &cell_state_tmp, &cell_state_tmp, ConvertPolicy::SATURATE)); - if(cell_threshold != 0.f) - { - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayerKernel::validate(&cell_state_tmp, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -cell_threshold, - cell_threshold))); + ARM_COMPUTE_RETURN_ON_ERROR( + NEPixelWiseMultiplication::validate(&cell_state_tmp, lstm_params.cell_layer_norm_weights(), &cell_state_tmp, + 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEArithmeticAddition::validate(&cell_state_tmp, cell_bias, &cell_state_tmp, ConvertPolicy::SATURATE)); + } + ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&cell_state_tmp, nullptr, activation_info)); + ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&cell_state_tmp, &input_gate, &cell_state_tmp, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&cell_state_tmp, &forget_gate, &cell_state_tmp, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEArithmeticAddition::validate(&cell_state_tmp, &cell_state_tmp, &cell_state_tmp, ConvertPolicy::SATURATE)); + if (cell_threshold != 0.f) + { + ARM_COMPUTE_RETURN_ON_ERROR( + NEActivationLayer::validate(&cell_state_tmp, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, + cell_threshold, -cell_threshold))); } // Validate output gate tmp std::vector<const ITensorInfo *> in_out_weights; in_out_weights.emplace_back(input_to_output_weights); in_out_weights.emplace_back(recurrent_to_output_weights); - TensorShape in_out_weights_concat_shape = 
arm_compute::misc::shape_calculator::calculate_concatenate_shape(in_out_weights, 0); - TensorInfo in_out_gate_concat = TensorInfo(in_out_weights_concat_shape, 1, input->data_type()); + TensorShape in_out_weights_concat_shape = + arm_compute::misc::shape_calculator::calculate_concatenate_shape(in_out_weights, 0); + TensorInfo in_out_gate_concat = TensorInfo(in_out_weights_concat_shape, 1, input->data_type()); ARM_COMPUTE_RETURN_ON_ERROR(NEConcatenateLayer::validate(in_out_weights, &in_out_gate_concat, Window::DimX)); - ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(input, input_to_output_weights, (lstm_params.use_layer_norm()) ? nullptr : output_gate_bias, &output_gate_tmp)); + ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate( + input, input_to_output_weights, (lstm_params.use_layer_norm()) ? nullptr : output_gate_bias, &output_gate_tmp)); - if(lstm_params.has_peephole_opt()) + if (lstm_params.has_peephole_opt()) { - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(&cell_state_tmp, lstm_params.cell_to_output_weights(), &output_gate_tmp, 1, ConvertPolicy::SATURATE, - RoundingPolicy::TO_ZERO)); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&output_gate_tmp, &output_gate_tmp, &output_gate_tmp, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEPixelWiseMultiplication::validate(&cell_state_tmp, lstm_params.cell_to_output_weights(), &output_gate_tmp, + 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&output_gate_tmp, &output_gate_tmp, &output_gate_tmp, + ConvertPolicy::SATURATE)); } - if(lstm_params.use_layer_norm()) + if (lstm_params.use_layer_norm()) { ARM_COMPUTE_RETURN_ON_ERROR(NEMeanStdDevNormalizationLayer::validate(&output_gate_tmp)); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(&output_gate_tmp, lstm_params.output_layer_norm_weights(), &output_gate_tmp, 1, ConvertPolicy::SATURATE, - RoundingPolicy::TO_ZERO)); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&output_gate_tmp, output_gate_bias, &output_gate_tmp, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEPixelWiseMultiplication::validate(&output_gate_tmp, lstm_params.output_layer_norm_weights(), + &output_gate_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&output_gate_tmp, output_gate_bias, &output_gate_tmp, + ConvertPolicy::SATURATE)); } - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayerKernel::validate(&output_gate_tmp, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); + ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate( + &output_gate_tmp, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); // Validate output state - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayerKernel::validate(&cell_state_tmp, &cell_state_tmp, activation_info)); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(&cell_state_tmp, &output_gate_tmp, &output_gate_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); - if(lstm_params.has_projection()) - { - ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(&output_gate_tmp, lstm_params.projection_weights(), lstm_params.projection_bias(), output_state_out)); - if(projection_threshold != 0.f) + ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&cell_state_tmp, &cell_state_tmp, activation_info)); + 
ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate( + &cell_state_tmp, &output_gate_tmp, &output_gate_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + if (lstm_params.has_projection()) + { + ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(&output_gate_tmp, lstm_params.projection_weights(), + lstm_params.projection_bias(), output_state_out)); + if (projection_threshold != 0.f) { - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayerKernel::validate(output_state_out, output_state_out, - ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -projection_threshold, projection_threshold))); + ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate( + output_state_out, output_state_out, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -projection_threshold, + projection_threshold))); } } // Validate copy kernel - ARM_COMPUTE_RETURN_ON_ERROR(NECopyKernel::validate(&cell_state_tmp, cell_state_out)); - ARM_COMPUTE_RETURN_ON_ERROR(NECopyKernel::validate(output_state_out, output)); + ARM_COMPUTE_RETURN_ON_ERROR(NECopy::validate(&cell_state_tmp, cell_state_out)); + ARM_COMPUTE_RETURN_ON_ERROR(NECopy::validate(output_state_out, output)); // Validate scratch concatenation - std::vector<ITensorInfo *> inputs_vector_info_raw; - if(!lstm_params.has_cifg_opt()) + std::vector<const ITensorInfo *> inputs_vector_info_raw; + if (!lstm_params.has_cifg_opt()) { inputs_vector_info_raw.push_back(&input_gate); } @@ -601,108 +775,111 @@ void NELSTMLayer::run() _concat_inputs_forget_gate.run(); _fully_connected_forget_gate.run(); - if(_run_peephole_opt) + if (_run_peephole_opt) { - NEScheduler::get().schedule(&_pixelwise_mul_forget_gate, Window::DimY); + _pixelwise_mul_forget_gate.run(); _accum_forget_gate1.run(); } - if(_is_layer_norm_lstm) + if (_is_layer_norm_lstm) { _mean_std_norm_forget_gate.run(); - NEScheduler::get().schedule(&_pixelwise_mul_forget_gate_coeff, Window::DimY); - NEScheduler::get().schedule(&_accum_forget_gate_bias, Window::DimY); + _pixelwise_mul_forget_gate_coeff.run(); + _accum_forget_gate_bias.run(); } - NEScheduler::get().schedule(&_activation_forget_gate, Window::DimY); + _activation_forget_gate.run(); - if(_run_cifg_opt) + if (_run_cifg_opt) { - if(_ones.info()->data_type() == DataType::F16) + if (_ones.info()->data_type() == DataType::F16) { - std::fill_n(reinterpret_cast<half *>(_ones.buffer()), _ones.info()->total_size() / _ones.info()->element_size(), 1); + std::fill_n(reinterpret_cast<half *>(_ones.buffer()), + _ones.info()->total_size() / _ones.info()->element_size(), 1); } else { - std::fill_n(reinterpret_cast<float *>(_ones.buffer()), _ones.info()->total_size() / _ones.info()->element_size(), 1); + std::fill_n(reinterpret_cast<float *>(_ones.buffer()), + _ones.info()->total_size() / _ones.info()->element_size(), 1); } - NEScheduler::get().schedule(&_subtract_input_gate, Window::DimY); + _subtract_input_gate.run(); } else { _fully_connected_input_gate.run(); - if(_run_peephole_opt) + if (_run_peephole_opt) { - NEScheduler::get().schedule(&_pixelwise_mul_input_gate, Window::DimY); + _pixelwise_mul_input_gate.run(); _accum_input_gate1.run(); } - if(_is_layer_norm_lstm) + if (_is_layer_norm_lstm) { _mean_std_norm_input_gate.run(); - NEScheduler::get().schedule(&_pixelwise_mul_input_gate_coeff, Window::DimY); - NEScheduler::get().schedule(&_accum_input_gate_bias, Window::DimY); + _pixelwise_mul_input_gate_coeff.run(); + _accum_input_gate_bias.run(); } - 
NEScheduler::get().schedule(&_activation_input_gate, Window::DimY); + _activation_input_gate.run(); } _fully_connected_cell_state.run(); - NEScheduler::get().schedule(&_transpose_cell_state, Window::DimY); + _transpose_cell_state.run(); _gemm_cell_state1.run(); - NEScheduler::get().schedule(&_accum_cell_state1, Window::DimY); - if(_is_layer_norm_lstm) + _accum_cell_state1.run(); + if (_is_layer_norm_lstm) { _mean_std_norm_cell_gate.run(); - NEScheduler::get().schedule(&_pixelwise_mul_cell_gate_coeff, Window::DimY); - NEScheduler::get().schedule(&_accum_cell_gate_bias, Window::DimY); + _pixelwise_mul_cell_gate_coeff.run(); + _accum_cell_gate_bias.run(); } - NEScheduler::get().schedule(&_activation_cell_state, Window::DimY); - NEScheduler::get().schedule(&_pixelwise_mul_cell_state1, Window::DimY); - NEScheduler::get().schedule(&_pixelwise_mul_cell_state2, Window::DimY); - NEScheduler::get().schedule(&_accum_cell_state2, Window::DimY); - if(_perform_cell_clipping) + _activation_cell_state.run(); + _pixelwise_mul_cell_state1.run(); + _pixelwise_mul_cell_state2.run(); + _accum_cell_state2.run(); + + if (_perform_cell_clipping) { - NEScheduler::get().schedule(&_cell_clip, Window::DimY); + _cell_clip.run(); } _fully_connected_output.run(); - if(_run_peephole_opt) + if (_run_peephole_opt) { - NEScheduler::get().schedule(&_pixelwise_mul_output_state1, Window::DimY); + _pixelwise_mul_output_state1.run(); _accum_output1.run(); } - if(_is_layer_norm_lstm) + if (_is_layer_norm_lstm) { _mean_std_norm_output_gate.run(); - NEScheduler::get().schedule(&_pixelwise_mul_output_gate_coeff, Window::DimY); - NEScheduler::get().schedule(&_accum_output_gate_bias, Window::DimY); + _pixelwise_mul_output_gate_coeff.run(); + _accum_output_gate_bias.run(); } - NEScheduler::get().schedule(&_activation_output, Window::DimY); + _activation_output.run(); - NEScheduler::get().schedule(&_activation_output_state, Window::DimY); - NEScheduler::get().schedule(&_pixelwise_mul_output_state2, Window::DimY); + _activation_output_state.run(); + _pixelwise_mul_output_state2.run(); - if(_has_projection_weights) + if (_has_projection_weights) { _fully_connected_output_state.run(); - if(_perform_projection_clipping) + if (_perform_projection_clipping) { - NEScheduler::get().schedule(&_projection_clip, Window::DimY); + _projection_clip.run(); } } - NEScheduler::get().schedule(&_copy_cell_state, Window::DimY); - NEScheduler::get().schedule(&_copy_output, Window::DimY); + _copy_cell_state.run(); + _copy_output.run(); _concat_scratch_buffer.run(); } void NELSTMLayer::prepare() { - if(!_is_prepared) + if (!_is_prepared) { _concat_weights_forget_gate.run(); - if(!_run_cifg_opt) + if (!_run_cifg_opt) { _concat_weights_input_gate.run(); } diff --git a/src/runtime/NEON/functions/NELSTMLayerQuantized.cpp b/src/runtime/NEON/functions/NELSTMLayerQuantized.cpp index cdfc035400..41f9c3d700 100644 --- a/src/runtime/NEON/functions/NELSTMLayerQuantized.cpp +++ b/src/runtime/NEON/functions/NELSTMLayerQuantized.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 ARM Limited. + * Copyright (c) 2019-2021 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -24,8 +24,11 @@ #include "arm_compute/runtime/NEON/functions/NELSTMLayerQuantized.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/Validate.h" + +#include "src/common/utils/Log.h" +#include "src/core/helpers/AutoConfiguration.h" #include <cmath> #include <memory> @@ -41,34 +44,107 @@ const QuantizationInfo qsymm_3(8.f / 32768.f, 0); // qsymm16 with 3 integer bit const QuantizationInfo qsymm_4(16.f / 32768.f, 0); // qsymm16 with 4 integer bit const QuantizationInfo qsymm_0(1.f / 32768.f, 0); // qsymm16 with 0 integer bit } // namespace +NELSTMLayerQuantized::~NELSTMLayerQuantized() = default; NELSTMLayerQuantized::NELSTMLayerQuantized(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _gemmlowp(), _output_stage(), _transpose_weights(), _concat_input_weights(), _concat_recurrent_weights(), _concat_weights(), _concat_inputs(), - _concat_bias(), _sigmoid_forget_gate(), _sigmoid_input_gate(), _sigmoid_output_gate(), _tanh_modulation_gate(), _tanh_output_state(), _add1(), _add2(), _mul1(), _mul2(), _mul3(), - _slice_input_tensor(), _slice_forget_tensor(), _slice_cell_tensor(), _slice_output_tensor(), _dequantize(), _quantize(), _input_to_input_weights(nullptr), _input_to_forget_weights(nullptr), - _input_to_cell_weights(nullptr), _input_to_output_weights(nullptr), _recurrent_to_input_weights(nullptr), _recurrent_to_forget_weights(nullptr), _recurrent_to_cell_weights(nullptr), - _recurrent_to_output_weights(nullptr), _input_gate_bias(nullptr), _forget_gate_bias(nullptr), _cell_bias(nullptr), _output_gate_bias(nullptr), _recurrent_weights(), _input_weights(), _weights(), - _input(), _weights_transposed(), _output_highp(), _output_lowp(), _bias(), _forget_gate_input(), _input_gate_input(), _output_gate_input(), _input_modulation_gate_input(), _forget_gate_output(), - _input_gate_output(), _output_gate_output(), _input_modulation_gate_output(), _cell_state1(), _cell_state2(), _output_state_tmp(), _output_state_out_symm(), _output_state_out_f32(), + : _memory_group(std::move(memory_manager)), + _gemmlowp(), + _output_stage(), + _transpose_weights(), + _concat_input_weights(), + _concat_recurrent_weights(), + _concat_weights(), + _concat_inputs(), + _concat_bias(), + _sigmoid_forget_gate(), + _sigmoid_input_gate(), + _sigmoid_output_gate(), + _tanh_modulation_gate(), + _tanh_output_state(), + _add1(), + _add2(), + _mul1(), + _mul2(), + _mul3(), + _slice_input_tensor(), + _slice_forget_tensor(), + _slice_cell_tensor(), + _slice_output_tensor(), + _dequantize(), + _quantize(), + _input_to_input_weights(nullptr), + _input_to_forget_weights(nullptr), + _input_to_cell_weights(nullptr), + _input_to_output_weights(nullptr), + _recurrent_to_input_weights(nullptr), + _recurrent_to_forget_weights(nullptr), + _recurrent_to_cell_weights(nullptr), + _recurrent_to_output_weights(nullptr), + _input_gate_bias(nullptr), + _forget_gate_bias(nullptr), + _cell_bias(nullptr), + _output_gate_bias(nullptr), + _recurrent_weights(), + _input_weights(), + _weights(), + _input(), + _weights_transposed(), + _output_highp(), + _output_lowp(), + _bias(), + _forget_gate_input(), + _input_gate_input(), + _output_gate_input(), + _input_modulation_gate_input(), + _forget_gate_output(), + _input_gate_output(), + _output_gate_output(), + _input_modulation_gate_output(), + _cell_state1(), + _cell_state2(), + _output_state_tmp(), + 
_output_state_out_symm(), + _output_state_out_f32(), _is_prepared(false) { } void NELSTMLayerQuantized::configure(const ITensor *input, - const ITensor *input_to_input_weights, const ITensor *input_to_forget_weights, const ITensor *input_to_cell_weights, const ITensor *input_to_output_weights, - const ITensor *recurrent_to_input_weights, const ITensor *recurrent_to_forget_weights, const ITensor *recurrent_to_cell_weights, const ITensor *recurrent_to_output_weights, - const ITensor *input_gate_bias, const ITensor *forget_gate_bias, const ITensor *cell_bias, const ITensor *output_gate_bias, - ITensor *cell_state_in, const ITensor *output_state_in, - ITensor *cell_state_out, ITensor *output_state_out) + const ITensor *input_to_input_weights, + const ITensor *input_to_forget_weights, + const ITensor *input_to_cell_weights, + const ITensor *input_to_output_weights, + const ITensor *recurrent_to_input_weights, + const ITensor *recurrent_to_forget_weights, + const ITensor *recurrent_to_cell_weights, + const ITensor *recurrent_to_output_weights, + const ITensor *input_gate_bias, + const ITensor *forget_gate_bias, + const ITensor *cell_bias, + const ITensor *output_gate_bias, + ITensor *cell_state_in, + const ITensor *output_state_in, + ITensor *cell_state_out, + ITensor *output_state_out) { - ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, - recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, - input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out, output_state_out); - - ARM_COMPUTE_ERROR_THROW_ON(NELSTMLayerQuantized::validate(input->info(), input_to_input_weights->info(), input_to_forget_weights->info(), input_to_cell_weights->info(), - input_to_output_weights->info(), - recurrent_to_input_weights->info(), recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(), - input_gate_bias->info(), forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), cell_state_in->info(), output_state_in->info(), cell_state_out->info(), output_state_out->info())); + ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, + input_to_output_weights, recurrent_to_input_weights, recurrent_to_forget_weights, + recurrent_to_cell_weights, recurrent_to_output_weights, input_gate_bias, + forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, + cell_state_out, output_state_out); + + ARM_COMPUTE_ERROR_THROW_ON(NELSTMLayerQuantized::validate( + input->info(), input_to_input_weights->info(), input_to_forget_weights->info(), input_to_cell_weights->info(), + input_to_output_weights->info(), recurrent_to_input_weights->info(), recurrent_to_forget_weights->info(), + recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(), input_gate_bias->info(), + forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), cell_state_in->info(), + output_state_in->info(), cell_state_out->info(), output_state_out->info())); + + ARM_COMPUTE_LOG_PARAMS(input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, + input_to_output_weights, recurrent_to_input_weights, recurrent_to_forget_weights, + recurrent_to_cell_weights, recurrent_to_output_weights, input_gate_bias, forget_gate_bias, + cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out, + 
output_state_out); const int input_size = input->info()->dimension(0); const int batch_size = input->info()->dimension(1); @@ -76,8 +152,10 @@ void NELSTMLayerQuantized::configure(const ITensor *input, const QuantizationInfo qweights = input_to_input_weights->info()->quantization_info(); // Weights quantization - auto_init_if_empty(*cell_state_out->info(), TensorInfo(TensorShape(batch_size, output_size), 1, DataType::QSYMM16, qsymm_4)); - auto_init_if_empty(*output_state_out->info(), TensorInfo(TensorShape(batch_size, output_size), 1, DataType::QASYMM8, qasymm)); + auto_init_if_empty(*cell_state_out->info(), + TensorInfo(TensorShape(batch_size, output_size), 1, DataType::QSYMM16, qsymm_4)); + auto_init_if_empty(*output_state_out->info(), + TensorInfo(TensorShape(batch_size, output_size), 1, DataType::QASYMM8, qasymm)); _input_to_input_weights = input_to_input_weights; _input_to_forget_weights = input_to_forget_weights; @@ -93,34 +171,41 @@ void NELSTMLayerQuantized::configure(const ITensor *input, _output_gate_bias = output_gate_bias; // Weights concatenation - std::vector<const ITensor *> inputs_weights_vector{ input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights }; - std::vector<const ITensor *> recurrent_weights_vector{ recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights }; + std::vector<const ITensor *> inputs_weights_vector{input_to_input_weights, input_to_forget_weights, + input_to_cell_weights, input_to_output_weights}; + std::vector<const ITensor *> recurrent_weights_vector{recurrent_to_input_weights, recurrent_to_forget_weights, + recurrent_to_cell_weights, recurrent_to_output_weights}; - _input_weights.allocator()->init(TensorInfo(TensorShape(input_size, 4 * output_size), 1, DataType::QASYMM8, qweights)); + _input_weights.allocator()->init( + TensorInfo(TensorShape(input_size, 4 * output_size), 1, DataType::QASYMM8, qweights)); _concat_input_weights.configure(inputs_weights_vector, &_input_weights, Window::DimY); - _recurrent_weights.allocator()->init(TensorInfo(TensorShape(output_size, 4 * output_size), 1, DataType::QASYMM8, qweights)); + _recurrent_weights.allocator()->init( + TensorInfo(TensorShape(output_size, 4 * output_size), 1, DataType::QASYMM8, qweights)); _concat_recurrent_weights.configure(recurrent_weights_vector, &_recurrent_weights, Window::DimY); - std::vector<const ITensor *> weights_vector{ &_recurrent_weights, &_input_weights }; - _weights.allocator()->init(TensorInfo(TensorShape(output_size + input_size, 4 * output_size), 1, DataType::QASYMM8, qweights)); + std::vector<const ITensor *> weights_vector{&_recurrent_weights, &_input_weights}; + _weights.allocator()->init( + TensorInfo(TensorShape(output_size + input_size, 4 * output_size), 1, DataType::QASYMM8, qweights)); _concat_weights.configure(weights_vector, &_weights, Window::DimX); _transpose_weights.configure(&_weights, &_weights_transposed); // Input concatenation - std::vector<const ITensor *> input_vector{ input, output_state_in }; + std::vector<const ITensor *> input_vector{input, output_state_in}; _memory_group.manage(&_input); - _input.allocator()->init(TensorInfo(TensorShape(output_size + input_size, batch_size), 1, DataType::QASYMM8, qasymm)); + _input.allocator()->init( + TensorInfo(TensorShape(output_size + input_size, batch_size), 1, DataType::QASYMM8, qasymm)); _concat_inputs.configure(input_vector, &_input, Window::DimX); // Bias concatenation - std::vector<const ITensor *> bias_vector{ 
input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias }; + std::vector<const ITensor *> bias_vector{input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias}; _bias.allocator()->init(TensorInfo(TensorShape(4 * output_size), 1, DataType::S32)); _concat_bias.configure(bias_vector, &_bias, Window::DimX); // Invert the offset for gemmlowp _input.info()->set_quantization_info(QuantizationInfo(qasymm.uniform().scale, -qasymm.uniform().offset)); - _weights_transposed.info()->set_quantization_info(QuantizationInfo(qweights.uniform().scale, -qweights.uniform().offset)); + _weights_transposed.info()->set_quantization_info( + QuantizationInfo(qweights.uniform().scale, -qweights.uniform().offset)); // Run gemmlowp _memory_group.manage(&_output_highp); @@ -130,7 +215,8 @@ void NELSTMLayerQuantized::configure(const ITensor *input, // Set the offset back _input.info()->set_quantization_info(QuantizationInfo(qasymm.uniform().scale, qasymm.uniform().offset)); - _weights_transposed.info()->set_quantization_info(QuantizationInfo(qweights.uniform().scale, qweights.uniform().offset)); + _weights_transposed.info()->set_quantization_info( + QuantizationInfo(qweights.uniform().scale, qweights.uniform().offset)); // multiplier = (input_scale * weights_scale) / output_scale (2 ^ (-12)) _output_lowp.allocator()->init(TensorInfo(_output_highp.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_3)); @@ -141,69 +227,91 @@ void NELSTMLayerQuantized::configure(const ITensor *input, quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift); _memory_group.manage(&_output_lowp); - _output_stage.configure(&_output_highp, &_bias, &_output_lowp, output_multiplier, output_shift); + + GEMMLowpOutputStageInfo info; + info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT; + info.gemmlowp_multiplier = output_multiplier; + info.gemmlowp_shift = output_shift; + info.output_data_type = DataType::QSYMM16; + _output_stage.configure(&_output_highp, &_bias, &_output_lowp, info); _output_highp.allocator()->allocate(); _bias.allocator()->allocate(); // Get the gate tensors - if(batch_size > 1) + if (batch_size > 1) { _memory_group.manage(&_input_gate_input); - _slice_input_tensor.configure(&_output_lowp, &_input_gate_input, { 0, 0 }, { output_size, batch_size }); + _slice_input_tensor.configure(&_output_lowp, &_input_gate_input, {0, 0}, {output_size, batch_size}); _memory_group.manage(&_forget_gate_input); - _slice_forget_tensor.configure(&_output_lowp, &_forget_gate_input, { output_size, 0 }, { 2 * output_size, batch_size }); + _slice_forget_tensor.configure(&_output_lowp, &_forget_gate_input, {output_size, 0}, + {2 * output_size, batch_size}); _memory_group.manage(&_input_modulation_gate_input); - _slice_cell_tensor.configure(&_output_lowp, &_input_modulation_gate_input, { 2 * output_size, 0 }, { 3 * output_size, batch_size }); + _slice_cell_tensor.configure(&_output_lowp, &_input_modulation_gate_input, {2 * output_size, 0}, + {3 * output_size, batch_size}); _memory_group.manage(&_output_gate_input); - _slice_output_tensor.configure(&_output_lowp, &_output_gate_input, { 3 * output_size, 0 }, { 4 * output_size, batch_size }); + _slice_output_tensor.configure(&_output_lowp, &_output_gate_input, {3 * output_size, 0}, + {4 * output_size, batch_size}); _output_lowp.allocator()->allocate(); } else { _memory_group.manage(&_input_gate_input); - _slice_input_tensor.configure(&_output_lowp, &_input_gate_input, { 0 }, { output_size }); + _slice_input_tensor.configure(&_output_lowp, 
&_input_gate_input, {0}, {output_size}); _memory_group.manage(&_forget_gate_input); - _slice_forget_tensor.configure(&_output_lowp, &_forget_gate_input, { output_size }, { 2 * output_size }); + _slice_forget_tensor.configure(&_output_lowp, &_forget_gate_input, {output_size}, {2 * output_size}); _memory_group.manage(&_input_modulation_gate_input); - _slice_cell_tensor.configure(&_output_lowp, &_input_modulation_gate_input, { 2 * output_size }, { 3 * output_size }); + _slice_cell_tensor.configure(&_output_lowp, &_input_modulation_gate_input, {2 * output_size}, + {3 * output_size}); _memory_group.manage(&_output_gate_input); - _slice_output_tensor.configure(&_output_lowp, &_output_gate_input, { 3 * output_size }, { 4 * output_size }); + _slice_output_tensor.configure(&_output_lowp, &_output_gate_input, {3 * output_size}, {4 * output_size}); _output_lowp.allocator()->allocate(); } // Forget gate _memory_group.manage(&_forget_gate_output); - _forget_gate_output.allocator()->init(TensorInfo(_forget_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); - _sigmoid_forget_gate.configure(&_forget_gate_input, &_forget_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _forget_gate_output.allocator()->init( + TensorInfo(_forget_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); + _sigmoid_forget_gate.configure(&_forget_gate_input, &_forget_gate_output, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); _forget_gate_input.allocator()->allocate(); // Input gate _memory_group.manage(&_input_gate_output); - _input_gate_output.allocator()->init(TensorInfo(_input_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); - _sigmoid_input_gate.configure(&_input_gate_input, &_input_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _input_gate_output.allocator()->init( + TensorInfo(_input_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); + _sigmoid_input_gate.configure(&_input_gate_input, &_input_gate_output, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); _input_gate_input.allocator()->allocate(); // Input modulation gate equation _memory_group.manage(&_input_modulation_gate_output); - _input_modulation_gate_output.allocator()->init(TensorInfo(_input_modulation_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); - _tanh_modulation_gate.configure(&_input_modulation_gate_input, &_input_modulation_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f)); + _input_modulation_gate_output.allocator()->init( + TensorInfo(_input_modulation_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); + _tanh_modulation_gate.configure(&_input_modulation_gate_input, &_input_modulation_gate_output, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f)); _input_modulation_gate_input.allocator()->allocate(); // Output gate _memory_group.manage(&_output_gate_output); - _output_gate_output.allocator()->init(TensorInfo(_output_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); - _sigmoid_output_gate.configure(&_output_gate_input, &_output_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _output_gate_output.allocator()->init( + TensorInfo(_output_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); + _sigmoid_output_gate.configure(&_output_gate_input, &_output_gate_output, + 
ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); _output_gate_input.allocator()->allocate(); // Long term memory _memory_group.manage(&_cell_state1); - _cell_state1.allocator()->init(TensorInfo(_forget_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_4)); - _mul1.configure(&_forget_gate_output, cell_state_in, &_cell_state1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _cell_state1.allocator()->init( + TensorInfo(_forget_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_4)); + _mul1.configure(&_forget_gate_output, cell_state_in, &_cell_state1, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); _forget_gate_output.allocator()->allocate(); _memory_group.manage(&_cell_state2); - _cell_state2.allocator()->init(TensorInfo(_input_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_4)); - _mul2.configure(&_input_gate_output, &_input_modulation_gate_output, &_cell_state2, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _cell_state2.allocator()->init( + TensorInfo(_input_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_4)); + _mul2.configure(&_input_gate_output, &_input_modulation_gate_output, &_cell_state2, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); _input_modulation_gate_output.allocator()->allocate(); _input_gate_output.allocator()->allocate(); @@ -213,18 +321,23 @@ void NELSTMLayerQuantized::configure(const ITensor *input, // Short term memory _memory_group.manage(&_output_state_tmp); - _output_state_tmp.allocator()->init(TensorInfo(cell_state_out->info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); - _tanh_output_state.configure(cell_state_out, &_output_state_tmp, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f)); + _output_state_tmp.allocator()->init( + TensorInfo(cell_state_out->info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); + _tanh_output_state.configure(cell_state_out, &_output_state_tmp, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f)); _memory_group.manage(&_output_state_out_symm); - _output_state_out_symm.allocator()->init(TensorInfo(_output_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); - _mul3.configure(&_output_state_tmp, &_output_gate_output, &_output_state_out_symm, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _output_state_out_symm.allocator()->init( + TensorInfo(_output_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); + _mul3.configure(&_output_state_tmp, &_output_gate_output, &_output_state_out_symm, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); _output_gate_output.allocator()->allocate(); _output_state_tmp.allocator()->allocate(); // Requantize the output state from QSYMM16 to QASYMM8 _memory_group.manage(&_output_state_out_f32); - _output_state_out_f32.allocator()->init(TensorInfo(_output_state_out_symm.info()->tensor_shape(), 1, DataType::F32)); + _output_state_out_f32.allocator()->init( + TensorInfo(_output_state_out_symm.info()->tensor_shape(), 1, DataType::F32)); _dequantize.configure(&_output_state_out_symm, &_output_state_out_f32); _output_state_out_symm.allocator()->allocate(); @@ -233,15 +346,28 @@ void NELSTMLayerQuantized::configure(const ITensor *input, } Status NELSTMLayerQuantized::validate(const ITensorInfo *input, - const ITensorInfo *input_to_input_weights, const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights, - const ITensorInfo 
*recurrent_to_input_weights, const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights, - const ITensorInfo *input_gate_bias, const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias, - const ITensorInfo *cell_state_in, const ITensorInfo *output_state_in, - const ITensorInfo *cell_state_out, const ITensorInfo *output_state_out) + const ITensorInfo *input_to_input_weights, + const ITensorInfo *input_to_forget_weights, + const ITensorInfo *input_to_cell_weights, + const ITensorInfo *input_to_output_weights, + const ITensorInfo *recurrent_to_input_weights, + const ITensorInfo *recurrent_to_forget_weights, + const ITensorInfo *recurrent_to_cell_weights, + const ITensorInfo *recurrent_to_output_weights, + const ITensorInfo *input_gate_bias, + const ITensorInfo *forget_gate_bias, + const ITensorInfo *cell_bias, + const ITensorInfo *output_gate_bias, + const ITensorInfo *cell_state_in, + const ITensorInfo *output_state_in, + const ITensorInfo *cell_state_out, + const ITensorInfo *output_state_out) { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_input_weights, - recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, - output_state_in, cell_state_out, output_state_out); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR( + input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, + recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, + input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out, + output_state_out); const int input_size = input->dimension(0); const int batch_size = input->dimension(1); @@ -253,29 +379,51 @@ Status NELSTMLayerQuantized::validate(const ITensorInfo *input, ARM_COMPUTE_RETURN_ERROR_ON(input_gate_bias->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON(output_state_in->num_dimensions() > 2); - TensorInfo input_weights_info(input_to_input_weights->clone()->set_tensor_shape(TensorShape(input_size, output_size)).set_data_type(DataType::QASYMM8)); - TensorInfo recurrent_weights_info(input_to_input_weights->clone()->set_tensor_shape(TensorShape(output_size, output_size)).set_data_type(DataType::QASYMM8)); - TensorInfo bias_info(input_gate_bias->clone()->set_tensor_shape(TensorShape(output_size)).set_data_type(DataType::S32)); - TensorInfo output_state_info(cell_state_in->clone()->set_tensor_shape(TensorShape(output_size, batch_size)).set_data_type(DataType::QASYMM8).set_quantization_info(qasymm)); - TensorInfo cell_state_info(cell_state_in->clone()->set_tensor_shape(TensorShape(output_size, batch_size)).set_data_type(DataType::QSYMM16).set_quantization_info(qsymm_4)); + TensorInfo input_weights_info(input_to_input_weights->clone() + ->set_tensor_shape(TensorShape(input_size, output_size)) + .set_data_type(DataType::QASYMM8)); + TensorInfo recurrent_weights_info(input_to_input_weights->clone() + ->set_tensor_shape(TensorShape(output_size, output_size)) + .set_data_type(DataType::QASYMM8)); + TensorInfo bias_info( + input_gate_bias->clone()->set_tensor_shape(TensorShape(output_size)).set_data_type(DataType::S32)); + TensorInfo output_state_info(cell_state_in->clone() + 
->set_tensor_shape(TensorShape(output_size, batch_size)) + .set_data_type(DataType::QASYMM8) + .set_quantization_info(qasymm)); + TensorInfo cell_state_info(cell_state_in->clone() + ->set_tensor_shape(TensorShape(output_size, batch_size)) + .set_data_type(DataType::QSYMM16) + .set_quantization_info(qsymm_4)); // Shape checks - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&input_weights_info, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&recurrent_weights_info, recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&bias_info, input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&input_weights_info, input_to_input_weights, input_to_forget_weights, + input_to_cell_weights, input_to_output_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&recurrent_weights_info, recurrent_to_input_weights, + recurrent_to_forget_weights, recurrent_to_cell_weights, + recurrent_to_output_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&bias_info, input_gate_bias, forget_gate_bias, cell_bias, + output_gate_bias); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&cell_state_info, cell_state_in); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&output_state_info, output_state_in); // Data type checks - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input_weights_info, input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&bias_info, input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input_weights_info, input, input_to_input_weights, + input_to_forget_weights, input_to_cell_weights, + input_to_output_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(recurrent_to_input_weights, recurrent_to_forget_weights, + recurrent_to_cell_weights, recurrent_to_output_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&bias_info, input_gate_bias, forget_gate_bias, cell_bias, + output_gate_bias); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&cell_state_info, cell_state_in); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&output_state_info, output_state_in); // Quantization checks - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&input_weights_info, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&input_weights_info, input_to_input_weights, + input_to_forget_weights, input_to_cell_weights, + input_to_output_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(recurrent_to_input_weights, recurrent_to_forget_weights, + recurrent_to_cell_weights, recurrent_to_output_weights); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&cell_state_info, cell_state_in); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&output_state_info, output_state_in); @@ -297,7 
+445,8 @@ Status NELSTMLayerQuantized::validate(const ITensorInfo *input, recurrent_weights_vector.emplace_back(recurrent_to_cell_weights); recurrent_weights_vector.emplace_back(recurrent_to_output_weights); const TensorInfo recurrent_weights(TensorShape(output_size, 4 * output_size), 1, DataType::QASYMM8, qweights); - ARM_COMPUTE_RETURN_ON_ERROR(NEConcatenateLayer::validate(recurrent_weights_vector, &recurrent_weights, Window::DimY)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEConcatenateLayer::validate(recurrent_weights_vector, &recurrent_weights, Window::DimY)); // _concat_weights std::vector<const ITensorInfo *> weights_vector; @@ -307,7 +456,7 @@ Status NELSTMLayerQuantized::validate(const ITensorInfo *input, ARM_COMPUTE_RETURN_ON_ERROR(NEConcatenateLayer::validate(weights_vector, &weights, Window::DimX)); // _transpose_weights const TensorShape weights_transposed_shape(weights.tensor_shape()[1], weights.tensor_shape()[0]); - TensorInfo weights_transposed = weights.clone()->set_is_resizable(true).set_tensor_shape(weights_transposed_shape); + TensorInfo weights_transposed = weights.clone()->set_is_resizable(true).set_tensor_shape(weights_transposed_shape); ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(&weights, &weights_transposed)); // _concat_inputs @@ -333,7 +482,8 @@ Status NELSTMLayerQuantized::validate(const ITensorInfo *input, // _gemmlowp const TensorInfo output_highp(TensorShape(4 * output_size, batch_size), 1, DataType::S32); - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyCore::validate(&input_concatenated, &weights_transposed, nullptr, &output_highp)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEGEMMLowpMatrixMultiplyCore::validate(&input_concatenated, &weights_transposed, nullptr, &output_highp)); // Set the offset back input_concatenated.set_quantization_info(QuantizationInfo(qasymm.uniform().scale, qasymm.uniform().offset)); @@ -344,78 +494,107 @@ Status NELSTMLayerQuantized::validate(const ITensorInfo *input, const float multiplier = 4096.f * qasymm.uniform().scale * qweights.uniform().scale; int32_t output_multiplier = 0; int32_t output_shift = 0; - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift)); + ARM_COMPUTE_RETURN_ON_ERROR( + quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift)); // _output_stage - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint::validate(&output_highp, &bias_concatenated, &output_lowp)); + GEMMLowpOutputStageInfo info; + info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT; + info.gemmlowp_multiplier = output_multiplier; + info.gemmlowp_shift = output_shift; + info.output_data_type = DataType::QSYMM16; + ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOutputStage::validate(&output_highp, &bias_concatenated, &output_lowp, info)); TensorInfo input_gate_input; TensorInfo forget_gate_input; TensorInfo input_modulation_gate_input; TensorInfo output_gate_input; - if(batch_size > 1) + if (batch_size > 1) { // _slice_input_tensor input_gate_input = TensorInfo(TensorShape(output_size, batch_size), 1, DataType::QSYMM16, qsymm_3); - ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(&output_lowp, &input_gate_input, { 0, 0 }, { output_size, batch_size })); + ARM_COMPUTE_RETURN_ON_ERROR( + NESlice::validate(&output_lowp, &input_gate_input, {0, 0}, {output_size, batch_size})); // _slice_forget_tensor forget_gate_input = TensorInfo(TensorShape(output_size, batch_size), 1, DataType::QSYMM16, qsymm_3); - 
ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(&output_lowp, &forget_gate_input, { output_size, 0 }, { 2 * output_size, batch_size })); + ARM_COMPUTE_RETURN_ON_ERROR( + NESlice::validate(&output_lowp, &forget_gate_input, {output_size, 0}, {2 * output_size, batch_size})); // _slice_cell_tensor input_modulation_gate_input = TensorInfo(TensorShape(output_size, batch_size), 1, DataType::QSYMM16, qsymm_3); - ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(&output_lowp, &input_modulation_gate_input, { 2 * output_size, 0 }, { 3 * output_size, batch_size })); + ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(&output_lowp, &input_modulation_gate_input, {2 * output_size, 0}, + {3 * output_size, batch_size})); // _slice_output_tensor output_gate_input = TensorInfo(TensorShape(output_size, batch_size), 1, DataType::QSYMM16, qsymm_3); - ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(&output_lowp, &output_gate_input, { 3 * output_size, 0 }, { 4 * output_size, batch_size })); + ARM_COMPUTE_RETURN_ON_ERROR( + NESlice::validate(&output_lowp, &output_gate_input, {3 * output_size, 0}, {4 * output_size, batch_size})); } else { // _slice_input_tensor input_gate_input = TensorInfo(TensorShape(output_size), 1, DataType::QSYMM16, qsymm_3); - ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(&output_lowp, &input_gate_input, { 0 }, { output_size })); + ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(&output_lowp, &input_gate_input, {0}, {output_size})); // _slice_forget_tensor forget_gate_input = TensorInfo(TensorShape(output_size), 1, DataType::QSYMM16, qsymm_3); - ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(&output_lowp, &forget_gate_input, { output_size }, { 2 * output_size })); + ARM_COMPUTE_RETURN_ON_ERROR( + NESlice::validate(&output_lowp, &forget_gate_input, {output_size}, {2 * output_size})); // _slice_cell_tensor input_modulation_gate_input = TensorInfo(TensorShape(output_size), 1, DataType::QSYMM16, qsymm_3); - ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(&output_lowp, &input_modulation_gate_input, { 2 * output_size }, { 3 * output_size })); + ARM_COMPUTE_RETURN_ON_ERROR( + NESlice::validate(&output_lowp, &input_modulation_gate_input, {2 * output_size}, {3 * output_size})); // _slice_output_tensor output_gate_input = TensorInfo(TensorShape(output_size), 1, DataType::QSYMM16, qsymm_3); - ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(&output_lowp, &output_gate_input, { 3 * output_size }, { 4 * output_size })); + ARM_COMPUTE_RETURN_ON_ERROR( + NESlice::validate(&output_lowp, &output_gate_input, {3 * output_size}, {4 * output_size})); } // _sigmoid_forget_gate const TensorInfo forget_gate_output(forget_gate_input.tensor_shape(), 1, DataType::QSYMM16, qsymm_0); - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&forget_gate_input, &forget_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); + ARM_COMPUTE_RETURN_ON_ERROR( + NEActivationLayer::validate(&forget_gate_input, &forget_gate_output, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); // _sigmoid_input_gate const TensorInfo input_gate_output(input_gate_input.tensor_shape(), 1, DataType::QSYMM16, qsymm_0); - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&input_gate_input, &input_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); + ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate( + &input_gate_input, &input_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); // _tanh_modulation_gate - const TensorInfo 
input_modulation_gate_output(input_modulation_gate_input.tensor_shape(), 1, DataType::QSYMM16, qsymm_0); - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&input_modulation_gate_input, &input_modulation_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f))); + const TensorInfo input_modulation_gate_output(input_modulation_gate_input.tensor_shape(), 1, DataType::QSYMM16, + qsymm_0); + ARM_COMPUTE_RETURN_ON_ERROR( + NEActivationLayer::validate(&input_modulation_gate_input, &input_modulation_gate_output, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f))); // _sigmoid_output_gate const TensorInfo output_gate_output(output_gate_input.tensor_shape(), 1, DataType::QSYMM16, qsymm_0); - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&output_gate_input, &output_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); + ARM_COMPUTE_RETURN_ON_ERROR( + NEActivationLayer::validate(&output_gate_input, &output_gate_output, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); // _mul_forget_gate_cell_state const TensorInfo cell_state_tmp1(forget_gate_output.tensor_shape(), 1, DataType::QSYMM16, qsymm_4); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&forget_gate_output, cell_state_in, &cell_state_tmp1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate( + &forget_gate_output, cell_state_in, &cell_state_tmp1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); // _mul_input_gate_input_mod_gate const TensorInfo cell_state_tmp2(input_gate_output.tensor_shape(), 1, DataType::QSYMM16, qsymm_4); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&input_gate_output, &input_modulation_gate_output, &cell_state_tmp2, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&input_gate_output, &input_modulation_gate_output, + &cell_state_tmp2, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO)); // _add_cell_state_tmps - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&cell_state_tmp1, &cell_state_tmp2, cell_state_out, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEArithmeticAddition::validate(&cell_state_tmp1, &cell_state_tmp2, cell_state_out, ConvertPolicy::SATURATE)); // _tanh_modulation_gate const TensorInfo output_state_tmp(cell_state_out->tensor_shape(), 1, DataType::QSYMM16, qsymm_0); - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(cell_state_out, &output_state_tmp, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f))); + ARM_COMPUTE_RETURN_ON_ERROR( + NEActivationLayer::validate(cell_state_out, &output_state_tmp, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f))); // _mul_output_state_tmp_output_gate const TensorInfo output_state_out_symm(output_gate_output.tensor_shape(), 1, DataType::QSYMM16, qsymm_0); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&output_state_tmp, &output_gate_output, &output_state_out_symm, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&output_state_tmp, &output_gate_output, + &output_state_out_symm, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO)); // _dequantize const TensorInfo output_state_out_f32(output_state_out_symm.tensor_shape(), 1, DataType::F32); @@ -424,14 +603,14 @@ 
Status NELSTMLayerQuantized::validate(const ITensorInfo *input, // _quantize ARM_COMPUTE_RETURN_ON_ERROR(NEQuantizationLayer::validate(&output_state_out_f32, output_state_out)); - if(cell_state_out->total_size() != 0) + if (cell_state_out->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&cell_state_info, cell_state_out); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&cell_state_info, cell_state_out); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&cell_state_info, cell_state_out); } - if(output_state_out->total_size() != 0) + if (output_state_out->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&output_state_info, output_state_out); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&output_state_info, output_state_out); @@ -490,7 +669,7 @@ void NELSTMLayerQuantized::run() void NELSTMLayerQuantized::prepare() { - if(!_is_prepared) + if (!_is_prepared) { _input_weights.allocator()->allocate(); _concat_input_weights.run(); diff --git a/src/runtime/NEON/functions/NELaplacianPyramid.cpp b/src/runtime/NEON/functions/NELaplacianPyramid.cpp deleted file mode 100644 index 6b37029f0e..0000000000 --- a/src/runtime/NEON/functions/NELaplacianPyramid.cpp +++ /dev/null @@ -1,107 +0,0 @@ -/* - * Copyright (c) 2016-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
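The output-stage hunks above swap the dedicated NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint function for the generic NEGEMMLowpOutputStage, configured through a GEMMLowpOutputStageInfo descriptor. The requantization math is unchanged: the S32 GEMM accumulators are scaled back to the QSYMM16 intermediate, whose 3-integer-bit format (scale 8/32768) makes the output scale 2^-12, hence the 4096 factor. A sketch of the descriptor construction, mirroring the diff (Status handling elided for brevity):

    #include "arm_compute/core/Types.h"
    #include "arm_compute/core/utils/quantization/AsymmHelpers.h"

    using namespace arm_compute;

    GEMMLowpOutputStageInfo lstm_output_stage(float input_scale, float weights_scale)
    {
        // multiplier = (input_scale * weights_scale) / output_scale,
        // with output_scale = 2^-12 for the qsymm16 intermediate.
        const float multiplier = 4096.f * input_scale * weights_scale;

        int32_t output_multiplier = 0;
        int32_t output_shift      = 0;
        // Returns a Status in the real code, wrapped in ARM_COMPUTE_RETURN_ON_ERROR.
        quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift);

        GEMMLowpOutputStageInfo info;
        info.type                = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
        info.gemmlowp_multiplier = output_multiplier;
        info.gemmlowp_shift      = output_shift;
        info.output_data_type    = DataType::QSYMM16;
        return info;
    }
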
- */ -#include "arm_compute/runtime/NEON/functions/NELaplacianPyramid.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/IPyramid.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h" -#include "arm_compute/runtime/NEON/functions/NEDepthConvertLayer.h" -#include "arm_compute/runtime/NEON/functions/NEGaussian5x5.h" -#include "arm_compute/runtime/NEON/functions/NEGaussianPyramid.h" -#include "arm_compute/runtime/Tensor.h" - -using namespace arm_compute; - -NELaplacianPyramid::NELaplacianPyramid() // NOLINT - : _num_levels(0), - _gaussian_pyr_function(), - _convf(), - _subf(), - _gauss_pyr(), - _conv_pyr(), - _depth_function() -{ -} - -void NELaplacianPyramid::run() -{ - ARM_COMPUTE_ERROR_ON_MSG(0 == _num_levels, "Unconfigured function"); - - // Compute Gaussian Pyramid - _gaussian_pyr_function.run(); - - for(unsigned int i = 0; i < _num_levels; ++i) - { - // Apply Gaussian filter to gaussian pyramid image - _convf[i].run(); - } - - for(unsigned int i = 0; i < _num_levels; ++i) - { - // Compute laplacian image - _subf[i].run(); - } - - _depth_function.run(); -} - -void NELaplacianPyramid::configure(const ITensor *input, IPyramid *pyramid, ITensor *output, BorderMode border_mode, uint8_t constant_border_value) -{ - ARM_COMPUTE_ERROR_ON(nullptr == pyramid); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S16); - ARM_COMPUTE_ERROR_ON(0 == pyramid->info()->num_levels()); - ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != pyramid->info()->width()); - ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) != pyramid->info()->height()); - ARM_COMPUTE_ERROR_ON(output->info()->dimension(0) != pyramid->get_pyramid_level(pyramid->info()->num_levels() - 1)->info()->dimension(0)); - ARM_COMPUTE_ERROR_ON(output->info()->dimension(1) != pyramid->get_pyramid_level(pyramid->info()->num_levels() - 1)->info()->dimension(1)); - - _num_levels = pyramid->info()->num_levels(); - - // Create and initialize the gaussian pyramid and the convoluted pyramid - PyramidInfo pyramid_info; - pyramid_info.init(_num_levels, 0.5f, pyramid->info()->tensor_shape(), arm_compute::Format::U8); - - _gauss_pyr.init(pyramid_info); - _conv_pyr.init(pyramid_info); - - // Create Gaussian Pyramid function - _gaussian_pyr_function.configure(input, &_gauss_pyr, border_mode, constant_border_value); - - _convf.resize(_num_levels); - _subf.resize(_num_levels); - - for(unsigned int i = 0; i < _num_levels; ++i) - { - _convf[i].configure(_gauss_pyr.get_pyramid_level(i), _conv_pyr.get_pyramid_level(i), border_mode, constant_border_value); - _subf[i].configure(_gauss_pyr.get_pyramid_level(i), _conv_pyr.get_pyramid_level(i), pyramid->get_pyramid_level(i), ConvertPolicy::WRAP); - } - - _depth_function.configure(_conv_pyr.get_pyramid_level(_num_levels - 1), output, ConvertPolicy::WRAP, 0); - - _gauss_pyr.allocate(); - _conv_pyr.allocate(); -} diff --git a/src/runtime/NEON/functions/NELaplacianReconstruct.cpp b/src/runtime/NEON/functions/NELaplacianReconstruct.cpp deleted file mode 100644 index 9f7588edb0..0000000000 --- a/src/runtime/NEON/functions/NELaplacianReconstruct.cpp +++ /dev/null @@ -1,102 +0,0 @@ -/* - * Copyright (c) 2016-2020 ARM Limited. 
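NELaplacianPyramid, deleted above, implemented the classic decomposition: each level stores the difference between a gaussian-pyramid level and its blurred copy, and the coarsest blurred level is depth-converted into the residual output for the reconstruction pass. A toy 1-D illustration of the same idea, deliberately not the NEON implementation (3-tap blur, made-up sizes, single channel):

    #include <cstddef>
    #include <cstdio>
    #include <vector>

    // Toy version of the scheme in the deleted NELaplacianPyramid: keep
    // gaussian[i] - blur(gaussian[i]) at each level; the next gaussian level
    // is the blurred signal downsampled by 2.
    static std::vector<float> blur3(const std::vector<float> &in)
    {
        std::vector<float> out(in.size());
        for (std::size_t i = 0; i < in.size(); ++i)
        {
            const float l = in[i > 0 ? i - 1 : i];
            const float r = in[i + 1 < in.size() ? i + 1 : i];
            out[i] = 0.25f * l + 0.5f * in[i] + 0.25f * r;
        }
        return out;
    }

    int main()
    {
        std::vector<float> g = {1, 3, 7, 5, 2, 8, 6, 4};
        for (int level = 0; level < 2; ++level)
        {
            const std::vector<float> smooth = blur3(g);
            std::printf("level %d laplacian:", level);
            for (std::size_t i = 0; i < g.size(); ++i)
            {
                std::printf(" %.2f", g[i] - smooth[i]); // detail kept at this level
            }
            std::printf("\n");

            std::vector<float> next; // downsample by 2 for the next level
            for (std::size_t i = 0; i < smooth.size(); i += 2)
            {
                next.push_back(smooth[i]);
            }
            g = next;
        }
        return 0;
    }
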
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/NEON/functions/NELaplacianReconstruct.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/IPyramid.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" - -#include <cstddef> - -using namespace arm_compute; - -NELaplacianReconstruct::NELaplacianReconstruct() // NOLINT - : _tmp_pyr(), - _addf(), - _scalef(), - _depthf() -{ -} - -void NELaplacianReconstruct::configure(const IPyramid *pyramid, ITensor *input, ITensor *output, BorderMode border_mode, uint8_t constant_border_value) -{ - ARM_COMPUTE_ERROR_ON(nullptr == pyramid); - ARM_COMPUTE_ERROR_ON(input == output); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S16); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON(input->info()->num_dimensions() != pyramid->get_pyramid_level(0)->info()->num_dimensions()); - ARM_COMPUTE_ERROR_ON(output->info()->num_dimensions() != pyramid->get_pyramid_level(0)->info()->num_dimensions()); - ARM_COMPUTE_ERROR_ON(output->info()->dimension(0) != pyramid->get_pyramid_level(0)->info()->dimension(0)); - ARM_COMPUTE_ERROR_ON(output->info()->dimension(1) != pyramid->get_pyramid_level(0)->info()->dimension(1)); - ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != pyramid->get_pyramid_level(pyramid->info()->num_levels() - 1)->info()->dimension(0)); - ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) != pyramid->get_pyramid_level(pyramid->info()->num_levels() - 1)->info()->dimension(1)); - - const size_t num_levels = pyramid->info()->num_levels(); - - // Create and initialize the tmp pyramid: I(n-2) = upsample( input + Laplace(n-1) ) - PyramidInfo pyramid_info; - pyramid_info.init(num_levels, 0.5f, output->info()->tensor_shape(), arm_compute::Format::S16); - - _tmp_pyr.init(pyramid_info); - - // Allocate add and scale functions. Level 0 does not need to be scaled. 
- _addf.resize(num_levels); - _scalef.resize(num_levels - 1); - - const size_t last_level = num_levels - 1; - - _addf[last_level].configure(input, pyramid->get_pyramid_level(last_level), _tmp_pyr.get_pyramid_level(last_level), ConvertPolicy::SATURATE); - - // Scale levels n-1 to 1, and add levels n-2 to 0 - for(size_t l = 0; l < last_level; ++l) - { - _scalef[l].configure(_tmp_pyr.get_pyramid_level(l + 1), _tmp_pyr.get_pyramid_level(l), arm_compute::InterpolationPolicy::NEAREST_NEIGHBOR, border_mode, constant_border_value); - _addf[l].configure(_tmp_pyr.get_pyramid_level(l), pyramid->get_pyramid_level(l), _tmp_pyr.get_pyramid_level(l), ConvertPolicy::SATURATE); - } - - // Convert level 0 from S16 to U8 - _depthf.configure(_tmp_pyr.get_pyramid_level(0), output, ConvertPolicy::SATURATE, 0); - - _tmp_pyr.allocate(); -} - -void NELaplacianReconstruct::run() -{ - ARM_COMPUTE_ERROR_ON_MSG(_addf.empty(), "Unconfigured function"); - - const size_t last_level = _tmp_pyr.info()->num_levels() - 1; - - _addf[last_level].run(); - - // Run l = [last_level - 1, 0] - for(size_t l = last_level; l-- > 0;) - { - _scalef[l].run(); - _addf[l].run(); - } - - _depthf.run(); -} diff --git a/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp b/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp deleted file mode 100644 index d08202dd24..0000000000 --- a/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp +++ /dev/null @@ -1,196 +0,0 @@ -/* - * Copyright (c) 2017-2019 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
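One idiom in the deleted NELaplacianReconstruct::run() is worth noting: for (size_t l = last_level; l-- > 0;) walks an unsigned index from last_level - 1 down to 0 without underflow, because the decrement happens inside the condition and the loop body never sees the wrapped value. A standalone illustration:

    #include <cstddef>
    #include <cstdio>

    int main()
    {
        const std::size_t last_level = 4;

        // Visits 3, 2, 1, 0 and stops before the unsigned index can wrap.
        for (std::size_t l = last_level; l-- > 0;)
        {
            std::printf("level %zu\n", l);
        }
        return 0;
    }
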
- */ -#include "arm_compute/runtime/NEON/functions/NELocallyConnectedLayer.h" - -#include "arm_compute/core/PixelValue.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" - -#include <cmath> -#include <tuple> - -using namespace arm_compute; - -namespace -{ -void calculate_shapes(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - TensorShape &shape_wr, TensorShape &shape_im2col, TensorShape &shape_gemm) -{ - ARM_COMPUTE_UNUSED(output); - - const unsigned int kernel_width = weights->dimension(0); - const unsigned int kernel_height = weights->dimension(1); - - bool has_bias = (biases != nullptr); - - // Get convolved dimensions - unsigned int conv_w = 0; - unsigned int conv_h = 0; - std::tie(conv_w, conv_h) = scaled_dimensions(input->dimension(0), input->dimension(1), kernel_width, kernel_height, - conv_info); - - const size_t mat_weights_cols = weights->dimension(3); - const size_t mat_weights_rows = weights->dimension(0) * weights->dimension(1) * weights->dimension(2) + ((has_bias) ? 1 : 0); - const size_t mat_weights_num = weights->dimension(4); - - shape_wr = TensorShape(mat_weights_cols, mat_weights_rows, mat_weights_num); - - const size_t mat_input_cols = mat_weights_rows; - const size_t mat_input_rows = conv_w * conv_h; - - shape_im2col = input->tensor_shape(); - shape_im2col.set(0, mat_input_cols); - shape_im2col.set(1, mat_input_rows); - shape_im2col.set(2, 1); - - shape_gemm = shape_im2col; - shape_gemm.set(0, mat_weights_cols); - shape_gemm.set(1, mat_input_rows); -} -} // namespace - -NELocallyConnectedLayer::NELocallyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _input_im2col_kernel(), _weights_reshape_kernel(), _mm_kernel(), _output_col2im_kernel(), _input_im2col_reshaped(), _weights_reshaped(), _gemm_output(), - _is_prepared(false), _original_weights(nullptr) -{ -} - -Status NELocallyConnectedLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); - ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(2) != input->dimension(2)); - ARM_COMPUTE_RETURN_ERROR_ON(!conv_info.padding_is_symmetric()); - - bool has_bias = (biases != nullptr); - - if(has_bias) - { - ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(3)); - ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 2); - } - - const unsigned int kernel_width = weights->dimension(0); - const unsigned int kernel_height = weights->dimension(1); - - // Get convolved dimensions - unsigned int conv_w = 0; - unsigned int conv_h = 0; - std::tie(conv_w, conv_h) = scaled_dimensions(input->dimension(0), input->dimension(1), kernel_width, kernel_height, - conv_info); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG((output->dimension(0) != conv_w) || (output->dimension(1) != conv_h), "Output shape does not match the expected one"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(4) != (conv_w * conv_h), "Weights shape does not match the expected one"); - - // Calculate intermediate buffer shapes - TensorShape shape_wr; - TensorShape shape_im2col; - TensorShape shape_gemm; - calculate_shapes(input, weights, biases, output, conv_info, shape_wr, shape_im2col, shape_gemm); - - TensorInfo weights_reshaped_info(shape_wr, 1, weights->data_type()); 
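The calculate_shapes() helper in the deleted NELocallyConnectedLayer encodes the im2col/GEMM geometry of a locally connected layer: unlike a convolution, every output position has its own filter, so the weights tensor carries conv_w * conv_h filter sets in dimension 4, and each im2col row holds kernel_w * kernel_h * channels values, plus one when a bias is fused. A standalone sketch of that arithmetic with made-up sizes (stride 1, no padding, so the scaled_dimensions() call reduces to in - k + 1):

    #include <cstdio>

    int main()
    {
        // Illustrative geometry, not from the deleted code.
        const unsigned in_w = 8, in_h = 8, channels = 3;
        const unsigned k_w = 3, k_h = 3;
        const bool     has_bias = true;

        const unsigned conv_w = in_w - k_w + 1; // 6
        const unsigned conv_h = in_h - k_h + 1; // 6

        // One im2col row per output position; one column per kernel element (+ bias).
        const unsigned mat_input_rows = conv_w * conv_h;                           // 36
        const unsigned mat_input_cols = k_w * k_h * channels + (has_bias ? 1 : 0); // 28

        // Locally connected: a distinct weight matrix per output position,
        // matching weights->dimension(4) == conv_w * conv_h in the validate().
        const unsigned weight_sets = conv_w * conv_h;

        std::printf("im2col: %u x %u, weight sets: %u\n", mat_input_rows, mat_input_cols, weight_sets);
        return 0;
    }
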
- TensorInfo input_im2col_reshaped_info(shape_im2col, 1, input->data_type()); - TensorInfo gemm_output_info(shape_gemm, 1, input->data_type()); - - ARM_COMPUTE_RETURN_ON_ERROR(NEIm2ColKernel::validate(input, &input_im2col_reshaped_info, Size2D(kernel_width, kernel_height), conv_info, has_bias)); - ARM_COMPUTE_RETURN_ON_ERROR(NEWeightsReshapeKernel::validate(weights, biases, &weights_reshaped_info)); - ARM_COMPUTE_RETURN_ON_ERROR(NELocallyConnectedMatrixMultiplyKernel::validate(&input_im2col_reshaped_info, &weights_reshaped_info, &gemm_output_info)); - ARM_COMPUTE_RETURN_ON_ERROR(NECol2ImKernel::validate(&gemm_output_info, output, Size2D(conv_w, conv_h))); - - return Status{}; -} - -void NELocallyConnectedLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); - ARM_COMPUTE_ERROR_THROW_ON(NELocallyConnectedLayer::validate(input->info(), weights->info(), biases == nullptr ? nullptr : biases->info(), output->info(), conv_info)); - - bool _has_bias = (biases != nullptr); - _is_prepared = false; - _original_weights = weights; - - const unsigned int kernel_width = weights->info()->dimension(0); - const unsigned int kernel_height = weights->info()->dimension(1); - - // Get convolved dimensions - unsigned int conv_w = 0; - unsigned int conv_h = 0; - std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), kernel_width, kernel_height, - conv_info); - - // Calculate intermediate buffer shapes - TensorShape shape_wr; - TensorShape shape_im2col; - TensorShape shape_gemm; - calculate_shapes(input->info(), weights->info(), biases == nullptr ? nullptr : biases->info(), output->info(), conv_info, shape_wr, shape_im2col, shape_gemm); - - _weights_reshaped.allocator()->init(TensorInfo(shape_wr, 1, weights->info()->data_type())); - _input_im2col_reshaped.allocator()->init(TensorInfo(shape_im2col, 1, input->info()->data_type())); - _gemm_output.allocator()->init(TensorInfo(shape_gemm, 1, input->info()->data_type())); - - // Manage intermediate buffers - _memory_group.manage(&_input_im2col_reshaped); - _memory_group.manage(&_gemm_output); - - // Configure kernels - _input_im2col_kernel.configure(input, &_input_im2col_reshaped, Size2D(kernel_width, kernel_height), conv_info, _has_bias); - _weights_reshape_kernel.configure(weights, biases, &_weights_reshaped); - _mm_kernel.configure(&_input_im2col_reshaped, &_weights_reshaped, &_gemm_output); - _output_col2im_kernel.configure(&_gemm_output, output, Size2D(conv_w, conv_h)); - - // Allocate intermediate tensors - _input_im2col_reshaped.allocator()->allocate(); - _gemm_output.allocator()->allocate(); -} - -void NELocallyConnectedLayer::run() -{ - prepare(); - - MemoryGroupResourceScope scope_mg(_memory_group); - - // Run input reshaping - NEScheduler::get().schedule(&_input_im2col_kernel, Window::DimY); - - // Runs GEMM on reshaped matrices - NEScheduler::get().schedule(&_mm_kernel, Window::DimX); - - // Reshape output matrix - NEScheduler::get().schedule(&_output_col2im_kernel, Window::DimY); -} - -void NELocallyConnectedLayer::prepare() -{ - if(!_is_prepared) - { - ARM_COMPUTE_ERROR_ON(!_original_weights->is_used()); - - // Run weights reshaping and mark original weights tensor as unused - _weights_reshaped.allocator()->allocate(); - NEScheduler::get().schedule(&_weights_reshape_kernel, 3); - _original_weights->mark_as_unused(); - - _is_prepared = true; - } -} diff --git 
a/src/runtime/NEON/functions/NELogical.cpp b/src/runtime/NEON/functions/NELogical.cpp new file mode 100644 index 0000000000..0013a521d1 --- /dev/null +++ b/src/runtime/NEON/functions/NELogical.cpp @@ -0,0 +1,134 @@ +/* + * Copyright (c) 2020-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/NEON/functions/NELogical.h" + +#include "arm_compute/runtime/NEON/NEScheduler.h" +#include "arm_compute/runtime/Tensor.h" + +#include "src/common/utils/Log.h" +#include "src/core/NEON/kernels/NELogicalKernel.h" + +namespace arm_compute +{ +struct LogicalArgs +{ + std::unique_ptr<kernels::NELogicalKernel> kernel{nullptr}; + ITensorPack pack{}; +}; + +struct NELogicalAnd::Impl : public LogicalArgs +{ +}; +NELogicalAnd::NELogicalAnd() : _impl(std::make_unique<Impl>()) +{ +} +NELogicalAnd::~NELogicalAnd() = default; + +void NELogicalAnd::configure(const ITensor *input1, const ITensor *input2, ITensor *output) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output); + ARM_COMPUTE_LOG_PARAMS(input1, input2, output); + + _impl->kernel = std::make_unique<kernels::NELogicalKernel>(); + _impl->kernel->configure(input1->info(), input2->info(), output->info(), LogicalOperation::And); + + _impl->pack = ITensorPack(); + _impl->pack.add_tensor(TensorType::ACL_SRC_0, input1); + _impl->pack.add_tensor(TensorType::ACL_SRC_1, input2); + _impl->pack.add_tensor(TensorType::ACL_DST, output); +} + +Status NELogicalAnd::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output) +{ + return kernels::NELogicalKernel::validate(input1, input2, output, LogicalOperation::And); +} + +void NELogicalAnd::run() +{ + NEScheduler::get().schedule_op(_impl->kernel.get(), Window::DimY, _impl->kernel->window(), _impl->pack); +} + +struct NELogicalOr::Impl : public LogicalArgs +{ +}; +NELogicalOr::NELogicalOr() : _impl(std::make_unique<Impl>()) +{ +} +NELogicalOr::~NELogicalOr() = default; + +void NELogicalOr::configure(const ITensor *input1, const ITensor *input2, ITensor *output) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output); + ARM_COMPUTE_LOG_PARAMS(input1, input2, output); + + _impl->kernel = std::make_unique<kernels::NELogicalKernel>(); + _impl->kernel->configure(input1->info(), input2->info(), output->info(), LogicalOperation::Or); + + _impl->pack = ITensorPack(); + _impl->pack.add_tensor(TensorType::ACL_SRC_0, input1); + _impl->pack.add_tensor(TensorType::ACL_SRC_1, input2); + 
_impl->pack.add_tensor(TensorType::ACL_DST, output); +} + +Status NELogicalOr::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output) +{ + return kernels::NELogicalKernel::validate(input1, input2, output, LogicalOperation::Or); +} + +void NELogicalOr::run() +{ + NEScheduler::get().schedule_op(_impl->kernel.get(), Window::DimY, _impl->kernel->window(), _impl->pack); +} + +struct NELogicalNot::Impl : public LogicalArgs +{ +}; +NELogicalNot::NELogicalNot() : _impl(std::make_unique<Impl>()) +{ +} +NELogicalNot::~NELogicalNot() = default; + +void NELogicalNot::configure(const ITensor *input, ITensor *output) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_LOG_PARAMS(input, output); + + _impl->kernel = std::make_unique<kernels::NELogicalKernel>(); + _impl->kernel->configure(input->info(), nullptr, output->info(), LogicalOperation::Not); + + _impl->pack = ITensorPack(); + _impl->pack.add_tensor(TensorType::ACL_SRC_0, input); + _impl->pack.add_tensor(TensorType::ACL_DST, output); +} + +Status NELogicalNot::validate(const ITensorInfo *input, const ITensorInfo *output) +{ + return kernels::NELogicalKernel::validate(input, nullptr, output, LogicalOperation::Not); +} + +void NELogicalNot::run() +{ + NEScheduler::get().schedule_op(_impl->kernel.get(), Window::DimY, _impl->kernel->window(), _impl->pack); +} +} // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEMagnitude.cpp b/src/runtime/NEON/functions/NEMagnitude.cpp deleted file mode 100644 index ff2cd49495..0000000000 --- a/src/runtime/NEON/functions/NEMagnitude.cpp +++ /dev/null @@ -1,48 +0,0 @@ -/* - * Copyright (c) 2016-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
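A minimal usage sketch for the NELogicalAnd function added above; the tensor names, shapes and U8 data type are assumptions for illustration, not part of the diff:

    #include "arm_compute/runtime/NEON/functions/NELogical.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    Tensor a, b, out;
    const TensorInfo info(TensorShape(32U, 32U), 1, DataType::U8); // non-zero == true
    a.allocator()->init(info);
    b.allocator()->init(info);
    out.allocator()->init(info);

    NELogicalAnd logical_and;
    logical_and.configure(&a, &b, &out); // builds the NELogicalKernel and tensor pack
    a.allocator()->allocate();
    b.allocator()->allocate();
    out.allocator()->allocate();
    logical_and.run(); // schedules the kernel over Window::DimY with the stored pack

NELogicalOr and NELogicalNot follow the same shape, NELogicalNot taking a single input.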
- */ -#include "arm_compute/runtime/NEON/functions/NEMagnitude.h" - -#include "arm_compute/core/NEON/kernels/NEMagnitudePhaseKernel.h" -#include "arm_compute/core/Types.h" -#include "support/MemorySupport.h" - -#include <utility> - -using namespace arm_compute; - -void NEMagnitude::configure(const ITensor *input1, const ITensor *input2, ITensor *output, MagnitudeType mag_type) -{ - if(mag_type == MagnitudeType::L1NORM) - { - auto k = arm_compute::support::cpp14::make_unique<NEMagnitudePhaseKernel<MagnitudeType::L1NORM, PhaseType::SIGNED>>(); - k->configure(input1, input2, output, nullptr); - _kernel = std::move(k); - } - else - { - auto k = arm_compute::support::cpp14::make_unique<NEMagnitudePhaseKernel<MagnitudeType::L2NORM, PhaseType::SIGNED>>(); - k->configure(input1, input2, output, nullptr); - _kernel = std::move(k); - } -} diff --git a/src/runtime/NEON/functions/NEMatMul.cpp b/src/runtime/NEON/functions/NEMatMul.cpp new file mode 100644 index 0000000000..31898bafc4 --- /dev/null +++ b/src/runtime/NEON/functions/NEMatMul.cpp @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/runtime/NEON/functions/NEMatMul.h" + +#include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/MemoryGroup.h" +#include "arm_compute/runtime/Tensor.h" + +#include "src/core/helpers/MemoryHelpers.h" +#include "src/cpu/operators/CpuMatMul.h" + +namespace arm_compute +{ +struct NEMatMul::Impl +{ + const ITensor *lhs{nullptr}; + const ITensor *rhs{nullptr}; + ITensor *output{nullptr}; + std::unique_ptr<cpu::CpuMatMul> op{nullptr}; + MemoryGroup memory_group{}; + WorkspaceData<Tensor> workspace_tensors{}; + ITensorPack run_pack{}; +}; + +NEMatMul::NEMatMul() : _impl(std::make_unique<Impl>()) +{ +} + +NEMatMul::~NEMatMul() = default; + +void NEMatMul::configure(ITensor *lhs, + ITensor *rhs, + ITensor *output, + const MatMulInfo &info, + const CpuMatMulSettings &settings, + const ActivationLayerInfo &act_info) +{ + _impl->lhs = lhs; + _impl->rhs = rhs; + _impl->output = output; + + ARM_COMPUTE_ERROR_ON_NULLPTR(_impl->lhs, _impl->rhs, _impl->output); + _impl->op = std::make_unique<cpu::CpuMatMul>(); + _impl->op->configure(lhs->info(), rhs->info(), output->info(), info, settings, act_info); + _impl->run_pack = {{ACL_SRC_0, lhs}, {ACL_SRC_1, rhs}, {ACL_DST, output}}; + _impl->workspace_tensors = manage_workspace<Tensor>(_impl->op->workspace(), _impl->memory_group, _impl->run_pack); +} + +Status NEMatMul::validate(const ITensorInfo *lhs, + const ITensorInfo *rhs, + const ITensorInfo *output, + const MatMulInfo &info, + const CpuMatMulSettings &settings, + const ActivationLayerInfo &act_info) +{ + return cpu::CpuMatMul::validate(lhs, rhs, output, info, settings, act_info); +} + +void NEMatMul::run() +{ + MemoryGroupResourceScope scope_mg(_impl->memory_group); + _impl->op->run(_impl->run_pack); +} +} // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEMaxUnpoolingLayer.cpp b/src/runtime/NEON/functions/NEMaxUnpoolingLayer.cpp new file mode 100644 index 0000000000..c3861afd2c --- /dev/null +++ b/src/runtime/NEON/functions/NEMaxUnpoolingLayer.cpp @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2020-2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/runtime/NEON/functions/NEMaxUnpoolingLayer.h" + +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/NEON/functions/NEFill.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + +#include "src/common/utils/Log.h" +#include "src/cpu/kernels/CpuMaxUnpoolingLayerKernel.h" +#include "src/cpu/operators/CpuMaxUnpooling.h" + +namespace arm_compute +{ +struct NEMaxUnpoolingLayer::Impl +{ + const ITensor *src{nullptr}; + const ITensor *indices{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr<cpu::CpuMaxUnpooling> op{nullptr}; +}; + +NEMaxUnpoolingLayer::~NEMaxUnpoolingLayer() = default; + +NEMaxUnpoolingLayer::NEMaxUnpoolingLayer() : _fill_func(), _impl() +{ +} + +void NEMaxUnpoolingLayer::configure(ITensor *input, + ITensor *indices, + ITensor *output, + const PoolingLayerInfo &pool_info) +{ + ARM_COMPUTE_LOG_PARAMS(input, indices, output, pool_info); + + const PixelValue zero_value(0.f); + _fill_func = std::make_unique<NEFill>(); + _impl = std::make_unique<Impl>(); + _impl->src = input; + _impl->indices = indices; + _impl->dst = output; + + _impl->op = std::make_unique<cpu::CpuMaxUnpooling>(); + _fill_func->configure(output, zero_value); + _impl->op->configure(input->info(), indices->info(), output->info(), pool_info); +} + +Status NEMaxUnpoolingLayer::validate(const ITensorInfo *input, + const ITensorInfo *indices, + const ITensorInfo *output, + const PoolingLayerInfo &pool_info) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output, indices); + ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuMaxUnpooling::validate(input, indices, output, pool_info)); + return Status{}; +} + +void NEMaxUnpoolingLayer::run() +{ + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC_0, _impl->src); + pack.add_tensor(TensorType::ACL_SRC_1, _impl->indices); + pack.add_tensor(TensorType::ACL_DST, _impl->dst); + + _fill_func->run(); + _impl->op->run(pack); +} +} /* namespace arm_compute */ diff --git a/src/runtime/NEON/functions/NEMeanStdDev.cpp b/src/runtime/NEON/functions/NEMeanStdDev.cpp deleted file mode 100644 index 2304bc80d7..0000000000 --- a/src/runtime/NEON/functions/NEMeanStdDev.cpp +++ /dev/null @@ -1,48 +0,0 @@ -/* - * Copyright (c) 2016, 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "arm_compute/runtime/NEON/functions/NEMeanStdDev.h" - -#include "arm_compute/runtime/NEON/NEScheduler.h" - -using namespace arm_compute; - -NEMeanStdDev::NEMeanStdDev() - : _mean_stddev_kernel(), _fill_border_kernel(), _global_sum(0), _global_sum_squared(0) -{ -} - -void NEMeanStdDev::configure(IImage *input, float *mean, float *stddev) -{ - _mean_stddev_kernel.configure(input, mean, &_global_sum, stddev, &_global_sum_squared); - _fill_border_kernel.configure(input, _mean_stddev_kernel.border_size(), BorderMode::CONSTANT, PixelValue(static_cast<uint8_t>(0))); -} - -void NEMeanStdDev::run() -{ - _global_sum = 0; - _global_sum_squared = 0; - - NEScheduler::get().schedule(&_fill_border_kernel, Window::DimZ); - NEScheduler::get().schedule(&_mean_stddev_kernel, Window::DimY); -} diff --git a/src/runtime/NEON/functions/NEMeanStdDevNormalizationLayer.cpp b/src/runtime/NEON/functions/NEMeanStdDevNormalizationLayer.cpp index fdf2980961..dec0dde56d 100644 --- a/src/runtime/NEON/functions/NEMeanStdDevNormalizationLayer.cpp +++ b/src/runtime/NEON/functions/NEMeanStdDevNormalizationLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 ARM Limited. + * Copyright (c) 2019-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,14 +23,18 @@ */ #include "arm_compute/runtime/NEON/functions/NEMeanStdDevNormalizationLayer.h" -#include "arm_compute/core/NEON/kernels/NEMeanStdDevNormalizationKernel.h" -#include "support/MemorySupport.h" +#include "src/common/utils/Log.h" +#include "src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.h" namespace arm_compute { +NEMeanStdDevNormalizationLayer::~NEMeanStdDevNormalizationLayer() = default; + void NEMeanStdDevNormalizationLayer::configure(ITensor *input, ITensor *output, float epsilon) { - auto k = arm_compute::support::cpp14::make_unique<NEMeanStdDevNormalizationKernel>(); + ARM_COMPUTE_LOG_PARAMS(input, output, epsilon); + + auto k = std::make_unique<NEMeanStdDevNormalizationKernel>(); k->configure(input, output, epsilon); _kernel = std::move(k); } diff --git a/src/runtime/NEON/functions/NEMedian3x3.cpp b/src/runtime/NEON/functions/NEMedian3x3.cpp deleted file mode 100644 index e24023cf3a..0000000000 --- a/src/runtime/NEON/functions/NEMedian3x3.cpp +++ /dev/null @@ -1,40 +0,0 @@ -/* - * Copyright (c) 2016-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "arm_compute/runtime/NEON/functions/NEMedian3x3.h" - -#include "arm_compute/core/NEON/kernels/NEMedian3x3Kernel.h" -#include "arm_compute/core/PixelValue.h" -#include "support/MemorySupport.h" - -#include <utility> - -using namespace arm_compute; - -void NEMedian3x3::configure(ITensor *input, ITensor *output, BorderMode border_mode, uint8_t constant_border_value) -{ - auto k = arm_compute::support::cpp14::make_unique<NEMedian3x3Kernel>(); - k->configure(input, output, border_mode == BorderMode::UNDEFINED); - _kernel = std::move(k); - _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); -} diff --git a/src/runtime/NEON/functions/NEMinMaxLocation.cpp b/src/runtime/NEON/functions/NEMinMaxLocation.cpp deleted file mode 100644 index 54e89abe24..0000000000 --- a/src/runtime/NEON/functions/NEMinMaxLocation.cpp +++ /dev/null @@ -1,50 +0,0 @@ -/* - * Copyright (c) 2016, 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/NEON/functions/NEMinMaxLocation.h" - -#include "arm_compute/runtime/NEON/NEScheduler.h" - -using namespace arm_compute; - -NEMinMaxLocation::NEMinMaxLocation() - : _min_max(), _min_max_loc() -{ -} - -void NEMinMaxLocation::configure(const IImage *input, void *min, void *max, ICoordinates2DArray *min_loc, ICoordinates2DArray *max_loc, uint32_t *min_count, uint32_t *max_count) -{ - _min_max.configure(input, min, max); - _min_max_loc.configure(input, min, max, min_loc, max_loc, min_count, max_count); -} - -void NEMinMaxLocation::run() -{ - _min_max.reset(); - - /* Run min max kernel */ - NEScheduler::get().schedule(&_min_max, Window::DimY); - - /* Run min max location */ - NEScheduler::get().schedule(&_min_max_loc, Window::DimY); -} diff --git a/src/runtime/NEON/functions/NENonLinearFilter.cpp b/src/runtime/NEON/functions/NENonLinearFilter.cpp deleted file mode 100644 index 6875d2e5d5..0000000000 --- a/src/runtime/NEON/functions/NENonLinearFilter.cpp +++ /dev/null @@ -1,42 +0,0 @@ -/* - * Copyright (c) 2016-2020 ARM Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/NEON/functions/NENonLinearFilter.h" - -#include "arm_compute/core/NEON/kernels/NENonLinearFilterKernel.h" -#include "arm_compute/core/PixelValue.h" -#include "support/MemorySupport.h" - -#include <utility> - -using namespace arm_compute; - -void NENonLinearFilter::configure(ITensor *input, ITensor *output, NonLinearFilterFunction function, unsigned int mask_size, MatrixPattern pattern, const uint8_t *mask, - BorderMode border_mode, - uint8_t constant_border_value) -{ - auto k = arm_compute::support::cpp14::make_unique<NENonLinearFilterKernel>(); - k->configure(input, output, function, mask_size, pattern, mask, border_mode == BorderMode::UNDEFINED); - _kernel = std::move(k); - _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); -} diff --git a/src/runtime/NEON/functions/NENonMaximaSuppression3x3.cpp b/src/runtime/NEON/functions/NENonMaximaSuppression3x3.cpp deleted file mode 100644 index 5c1c3d29ad..0000000000 --- a/src/runtime/NEON/functions/NENonMaximaSuppression3x3.cpp +++ /dev/null @@ -1,47 +0,0 @@ -/* - * Copyright (c) 2016-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "arm_compute/runtime/NEON/functions/NENonMaximaSuppression3x3.h" - -#include "arm_compute/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h" -#include "support/MemorySupport.h" - -#include <utility> - -using namespace arm_compute; - -void NENonMaximaSuppression3x3::configure(ITensor *input, ITensor *output, BorderMode border_mode) -{ - auto k = arm_compute::support::cpp14::make_unique<NENonMaximaSuppression3x3Kernel>(); - k->configure(input, output, border_mode == BorderMode::UNDEFINED); - _kernel = std::move(k); - - if(border_mode != BorderMode::UNDEFINED) - { - _border_handler.configure(input, BorderSize(1), BorderMode::CONSTANT, static_cast<float>(0.f)); - } - else - { - _border_handler.configure(input, BorderSize(1), BorderMode::UNDEFINED, static_cast<float>(0.f)); - } -} diff --git a/src/runtime/NEON/functions/NENormalizationLayer.cpp b/src/runtime/NEON/functions/NENormalizationLayer.cpp index d52e92828e..d6d2e9dc46 100644 --- a/src/runtime/NEON/functions/NENormalizationLayer.cpp +++ b/src/runtime/NEON/functions/NENormalizationLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2017-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -30,16 +30,22 @@ #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/NEON/NEScheduler.h" -using namespace arm_compute; +#include "src/common/utils/Log.h" +#include "src/core/NEON/kernels/NENormalizationLayerKernel.h" + +namespace arm_compute +{ +NENormalizationLayer::~NENormalizationLayer() = default; NENormalizationLayer::NENormalizationLayer(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _norm_kernel(), _multiply_kernel(), _border_handler(), _input_squared() + : _memory_group(std::move(memory_manager)), _norm_kernel(), _multiply_f(), _input_squared() { } void NENormalizationLayer::configure(const ITensor *input, ITensor *output, const NormalizationLayerInfo &norm_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_LOG_PARAMS(input, output, norm_info); TensorInfo tensor_info(input->info()->tensor_shape(), 1, input->info()->data_type()); _input_squared.allocator()->init(tensor_info); @@ -48,21 +54,24 @@ void NENormalizationLayer::configure(const ITensor *input, ITensor *output, cons _memory_group.manage(&_input_squared); // Configure kernels - _norm_kernel.configure(input, &_input_squared, output, norm_info); - _multiply_kernel.configure(input, input, &_input_squared, 1.0f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); - _border_handler.configure(&_input_squared, _norm_kernel.border_size(), BorderMode::CONSTANT, PixelValue(0.0f)); + _norm_kernel = std::make_unique<NENormalizationLayerKernel>(); + _norm_kernel->configure(input, &_input_squared, output, norm_info); + _multiply_f.configure(input, input, &_input_squared, 1.0f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); // Allocate the tensor once the configure methods have been called _input_squared.allocator()->allocate(); } -Status NENormalizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const NormalizationLayerInfo &norm_info) +Status NENormalizationLayer::validate(const ITensorInfo *input, + const ITensorInfo *output, + const NormalizationLayerInfo &norm_info) { // Perform validation step ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_RETURN_ON_ERROR(NENormalizationLayerKernel::validate(input, input, output, norm_info)); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(input, input, output, 
1.0f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(input, input, output, 1.0f, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO)); return Status{}; } @@ -70,8 +79,7 @@ Status NENormalizationLayer::validate(const ITensorInfo *input, const ITensorInf void NENormalizationLayer::run() { MemoryGroupResourceScope scope_mg(_memory_group); - - NEScheduler::get().schedule(&_multiply_kernel, Window::DimY); - NEScheduler::get().schedule(&_border_handler, Window::DimY); - NEScheduler::get().schedule(&_norm_kernel, Window::DimY); + _multiply_f.run(); + NEScheduler::get().schedule(_norm_kernel.get(), Window::DimY); } +} // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEOpticalFlow.cpp b/src/runtime/NEON/functions/NEOpticalFlow.cpp deleted file mode 100644 index cb10ca8508..0000000000 --- a/src/runtime/NEON/functions/NEOpticalFlow.cpp +++ /dev/null @@ -1,138 +0,0 @@ -/* - * Copyright (c) 2016-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "arm_compute/runtime/NEON/functions/NEOpticalFlow.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/kernels/NELKTrackerKernel.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Window.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" -#include "arm_compute/runtime/NEON/functions/NEScharr3x3.h" -#include "arm_compute/runtime/Pyramid.h" -#include "arm_compute/runtime/Tensor.h" -#include "arm_compute/runtime/TensorAllocator.h" - -using namespace arm_compute; - -NEOpticalFlow::NEOpticalFlow(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT - : _memory_group(std::move(memory_manager)), - _func_scharr(), - _kernel_tracker(), - _scharr_gx(), - _scharr_gy(), - _new_points(nullptr), - _new_points_estimates(nullptr), - _old_points(nullptr), - _new_points_internal(), - _old_points_internal(), - _num_levels(0) -{ -} - -void NEOpticalFlow::configure(const Pyramid *old_pyramid, const Pyramid *new_pyramid, const IKeyPointArray *old_points, const IKeyPointArray *new_points_estimates, - IKeyPointArray *new_points, Termination termination, float epsilon, unsigned int num_iterations, size_t window_dimension, - bool use_initial_estimate, BorderMode border_mode, uint8_t constant_border_value) -{ - ARM_COMPUTE_ERROR_ON(nullptr == old_pyramid); - ARM_COMPUTE_ERROR_ON(nullptr == new_pyramid); - ARM_COMPUTE_ERROR_ON(nullptr == old_points); - ARM_COMPUTE_ERROR_ON(nullptr == new_points_estimates); - ARM_COMPUTE_ERROR_ON(nullptr == new_points); - ARM_COMPUTE_ERROR_ON(old_pyramid->info()->num_levels() != new_pyramid->info()->num_levels()); - ARM_COMPUTE_ERROR_ON(0 == old_pyramid->info()->num_levels()); - ARM_COMPUTE_ERROR_ON(old_pyramid->info()->width() != new_pyramid->info()->width()); - ARM_COMPUTE_ERROR_ON(old_pyramid->info()->height() != new_pyramid->info()->height()); - ARM_COMPUTE_ERROR_ON(use_initial_estimate && old_points->num_values() != new_points_estimates->num_values()); - - _num_levels = old_pyramid->info()->num_levels(); - _old_points = old_points; - _new_points = new_points; - _new_points_estimates = new_points_estimates; - - const float pyr_scale = old_pyramid->info()->scale(); - - _func_scharr.clear(); - _kernel_tracker.clear(); - _scharr_gx.clear(); - _scharr_gy.clear(); - - _func_scharr.resize(_num_levels); - _kernel_tracker.resize(_num_levels); - _scharr_gx.resize(_num_levels); - _scharr_gy.resize(_num_levels); - - _old_points_internal = LKInternalKeypointArray(old_points->num_values()); - _new_points_internal = LKInternalKeypointArray(old_points->num_values()); - _new_points->resize(old_points->num_values()); - - for(unsigned int i = 0; i < _num_levels; ++i) - { - // Get images from the ith level of old and right pyramid - IImage *old_ith_input = old_pyramid->get_pyramid_level(i); - IImage *new_ith_input = new_pyramid->get_pyramid_level(i); - - // Get width and height of images - const unsigned int width_ith = old_ith_input->info()->dimension(0); - const unsigned int height_ith = new_ith_input->info()->dimension(1); - - TensorInfo tensor_info(TensorShape(width_ith, height_ith), Format::S16); - - _scharr_gx[i].allocator()->init(tensor_info); - _scharr_gy[i].allocator()->init(tensor_info); - - // Manage intermediate buffers - _memory_group.manage(&_scharr_gx[i]); - _memory_group.manage(&_scharr_gy[i]); - - // Init Scharr kernel - _func_scharr[i].configure(old_ith_input, &_scharr_gx[i], &_scharr_gy[i], border_mode, constant_border_value); - - // Init Lucas-Kanade kernel - 
_kernel_tracker[i].configure(old_ith_input, new_ith_input, &_scharr_gx[i], &_scharr_gy[i], - old_points, new_points_estimates, new_points, - &_old_points_internal, &_new_points_internal, - termination, use_initial_estimate, epsilon, num_iterations, window_dimension, - i, _num_levels, pyr_scale); - - _scharr_gx[i].allocator()->allocate(); - _scharr_gy[i].allocator()->allocate(); - } -} - -void NEOpticalFlow::run() -{ - ARM_COMPUTE_ERROR_ON_MSG(_num_levels == 0, "Unconfigured function"); - - MemoryGroupResourceScope scope_mg(_memory_group); - - for(unsigned int level = _num_levels; level > 0; --level) - { - // Run Scharr kernel - _func_scharr[level - 1].run(); - - // Run Lucas-Kanade kernel - NEScheduler::get().schedule(&_kernel_tracker[level - 1], Window::DimX); - } -} diff --git a/src/runtime/NEON/functions/NEPReluLayer.cpp b/src/runtime/NEON/functions/NEPReluLayer.cpp index 02dfc6f137..963e68bac7 100644 --- a/src/runtime/NEON/functions/NEPReluLayer.cpp +++ b/src/runtime/NEON/functions/NEPReluLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 ARM Limited. + * Copyright (c) 2019-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,20 +24,48 @@ #include "arm_compute/runtime/NEON/functions/NEPReluLayer.h" #include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/kernels/NEElementwiseOperationKernel.h" -#include "support/MemorySupport.h" + +#include "src/cpu/operators/CpuPRelu.h" namespace arm_compute { +using OperatorType = cpu::CpuPRelu; + +struct NEPReluLayer::Impl +{ + const ITensor *src_0{nullptr}; + const ITensor *src_1{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr<OperatorType> op{nullptr}; +}; + +NEPReluLayer::NEPReluLayer() : _impl(std::make_unique<Impl>()) +{ +} +NEPReluLayer::NEPReluLayer(NEPReluLayer &&) = default; +NEPReluLayer &NEPReluLayer::operator=(NEPReluLayer &&) = default; +NEPReluLayer::~NEPReluLayer() = default; + void NEPReluLayer::configure(const ITensor *input, const ITensor *alpha, ITensor *output) { - auto k = arm_compute::support::cpp14::make_unique<NEArithmeticOperationKernel>(); - k->configure(ArithmeticOperation::PRELU, input, alpha, output); - _kernel = std::move(k); + _impl->src_0 = input; + _impl->src_1 = alpha; + _impl->dst = output; + _impl->op = std::make_unique<OperatorType>(); + _impl->op->configure(input->info(), alpha->info(), output->info()); +} + +void NEPReluLayer::run() +{ + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC_0, _impl->src_0); + pack.add_tensor(TensorType::ACL_SRC_1, _impl->src_1); + pack.add_tensor(TensorType::ACL_DST, _impl->dst); + _impl->op->run(pack); } Status NEPReluLayer::validate(const ITensorInfo *input, const ITensorInfo *alpha, const ITensorInfo *output) { - return NEArithmeticOperationKernel::validate(ArithmeticOperation::PRELU, input, alpha, output); + return OperatorType::validate(input, alpha, output); } } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEPadLayer.cpp b/src/runtime/NEON/functions/NEPadLayer.cpp index 537eba7fdf..253566df0f 100644 --- a/src/runtime/NEON/functions/NEPadLayer.cpp +++ b/src/runtime/NEON/functions/NEPadLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 ARM Limited. + * Copyright (c) 2018-2021 Arm Limited. 
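The NEPReluLayer rewrite above is the pattern repeated throughout this change set: the public function shrinks to an Impl struct holding raw tensor pointers plus a stateless cpu:: operator, and run() late-binds memory through an ITensorPack. A condensed sketch of the pattern with placeholder names (Function and cpu::Op are not real types):

    struct Function::Impl
    {
        const ITensor           *src{nullptr};
        ITensor                 *dst{nullptr};
        std::unique_ptr<cpu::Op> op{nullptr}; // configured on ITensorInfo only
    };

    void Function::configure(const ITensor *input, ITensor *output)
    {
        _impl->src = input;
        _impl->dst = output;
        _impl->op  = std::make_unique<cpu::Op>();
        _impl->op->configure(input->info(), output->info()); // no payload memory touched
    }

    void Function::run()
    {
        ITensorPack pack; // tensor memory resolved at run time, not configure time
        pack.add_tensor(TensorType::ACL_SRC, _impl->src);
        pack.add_tensor(TensorType::ACL_DST, _impl->dst);
        _impl->op->run(pack);
    }

Separating configuration (ITensorInfo) from execution (ITensorPack) lets the same cpu:: operator back both the NEON runtime functions and the stateless operator API.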
* * SPDX-License-Identifier: MIT * @@ -23,10 +23,13 @@ */ #include "arm_compute/runtime/NEON/functions/NEPadLayer.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" - #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + +#include "src/common/utils/Log.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/NEON/kernels/NEPadLayerKernel.h" namespace arm_compute { @@ -35,9 +38,9 @@ namespace uint32_t last_padding_dimension(const PaddingList &padding) { int last_padding_dim = padding.size() - 1; - for(; last_padding_dim >= 0; --last_padding_dim) + for (; last_padding_dim >= 0; --last_padding_dim) { - if(padding[last_padding_dim].first > 0 || padding[last_padding_dim].second > 0) + if (padding[last_padding_dim].first > 0 || padding[last_padding_dim].second > 0) { break; } @@ -46,14 +49,28 @@ uint32_t last_padding_dimension(const PaddingList &padding) } } // namespace +NEPadLayer::~NEPadLayer() = default; + NEPadLayer::NEPadLayer() - : _copy_kernel(), _pad_kernel(), _mode(), _padding(), _num_dimensions(0), _slice_functions(), _concat_functions(), _slice_results(), _concat_results() + : _copy_function(), + _pad_kernel(), + _mode(), + _padding(), + _num_dimensions(0), + _slice_functions(), + _concat_functions(), + _slice_results(), + _concat_results() { } -void NEPadLayer::configure_constant_mode(ITensor *input, ITensor *output, const PaddingList &padding, const PixelValue constant_value) +void NEPadLayer::configure_constant_mode(ITensor *input, + ITensor *output, + const PaddingList &padding, + const PixelValue constant_value) { - _pad_kernel.configure(input, output, padding, constant_value, PaddingMode::CONSTANT); + _pad_kernel = std::make_unique<NEPadLayerKernel>(); + _pad_kernel->configure(input, output, padding, constant_value, PaddingMode::CONSTANT); } void NEPadLayer::configure_reflect_symmetric_mode(ITensor *input, ITensor *output) @@ -79,20 +96,20 @@ void NEPadLayer::configure_reflect_symmetric_mode(ITensor *input, ITensor *outpu Coordinates ends_after{}; Coordinates strides{}; ITensor *prev = input; - for(uint32_t i = 0; i < _num_dimensions; ++i) + for (uint32_t i = 0; i < _num_dimensions; ++i) { // Values in strides from the previous dimensions need to be set to 1 to avoid reversing again. - if(i > 0) + if (i > 0) { strides.set(i - 1, 1); } - if(_padding[i].first > 0 || _padding[i].second > 0) + if (_padding[i].first > 0 || _padding[i].second > 0) { // Set the starts, ends, and strides values for the current dimension. // Due to the bit masks passed to strided slice, the values below the current dimension in // starts and ends will be ignored so do not need to be modified. - if(_mode == PaddingMode::REFLECT) + if (_mode == PaddingMode::REFLECT) { starts_before.set(i, _padding[i].first); ends_before.set(i, 0); @@ -117,12 +134,13 @@ void NEPadLayer::configure_reflect_symmetric_mode(ITensor *input, ITensor *outpu const int32_t end_mask_after = ends_after[i] < 0 ? ~0 : ~(1u << i); // Reflect the input values for the padding before and after the input. 
- std::vector<ITensor *> concat_vector; - if(_padding[i].first > 0) + std::vector<const ITensor *> concat_vector; + if (_padding[i].first > 0) { - if(i < prev->info()->num_dimensions()) + if (i < prev->info()->num_dimensions()) { - _slice_functions[2 * i].configure(prev, &_slice_results[2 * i], starts_before, ends_before, strides, begin_mask_before, end_mask_before); + _slice_functions[2 * i].configure(prev, &_slice_results[2 * i], starts_before, ends_before, strides, + begin_mask_before, end_mask_before); concat_vector.emplace_back(&_slice_results[2 * i]); } else @@ -132,11 +150,12 @@ void NEPadLayer::configure_reflect_symmetric_mode(ITensor *input, ITensor *outpu } } concat_vector.push_back(prev); - if(_padding[i].second > 0) + if (_padding[i].second > 0) { - if(i < prev->info()->num_dimensions()) + if (i < prev->info()->num_dimensions()) { - _slice_functions[2 * i + 1].configure(prev, &_slice_results[2 * i + 1], starts_after, ends_after, strides, begin_mask_after, end_mask_after); + _slice_functions[2 * i + 1].configure(prev, &_slice_results[2 * i + 1], starts_after, ends_after, + strides, begin_mask_after, end_mask_after); concat_vector.emplace_back(&_slice_results[2 * i + 1]); } else @@ -147,8 +166,13 @@ void NEPadLayer::configure_reflect_symmetric_mode(ITensor *input, ITensor *outpu } // Concatenate the padding before and after with the input. ITensor *out = (i == _num_dimensions - 1) ? output : &_concat_results[i]; + out->info()->set_quantization_info(output->info()->quantization_info()); + for (auto &v : concat_vector) + { + v->info()->set_quantization_info(input->info()->quantization_info()); + } _concat_functions[i].configure(concat_vector, out, i); - if(i != _num_dimensions - 1) + if (i != _num_dimensions - 1) { _concat_results[i].allocator()->allocate(); } @@ -159,22 +183,28 @@ void NEPadLayer::configure_reflect_symmetric_mode(ITensor *input, ITensor *outpu } } -void NEPadLayer::configure(ITensor *input, ITensor *output, const PaddingList &padding, const PixelValue constant_value, const PaddingMode mode) +void NEPadLayer::configure(ITensor *input, + ITensor *output, + const PaddingList &padding, + const PixelValue constant_value, + const PaddingMode mode) { ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), padding, constant_value, mode)); + ARM_COMPUTE_LOG_PARAMS(input, output, padding, constant_value, mode); _padding = padding; _mode = mode; - const TensorShape padded_shape = misc::shape_calculator::compute_padded_shape(input->info()->tensor_shape(), _padding); + const TensorShape padded_shape = + misc::shape_calculator::compute_padded_shape(input->info()->tensor_shape(), _padding); auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(padded_shape)); // Find the last dimension requiring padding so that it is known when to write to output and whether any padding is applied. 
_num_dimensions = last_padding_dimension(padding) + 1; - if(_num_dimensions > 0) + if (_num_dimensions > 0) { - switch(_mode) + switch (_mode) { case PaddingMode::CONSTANT: { @@ -194,23 +224,27 @@ void NEPadLayer::configure(ITensor *input, ITensor *output, const PaddingList &p else { // Copy the input to the whole output if no padding is applied - _copy_kernel.configure(input, output); + _copy_function.configure(input, output); } } -Status NEPadLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, const PixelValue constant_value, const PaddingMode mode) +Status NEPadLayer::validate(const ITensorInfo *input, + const ITensorInfo *output, + const PaddingList &padding, + const PixelValue constant_value, + const PaddingMode mode) { ARM_COMPUTE_UNUSED(constant_value); const TensorShape padded_shape = misc::shape_calculator::compute_padded_shape(input->tensor_shape(), padding); - if(output->total_size() > 0) + if (output->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), padded_shape); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); } - switch(mode) + switch (mode) { case PaddingMode::CONSTANT: { @@ -219,9 +253,9 @@ Status NEPadLayer::validate(const ITensorInfo *input, const ITensorInfo *output, case PaddingMode::REFLECT: case PaddingMode::SYMMETRIC: { - for(uint32_t i = 0; i < padding.size(); ++i) + for (uint32_t i = 0; i < padding.size(); ++i) { - if(mode == PaddingMode::REFLECT) + if (mode == PaddingMode::REFLECT) { ARM_COMPUTE_RETURN_ERROR_ON(padding[i].first >= input->dimension(i)); ARM_COMPUTE_RETURN_ERROR_ON(padding[i].second >= input->dimension(i)); @@ -244,27 +278,27 @@ Status NEPadLayer::validate(const ITensorInfo *input, const ITensorInfo *output, void NEPadLayer::run() { - if(_num_dimensions > 0) + if (_num_dimensions > 0) { - switch(_mode) + switch (_mode) { case PaddingMode::CONSTANT: { - NEScheduler::get().schedule(&_pad_kernel, Window::DimZ); + NEScheduler::get().schedule(_pad_kernel.get(), Window::DimZ); break; } case PaddingMode::REFLECT: case PaddingMode::SYMMETRIC: { - for(uint32_t i = 0; i < _num_dimensions; ++i) + for (uint32_t i = 0; i < _num_dimensions; ++i) { - if(_padding[i].first > 0 || _padding[i].second > 0) + if (_padding[i].first > 0 || _padding[i].second > 0) { - if(_padding[i].first > 0 && _slice_results[2 * i].info()->total_size() > 0) + if (_padding[i].first > 0 && _slice_results[2 * i].info()->total_size() > 0) { _slice_functions[2 * i].run(); } - if(_padding[i].second > 0 && _slice_results[2 * i + 1].info()->total_size() > 0) + if (_padding[i].second > 0 && _slice_results[2 * i + 1].info()->total_size() > 0) { _slice_functions[2 * i + 1].run(); } @@ -279,7 +313,7 @@ void NEPadLayer::run() } else { - NEScheduler::get().schedule(&_copy_kernel, Window::DimY); + _copy_function.run(); } } } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEPermute.cpp b/src/runtime/NEON/functions/NEPermute.cpp index cfd27da117..80cd04ce6c 100644 --- a/src/runtime/NEON/functions/NEPermute.cpp +++ b/src/runtime/NEON/functions/NEPermute.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 ARM Limited. + * Copyright (c) 2017-2021 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -23,20 +23,48 @@ */ #include "arm_compute/runtime/NEON/functions/NEPermute.h" -#include "arm_compute/core/NEON/kernels/NEPermuteKernel.h" -#include "support/MemorySupport.h" +#include "arm_compute/core/Validate.h" + +#include "src/cpu/operators/CpuPermute.h" namespace arm_compute { +struct NEPermute::Impl +{ + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr<cpu::CpuPermute> op{nullptr}; +}; + +NEPermute::NEPermute() : _impl(std::make_unique<Impl>()) +{ +} + +NEPermute::~NEPermute() = default; + void NEPermute::configure(const ITensor *input, ITensor *output, const PermutationVector &perm) { - auto k = arm_compute::support::cpp14::make_unique<NEPermuteKernel>(); - k->configure(input, output, perm); - _kernel = std::move(k); + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + + _impl->src = input; + _impl->dst = output; + _impl->op = std::make_unique<cpu::CpuPermute>(); + _impl->op->configure(input->info(), output->info(), perm); } Status NEPermute::validate(const ITensorInfo *input, const ITensorInfo *output, const PermutationVector &perm) { - return NEPermuteKernel::validate(input, output, perm); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuPermute::validate(input, output, perm)); + + return Status{}; +} + +void NEPermute::run() +{ + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC, _impl->src); + pack.add_tensor(TensorType::ACL_DST, _impl->dst); + _impl->op->run(pack); } } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEPhase.cpp b/src/runtime/NEON/functions/NEPhase.cpp deleted file mode 100644 index bb96f6db5c..0000000000 --- a/src/runtime/NEON/functions/NEPhase.cpp +++ /dev/null @@ -1,47 +0,0 @@ -/* - * Copyright (c) 2016-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
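A minimal sketch of the reworked NEPermute above; the shapes and the chosen permutation are illustrative:

    #include "arm_compute/runtime/NEON/functions/NEPermute.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    Tensor src, dst;
    src.allocator()->init(TensorInfo(TensorShape(4U, 8U, 2U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(8U, 4U, 2U), 1, DataType::F32));

    NEPermute permute;
    permute.configure(&src, &dst, PermutationVector(1U, 0U, 2U)); // swap dims 0 and 1
    src.allocator()->allocate();
    dst.allocator()->allocate();
    permute.run(); // forwards src/dst through an ITensorPack to cpu::CpuPermute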
- */ -#include "arm_compute/runtime/NEON/functions/NEPhase.h" - -#include "arm_compute/core/NEON/kernels/NEMagnitudePhaseKernel.h" -#include "support/MemorySupport.h" - -#include <utility> - -using namespace arm_compute; - -void NEPhase::configure(const ITensor *input1, const ITensor *input2, ITensor *output, PhaseType phase_type) -{ - if(phase_type == PhaseType::UNSIGNED) - { - auto k = arm_compute::support::cpp14::make_unique<NEMagnitudePhaseKernel<MagnitudeType::L2NORM, PhaseType::UNSIGNED>>(); - k->configure(input1, input2, nullptr, output); - _kernel = std::move(k); - } - else - { - auto k = arm_compute::support::cpp14::make_unique<NEMagnitudePhaseKernel<MagnitudeType::L2NORM, PhaseType::SIGNED>>(); - k->configure(input1, input2, nullptr, output); - _kernel = std::move(k); - } -} diff --git a/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp b/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp index eaf233b9ed..97155a9e74 100644 --- a/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp +++ b/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020 ARM Limited. + * Copyright (c) 2016-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,60 +24,101 @@ #include "arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h" #include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h" -#include "support/MemorySupport.h" + +#include "src/cpu/operators/CpuMul.h" #include <utility> namespace arm_compute { -void NEPixelWiseMultiplication::configure(ITensor *input1, ITensor *input2, ITensor *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, - const ActivationLayerInfo &act_info) +struct NEPixelWiseMultiplication::Impl { - ARM_COMPUTE_UNUSED(act_info); - auto k = arm_compute::support::cpp14::make_unique<NEPixelWiseMultiplicationKernel>(); - k->configure(input1, input2, output, scale, overflow_policy, rounding_policy); - _kernel = std::move(k); - - if(output->info()->dimension(0) > 1) - { - ITensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? 
input1 : input2; + const ITensor *src_0{nullptr}; + const ITensor *src_1{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr<cpu::CpuMul> op{nullptr}; +}; - if(broadcasted_info->info()->dimension(0) == 1) - { - _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE); - } - } +NEPixelWiseMultiplication::NEPixelWiseMultiplication() : _impl(std::make_unique<Impl>()) +{ } -Status NEPixelWiseMultiplication::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, +NEPixelWiseMultiplication::~NEPixelWiseMultiplication() = default; + +Status NEPixelWiseMultiplication::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info) { - ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled()); - return NEPixelWiseMultiplicationKernel::validate(input1, input2, output, scale, overflow_policy, rounding_policy); + return cpu::CpuMul::validate(input1, input2, output, scale, overflow_policy, rounding_policy, act_info); } -void NEComplexPixelWiseMultiplication::configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info) +void NEPixelWiseMultiplication::configure(const ITensor *input1, + const ITensor *input2, + ITensor *output, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy, + const ActivationLayerInfo &act_info) { - ARM_COMPUTE_UNUSED(act_info); - auto k = arm_compute::support::cpp14::make_unique<NEComplexPixelWiseMultiplicationKernel>(); - k->configure(input1, input2, output); - _kernel = std::move(k); + _impl->src_0 = input1; + _impl->src_1 = input2; + _impl->dst = output; + _impl->op = std::make_unique<cpu::CpuMul>(); + _impl->op->configure(input1->info(), input2->info(), output->info(), scale, overflow_policy, rounding_policy, + act_info); +} - if(output->info()->dimension(0) > 1) - { - ITensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? 
input1 : input2; +void NEPixelWiseMultiplication::run() +{ + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC_0, _impl->src_0); + pack.add_tensor(TensorType::ACL_SRC_1, _impl->src_1); + pack.add_tensor(TensorType::ACL_DST, _impl->dst); + _impl->op->run(pack); +} - if(broadcasted_info->info()->dimension(0) == 1) - { - _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE); - } - } +struct NEComplexPixelWiseMultiplication::Impl +{ + ITensor *src_0{nullptr}; + ITensor *src_1{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr<cpu::CpuComplexMul> op{nullptr}; +}; + +NEComplexPixelWiseMultiplication::NEComplexPixelWiseMultiplication() : _impl(std::make_unique<Impl>()) +{ +} +NEComplexPixelWiseMultiplication::~NEComplexPixelWiseMultiplication() = default; + +Status NEComplexPixelWiseMultiplication::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const ActivationLayerInfo &act_info) +{ + return cpu::CpuComplexMul::validate(input1, input2, output, act_info); } -Status NEComplexPixelWiseMultiplication::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info) +void NEComplexPixelWiseMultiplication::configure(ITensor *input1, + ITensor *input2, + ITensor *output, + const ActivationLayerInfo &act_info) { - ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled()); - return NEComplexPixelWiseMultiplicationKernel::validate(input1, input2, output); + _impl->src_0 = input1; + _impl->src_1 = input2; + _impl->dst = output; + _impl->op = std::make_unique<cpu::CpuComplexMul>(); + _impl->op->configure(input1->info(), input2->info(), output->info(), act_info); } +void NEComplexPixelWiseMultiplication::run() +{ + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC_0, _impl->src_0); + pack.add_tensor(TensorType::ACL_SRC_1, _impl->src_1); + pack.add_tensor(TensorType::ACL_DST, _impl->dst); + _impl->op->run(pack); +} } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEPooling3dLayer.cpp b/src/runtime/NEON/functions/NEPooling3dLayer.cpp new file mode 100644 index 0000000000..e017e8c21d --- /dev/null +++ b/src/runtime/NEON/functions/NEPooling3dLayer.cpp @@ -0,0 +1,76 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
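A usage sketch for the migrated NEPixelWiseMultiplication above (shapes and data type are illustrative). One behavioural change is visible in the diff: validate() previously rejected any enabled activation, whereas it now forwards act_info to cpu::CpuMul:

    #include "arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    Tensor a, b, out;
    const TensorInfo info(TensorShape(16U, 16U), 1, DataType::F32);
    a.allocator()->init(info);
    b.allocator()->init(info);
    out.allocator()->init(info);

    NEPixelWiseMultiplication mul;
    mul.configure(&a, &b, &out, 1.f /* scale */, ConvertPolicy::SATURATE,
                  RoundingPolicy::TO_ZERO); // act_info assumed to default in the header
    a.allocator()->allocate();
    b.allocator()->allocate();
    out.allocator()->allocate();
    mul.run();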
+ */ +#include "arm_compute/runtime/NEON/functions/NEPooling3dLayer.h" + +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/Tensor.h" + +#include "src/core/helpers/MemoryHelpers.h" +#include "src/cpu/operators/CpuPool3d.h" + +namespace arm_compute +{ +struct NEPooling3dLayer::Impl +{ + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr<cpu::CpuPool3d> op{nullptr}; + MemoryGroup memory_group{}; + ITensorPack run_pack{}; + WorkspaceData<Tensor> workspace_tensors{}; +}; + +NEPooling3dLayer::~NEPooling3dLayer() = default; + +NEPooling3dLayer::NEPooling3dLayer(std::shared_ptr<IMemoryManager> memory_manager) : _impl(std::make_unique<Impl>()) +{ + _impl->memory_group = MemoryGroup(std::move(memory_manager)); +} + +void NEPooling3dLayer::configure(const ITensor *input, ITensor *output, const Pooling3dLayerInfo &pool_info) +{ + _impl->src = input; + _impl->dst = output; + _impl->op = std::make_unique<cpu::CpuPool3d>(); + _impl->op->configure(input->info(), output->info(), pool_info); + + _impl->run_pack = {{TensorType::ACL_SRC, _impl->src}, {TensorType::ACL_DST_0, _impl->dst}}; + _impl->workspace_tensors = manage_workspace<Tensor>(_impl->op->workspace(), _impl->memory_group, _impl->run_pack); +} + +Status +NEPooling3dLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const Pooling3dLayerInfo &pool_info) +{ + return cpu::CpuPool3d::validate(input, output, pool_info); +} + +void NEPooling3dLayer::run() +{ + MemoryGroupResourceScope scope_mg(_impl->memory_group); + ARM_COMPUTE_ERROR_ON_NULLPTR(_impl->src, _impl->dst); + _impl->op->run(_impl->run_pack); +} + +} // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEPoolingLayer.cpp b/src/runtime/NEON/functions/NEPoolingLayer.cpp index 12921cf40e..eb9125be3c 100644 --- a/src/runtime/NEON/functions/NEPoolingLayer.cpp +++ b/src/runtime/NEON/functions/NEPoolingLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 ARM Limited. + * Copyright (c) 2017-2021 Arm Limited. 
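Both ports above follow the same shape: the NEON function keeps only an opaque Impl holding the user tensors plus a cpu:: operator, and run() packs the tensors into an ITensorPack before delegating. A minimal caller-side sketch of the public API, assuming the Compute Library headers and the usual init/configure/allocate/run order; the shape, data type and policies below are arbitrary illustrative choices, not values from this patch:

    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    int main()
    {
        Tensor a, b, out;
        const TensorInfo info(TensorShape(16U, 16U), 1, DataType::F32);
        a.allocator()->init(info);
        b.allocator()->init(info);
        out.allocator()->init(info);

        // Configure before allocating so the operator can request what it needs.
        NEPixelWiseMultiplication mul;
        mul.configure(&a, &b, &out, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);

        a.allocator()->allocate();
        b.allocator()->allocate();
        out.allocator()->allocate();

        // run() builds the {ACL_SRC_0, ACL_SRC_1, ACL_DST} pack and calls cpu::CpuMul::run().
        mul.run();
        return 0;
    }

NEPooling3dLayer above uses the same delegation, with the difference that its run pack is prebuilt once at configure() time together with the operator's workspace tensors.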
* * SPDX-License-Identifier: MIT * @@ -23,69 +23,59 @@ */ #include "arm_compute/runtime/NEON/functions/NEPoolingLayer.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/Tensor.h" -using namespace arm_compute; +#include "src/core/helpers/MemoryHelpers.h" +#include "src/cpu/operators/CpuPool2d.h" -NEPoolingLayer::NEPoolingLayer() - : _pooling_layer_kernel(), _border_handler(), _is_global_pooling_layer(false), _data_layout(DataLayout::NCHW) +namespace arm_compute { +struct NEPoolingLayer::Impl +{ + ITensor *src{nullptr}; + ITensor *dst{nullptr}; + ITensor *indices{nullptr}; + std::unique_ptr<cpu::CpuPool2d> op{nullptr}; + MemoryGroup memory_group{}; + ITensorPack run_pack{}; + WorkspaceData<Tensor> workspace_tensors{}; +}; + +NEPoolingLayer::~NEPoolingLayer() = default; + +NEPoolingLayer::NEPoolingLayer(std::shared_ptr<IMemoryManager> memory_manager) : _impl(std::make_unique<Impl>()) +{ + _impl->memory_group = MemoryGroup(std::move(memory_manager)); } void NEPoolingLayer::configure(ITensor *input, ITensor *output, const PoolingLayerInfo &pool_info, ITensor *indices) { - // Check if we have Global Pooling Layer - _is_global_pooling_layer = (input->info()->dimension(0) == pool_info.pool_size.width) && (input->info()->dimension(1) == pool_info.pool_size.height); + _impl->src = input; + _impl->dst = output; + _impl->indices = indices; + _impl->op = std::make_unique<cpu::CpuPool2d>(); + _impl->op->configure(input->info(), output->info(), pool_info, (indices) ? indices->info() : nullptr); - // Get data layout - _data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? input->info()->data_layout() : pool_info.data_layout; - - // Configure pooling kernel - _pooling_layer_kernel.configure(input, output, pool_info, indices); - - switch(_data_layout) - { - case DataLayout::NCHW: - { - // Configure border depending on operation required (quantize border in case of asymmetric data_type) - BorderMode border_mode = (pool_info.pool_type == PoolingType::MAX) ? 
BorderMode::REPLICATE : BorderMode::CONSTANT; - PixelValue zero_value(0.f); - if(is_data_type_quantized_asymmetric(input->info()->data_type()) && !pool_info.exclude_padding) - { - zero_value = PixelValue(0, input->info()->data_type(), input->info()->quantization_info()); - } - _border_handler.configure(input, _pooling_layer_kernel.border_size(), border_mode, zero_value); - break; - } - case DataLayout::NHWC: - break; - default: - ARM_COMPUTE_ERROR("Data layout not supported"); - } + _impl->run_pack = {{TensorType::ACL_SRC, _impl->src}, + {TensorType::ACL_DST_0, _impl->dst}, + {TensorType::ACL_DST_1, _impl->indices}}; + _impl->workspace_tensors = manage_workspace<Tensor>(_impl->op->workspace(), _impl->memory_group, _impl->run_pack); } -Status NEPoolingLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, const ITensorInfo *indices) +Status NEPoolingLayer::validate(const ITensorInfo *input, + const ITensorInfo *output, + const PoolingLayerInfo &pool_info, + const ITensorInfo *indices) { - return NEPoolingLayerKernel::validate(input, output, pool_info, indices); + return cpu::CpuPool2d::validate(input, output, pool_info, indices); } void NEPoolingLayer::run() { - switch(_data_layout) - { - case DataLayout::NCHW: - // Fill border - NEScheduler::get().schedule(&_border_handler, Window::DimY); - - // Run pooling layer - NEScheduler::get().schedule(&_pooling_layer_kernel, _is_global_pooling_layer ? Window::DimZ : Window::DimY); - break; - case DataLayout::NHWC: - // Run pooling layer - NEScheduler::get().schedule(&_pooling_layer_kernel, Window::DimX); - break; - default: - ARM_COMPUTE_ERROR("Data layout not supported"); - } + MemoryGroupResourceScope scope_mg(_impl->memory_group); + ARM_COMPUTE_ERROR_ON_NULLPTR(_impl->src, _impl->dst); + _impl->op->run(_impl->run_pack); } +} // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEPriorBoxLayer.cpp b/src/runtime/NEON/functions/NEPriorBoxLayer.cpp index 6e7d4ab667..dbb6bf9df1 100644 --- a/src/runtime/NEON/functions/NEPriorBoxLayer.cpp +++ b/src/runtime/NEON/functions/NEPriorBoxLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018 ARM Limited. + * Copyright (c) 2018-2021 Arm Limited. 
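NEPoolingLayer::run() above leans on MemoryGroupResourceScope so workspace tensors are only resident for the duration of the call. A simplified, self-contained model of that RAII idiom (my own stand-in types, not the library's):

    #include <cstdio>

    // Stand-ins for MemoryGroup / MemoryGroupResourceScope, reduced to the idiom.
    struct GroupModel
    {
        void acquire() { std::puts("acquire workspace backing memory"); }
        void release() { std::puts("release workspace backing memory"); }
    };

    struct ScopeModel
    {
        explicit ScopeModel(GroupModel &g) : _g(g) { _g.acquire(); }
        ~ScopeModel() { _g.release(); }
        GroupModel &_g;
    };

    void run(GroupModel &group)
    {
        ScopeModel scope_mg(group); // mirrors: MemoryGroupResourceScope scope_mg(_impl->memory_group);
        std::puts("op->run(run_pack)");
    } // scope ends here: backing memory is handed back even on early exit

    int main()
    {
        GroupModel group;
        run(group);
        return 0;
    }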
* * SPDX-License-Identifier: MIT * @@ -27,20 +27,31 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/NEON/NEScheduler.h" +#include "src/common/utils/Log.h" +#include "src/core/NEON/kernels/NEPriorBoxLayerKernel.h" + namespace arm_compute { -void NEPriorBoxLayer::configure(const ITensor *input1, const ITensor *input2, ITensor *output, const PriorBoxLayerInfo &info) +void NEPriorBoxLayer::configure(const ITensor *input1, + const ITensor *input2, + ITensor *output, + const PriorBoxLayerInfo &info) { - auto k = arm_compute::support::cpp14::make_unique<NEPriorBoxLayerKernel>(); + ARM_COMPUTE_LOG_PARAMS(input1, input2, output, info); + + auto k = std::make_unique<NEPriorBoxLayerKernel>(); k->configure(input1, input2, output, info); _kernel = std::move(k); } -Status NEPriorBoxLayer::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const PriorBoxLayerInfo &info) +Status NEPriorBoxLayer::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const PriorBoxLayerInfo &info) { return NEPriorBoxLayerKernel::validate(input1, input2, output, info); } diff --git a/src/runtime/NEON/functions/NEQLSTMLayer.cpp b/src/runtime/NEON/functions/NEQLSTMLayer.cpp index 083e3fddb4..dd78d10d16 100644 --- a/src/runtime/NEON/functions/NEQLSTMLayer.cpp +++ b/src/runtime/NEON/functions/NEQLSTMLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020 ARM Limited. + * Copyright (c) 2020-2022 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,29 +23,64 @@ */ #include "arm_compute/runtime/NEON/functions/NEQLSTMLayer.h" +#include "arm_compute/core/ITensorPack.h" #include "arm_compute/core/KernelDescriptors.h" #include "arm_compute/core/QuantizationInfo.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/InfoHelpers.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/NEON/NEScheduler.h" +#include "src/common/utils/Log.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h" +#include "src/cpu/kernels/CpuGemmLowpMatrixReductionKernel.h" + namespace arm_compute { using namespace arm_compute::utils::info_helpers; namespace { -Status validate_mm(GEMMLowpOutputStageInfo &gemmlowp_info, const ITensorInfo *mm_input, const ITensorInfo *mm_weights, const ITensorInfo *bias, - float gemmlowp_scale, const TensorInfo *mm_res_info, const TensorInfo *outstage_tensor_info) +Status validate_mm(GEMMLowpOutputStageInfo &gemmlowp_info, + const ITensorInfo *mm_input, + const ITensorInfo *mm_weights, + const ITensorInfo *bias, + float gemmlowp_scale, + const TensorInfo *mm_res_info, + const TensorInfo *outstage_tensor_info) { ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyCore::validate(mm_input, mm_weights, nullptr, mm_res_info)); - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(gemmlowp_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift)); - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOutputStage::validate(mm_res_info, bias, outstage_tensor_info, gemmlowp_info)); + ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier( + gemmlowp_scale, &gemmlowp_info.gemmlowp_multiplier, 
&gemmlowp_info.gemmlowp_shift)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEGEMMLowpOutputStage::validate(mm_res_info, bias, outstage_tensor_info, gemmlowp_info)); return Status{}; } } // namespace +Status NEQLSTMLayer::validate_layer_norm(const ITensorInfo &in, const ITensorInfo &weight, const ITensorInfo &bias) +{ + // Output quantization scale will be different, but ignored here + // since it will be configured at configure() stage. + const TensorInfo out{in}; + return NEQLSTMLayerNormalizationKernel::validate(&in, &out, &weight, &bias); +} + +void NEQLSTMLayer::configure_layer_norm(NEQLSTMLayer::LayerNormGate g, const ITensor *in) +{ + ARM_COMPUTE_ERROR_ON(!_has_layer_norm); + + Tensor &out = get_layer_norm_output(g); + _memory_group.manage(&out); + out.allocator()->init(*(in->info())); + + get_layer_norm(g) = std::make_unique<NEQLSTMLayerNormalizationKernel>(); + get_layer_norm(g)->configure(in, &out, get_layer_norm_weight(g), get_layer_norm_bias(g)); +} + +NEQLSTMLayer::TensorCopyKernel::~TensorCopyKernel() = default; + Status NEQLSTMLayer::TensorCopyKernel::validate(const ITensorInfo &src, const ITensorInfo &dst) { ARM_COMPUTE_RETURN_ERROR_ON(src.tensor_shape().num_dimensions() > max_dimension_supported); @@ -58,6 +93,8 @@ Status NEQLSTMLayer::TensorCopyKernel::validate(const ITensorInfo &src, const IT void NEQLSTMLayer::TensorCopyKernel::configure(ITensor &src, ITensor &dst) { ARM_COMPUTE_ERROR_THROW_ON(NEQLSTMLayer::TensorCopyKernel::validate(*src.info(), *dst.info())); + ARM_COMPUTE_LOG_PARAMS(src, dst); + _src = &src; _dst = &dst; _row_size = std::min(_src->info()->tensor_shape().x(), _dst->info()->tensor_shape().x()); @@ -66,25 +103,108 @@ void NEQLSTMLayer::TensorCopyKernel::configure(ITensor &src, ITensor &dst) void NEQLSTMLayer::TensorCopyKernel::run() { - Iterator input_iter{ _src, _window }; - Iterator output_iter{ _dst, _window }; + Iterator input_iter{_src, _window}; + Iterator output_iter{_dst, _window}; - execute_window_loop(_window, [&](const Coordinates &) - { - memcpy(output_iter.ptr(), input_iter.ptr(), _row_size); - }, - input_iter, output_iter); + execute_window_loop( + _window, [&](const Coordinates &) { memcpy(output_iter.ptr(), input_iter.ptr(), _row_size); }, input_iter, + output_iter); } +NEQLSTMLayer::~NEQLSTMLayer() = default; + NEQLSTMLayer::NEQLSTMLayer(std::shared_ptr<IMemoryManager> memory_manager) + : _memory_group(), + _dequantize_input_to_forget_weights(), + _quantize_input_to_forget_weights(), + _transpose_input_to_forget_weights(), + _transpose_input_to_cell_weights(), + _transpose_input_to_output_weights(), + _transpose_input_to_input_weights(), + _transpose_recurrent_to_forget_weights(), + _transpose_recurrent_to_cell_weights(), + _transpose_recurrent_to_output_weights(), + _transpose_recurrent_to_input_weights(), + _transpose_projection_weights(), + _input_to_input_reduction(), + _recurrent_to_input_reduction(), + _input_to_forget_reduction(), + _recurrent_to_forget_reduction(), + _input_to_cell_reduction(), + _recurrent_to_cell_reduction(), + _input_to_output_reduction(), + _recurrent_to_output_reduction(), + _projection_reduction(), + _projection_bias_add(), + _mm_input_to_forget(), + _mm_recurrent_to_forget(), + _pixelwise_mul_cell_to_forget(), + _input_to_forget_outstage(), + _recurrent_to_forget_outstage(), + _cell_to_forget_outstage(), + _accumulate_input_recurrent_forget(), + _accumulate_cell_forget(), + _forget_gate_sigmoid(), + _mm_input_to_cell(), + _input_to_cell_outstage(), + _mm_recurrent_to_cell(), + _recurrent_to_cell_outstage(), + 
_accumulate_input_recurrent_modulation(), + _cell_gate_tanh(), + _input_gate_sub(), + _mm_input_to_input(), + _input_to_input_outstage(), + _mm_recurrent_to_input(), + _recurrent_to_input_outstage(), + _accumulate_input_recurrent_input(), + _pixelwise_mul_cell_to_input(), + _cell_to_input_outstage(), + _accumulate_cell_input(), + _input_gate_sigmoid(), + _pixelwise_mul_forget_cell(), + _pixelwise_mul_input_cell(), + _add_forget_cell(), + _cell_clip(), + _mm_input_to_output(), + _input_to_output_outstage(), + _mm_recurrent_to_output(), + _recurrent_to_output_outstage(), + _accumulate_input_recurrent_output(), + _pixelwise_mul_cell_to_output(), + _cell_to_output_outstage(), + _accumulate_cell_to_output(), + _output_gate_sigmoid(), + _hidden_tanh(), + _pixelwise_mul_hidden(), + _hidden_outstage(), + _mm_projection(), + _projection_outstage(), + _accumulate_projection(), + _projection_clip(), + _projection_bias_copy(), + _projection_output_to_accumulate_copy(), + _projection_accumulate_to_output_copy(), + _hidden_to_output_copy(), + _layer_norms(), + _copy_output(), + _layer_norm_weights(), + _layer_norm_bias(), + _layer_norm_output() { _memory_group = MemoryGroup(std::move(memory_manager)); } -void NEQLSTMLayer::configure_mm(NEGEMMLowpMatrixMultiplyCore &mm, NEGEMMLowpOutputStage &outstage, GEMMLowpOutputStageInfo &gemmlowp_info, - const ITensor *mm_input, const ITensor *mm_weights, const ITensor *bias, - Tensor *mm_res, Tensor *outstage_res, float gemmlowp_scale, - const TensorInfo &mm_res_info, const TensorInfo &outstage_tensor_info) +void NEQLSTMLayer::configure_mm(NEGEMMLowpMatrixMultiplyCore &mm, + NEGEMMLowpOutputStage &outstage, + GEMMLowpOutputStageInfo &gemmlowp_info, + const ITensor *mm_input, + const ITensor *mm_weights, + const ITensor *bias, + Tensor *mm_res, + Tensor *outstage_res, + float gemmlowp_scale, + const TensorInfo &mm_res_info, + const TensorInfo &outstage_tensor_info) { _memory_group.manage(mm_res); _memory_group.manage(outstage_res); @@ -96,33 +216,88 @@ void NEQLSTMLayer::configure_mm(NEGEMMLowpMatrixMultiplyCore &mm, NEGEMMLowpOutp mm.configure(mm_input, mm_weights, nullptr, mm_res); // Configure output stage - quantization::calculate_quantized_multiplier(gemmlowp_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift); + quantization::calculate_quantized_multiplier(gemmlowp_scale, &gemmlowp_info.gemmlowp_multiplier, + &gemmlowp_info.gemmlowp_shift); outstage.configure(mm_res, bias, outstage_res, gemmlowp_info); mm_res->allocator()->allocate(); } -void NEQLSTMLayer::configure(const ITensor *input, - const ITensor *input_to_forget_weights, const ITensor *input_to_cell_weights, const ITensor *input_to_output_weights, - const ITensor *recurrent_to_forget_weights, const ITensor *recurrent_to_cell_weights, const ITensor *recurrent_to_output_weights, - const ITensor *forget_gate_bias, const ITensor *cell_bias, const ITensor *output_gate_bias, - const ITensor *cell_state_in, const ITensor *output_state_in, - ITensor *cell_state_out, ITensor *output_state_out, ITensor *output, +void NEQLSTMLayer::configure(const ITensor *input, + const ITensor *input_to_forget_weights, + const ITensor *input_to_cell_weights, + const ITensor *input_to_output_weights, + const ITensor *recurrent_to_forget_weights, + const ITensor *recurrent_to_cell_weights, + const ITensor *recurrent_to_output_weights, + const ITensor *forget_gate_bias, + const ITensor *cell_bias, + const ITensor *output_gate_bias, + const ITensor *cell_state_in, + ITensor *output_state_in, + ITensor 
*cell_state_out, + ITensor *output_state_out, + ITensor *output, const LSTMParams<ITensor> &lstm_params) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, - forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out, output_state_out); + forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, + cell_state_out, output_state_out); + + ARM_COMPUTE_LOG_PARAMS(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, + recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, + forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, + cell_state_out, output_state_out); // Set lstm parameters LSTMParams<ITensorInfo> lstm_params_info{}; build_lstm_params_tensor_info(lstm_params, &lstm_params_info); - // Validate - ARM_COMPUTE_ERROR_THROW_ON(NEQLSTMLayer::validate(input->info(), input_to_forget_weights->info(), input_to_cell_weights->info(), input_to_output_weights->info(), - recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(), - forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), - cell_state_in->info(), output_state_in->info(), cell_state_out->info(), output_state_out->info(), output->info(), - lstm_params_info)); + _input_to_forget_weights_transposed.info()->set_quantization_info( + input_to_forget_weights->info()->quantization_info()); + _input_to_cell_weights_transposed.info()->set_quantization_info(input_to_cell_weights->info()->quantization_info()); + _input_to_output_weights_transposed.info()->set_quantization_info( + input_to_output_weights->info()->quantization_info()); + _recurrent_to_forget_weights_transposed.info()->set_quantization_info( + recurrent_to_forget_weights->info()->quantization_info()); + _recurrent_to_cell_weights_transposed.info()->set_quantization_info( + recurrent_to_cell_weights->info()->quantization_info()); + _recurrent_to_output_weights_transposed.info()->set_quantization_info( + recurrent_to_output_weights->info()->quantization_info()); + + if (input_to_forget_weights->info()->data_type() == DataType::QASYMM8_SIGNED) + { + _convert_input_to_forget_weights_to_qsymm8 = true; + // Setup dequantize output tensor to go from QASYMM8_SIGNED -> F32 + + _input_to_forget_weights_f32.allocator()->init( + TensorInfo(input_to_forget_weights->info()->tensor_shape(), 1, DataType::F32) + .set_data_layout(input_to_forget_weights->info()->data_layout())); + // Setup the quantize output tensor to go from F32 -> QSYMM8 + _input_to_forget_weights_symm8.allocator()->init( + (TensorInfo(input_to_forget_weights->info()->tensor_shape(), 1, DataType::QSYMM8) + .set_data_layout(input_to_forget_weights->info()->data_layout()) + .set_quantization_info(input_to_forget_weights->info()->quantization_info()))); + + _dequantize_input_to_forget_weights.configure(input_to_forget_weights, &_input_to_forget_weights_f32); + _quantize_input_to_forget_weights.configure(&_input_to_forget_weights_f32, &_input_to_forget_weights_symm8); + + ARM_COMPUTE_ERROR_THROW_ON(NEQLSTMLayer::validate( + input->info(), _input_to_forget_weights_symm8.info(), input_to_cell_weights->info(), + input_to_output_weights->info(), recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), + recurrent_to_output_weights->info(), forget_gate_bias->info(), cell_bias->info(), 
output_gate_bias->info(), + cell_state_in->info(), output_state_in->info(), cell_state_out->info(), output_state_out->info(), + output->info(), lstm_params_info)); + } + else + { + ARM_COMPUTE_ERROR_THROW_ON(NEQLSTMLayer::validate( + input->info(), input_to_forget_weights->info(), input_to_cell_weights->info(), + input_to_output_weights->info(), recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), + recurrent_to_output_weights->info(), forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), + cell_state_in->info(), output_state_in->info(), cell_state_out->info(), output_state_out->info(), + output->info(), lstm_params_info)); + } const int batch_size = input->info()->dimension(1); const int num_units = input_to_output_weights->info()->dimension(1); @@ -133,7 +308,9 @@ void NEQLSTMLayer::configure(const ITensor *input, const UniformQuantizationInfo qoutput_state_in = output_state_in->info()->quantization_info().uniform(); _projection_bias = lstm_params.projection_bias(); - _input_to_forget_weights = input_to_forget_weights; + _input_to_forget_weights = (input_to_forget_weights->info()->data_type() == DataType::QASYMM8_SIGNED) + ? &_input_to_forget_weights_symm8 + : input_to_forget_weights; _input_to_cell_weights = input_to_cell_weights; _input_to_output_weights = input_to_output_weights; _recurrent_to_forget_weights = recurrent_to_forget_weights; @@ -143,7 +320,7 @@ void NEQLSTMLayer::configure(const ITensor *input, // Layer normalization _has_layer_norm = lstm_params.use_layer_norm(); - if(_has_layer_norm) + if (_has_layer_norm) { set_layer_norm_weight(lstm_params.forget_layer_norm_weights(), LayerNormGate::Forget); set_layer_norm_weight(lstm_params.cell_layer_norm_weights(), LayerNormGate::Cell); @@ -165,45 +342,79 @@ void NEQLSTMLayer::configure(const ITensor *input, // Calculate quantized parameters for clipping. int16_t quantized_cell_clip = 0; - if(lstm_params.cell_clip() > 0.0f) + if (lstm_params.cell_clip() > 0.0f) { quantized_cell_clip = quantize_qsymm16(lstm_params.cell_clip(), qcell_state_in); } _has_cell_clipping = quantized_cell_clip > 0; // Precompute effective bias for optimizing the matmul computations. 
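The reduction kernels configured below exploit the identity sum_j (x_j - z) * W[i][j] = sum_j x_j * W[i][j] + (-z) * sum_j W[i][j]: the second term is input-independent, so one matrix-A reduction with offset -z per weight matrix turns the zero-point correction into a precomputed "effective bias". A toy check with arbitrary values (not from this patch):

    #include <cstdint>
    #include <cstdio>

    int main()
    {
        const int32_t z = 5;                               // input offset, cf. -qinput.offset below
        const int8_t  x[3]    = {12, -3, 7};               // quantized input row
        const int8_t  W[2][3] = {{2, -1, 4}, {0, 3, -2}};  // quantized weights, 2 units

        for (int i = 0; i < 2; ++i)
        {
            int32_t exact = 0, dot = 0, row_sum = 0;
            for (int j = 0; j < 3; ++j)
            {
                exact   += (x[j] - z) * W[i][j]; // what the matmul must produce
                dot     += x[j] * W[i][j];       // plain integer dot product
                row_sum += W[i][j];
            }
            const int32_t eff_bias = -z * row_sum; // what the reduction kernels precompute
            std::printf("unit %d: exact=%d, dot+eff_bias=%d\n", i, (int)exact, (int)(dot + eff_bias));
        }
        return 0;
    }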
- if(!_has_cifg) + if (!_has_cifg) { _input_to_input_weights = lstm_params.input_to_input_weights(); _recurrent_to_input_weights = lstm_params.recurrent_to_input_weights(); - _input_to_input_reduction.configure(_input_to_input_weights, &_input_to_input_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); - _recurrent_to_input_reduction.configure(_recurrent_to_input_weights, &_recurrent_to_input_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); + _input_to_input_reduction = std::make_unique<cpu::kernels::CpuGemmLowpMatrixAReductionKernel>(); + _recurrent_to_input_reduction = std::make_unique<cpu::kernels::CpuGemmLowpMatrixAReductionKernel>(); + _input_to_input_reduction->configure(_input_to_input_weights->info(), _input_to_input_eff_bias.info(), + GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); + _recurrent_to_input_reduction->configure( + _recurrent_to_input_weights->info(), _recurrent_to_input_eff_bias.info(), + GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); } - _input_to_forget_reduction.configure(input_to_forget_weights, &_input_to_forget_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); - _recurrent_to_forget_reduction.configure(recurrent_to_forget_weights, &_recurrent_to_forget_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); - _input_to_cell_reduction.configure(input_to_cell_weights, &_input_to_cell_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); - _recurrent_to_cell_reduction.configure(recurrent_to_cell_weights, &_recurrent_to_cell_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); - _input_to_output_reduction.configure(input_to_output_weights, &_input_to_output_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); - _recurrent_to_output_reduction.configure(recurrent_to_output_weights, &_recurrent_to_output_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); - if(_has_projection) + + _input_to_forget_reduction = std::make_unique<cpu::kernels::CpuGemmLowpMatrixAReductionKernel>(); + _recurrent_to_forget_reduction = std::make_unique<cpu::kernels::CpuGemmLowpMatrixAReductionKernel>(); + _input_to_cell_reduction = std::make_unique<cpu::kernels::CpuGemmLowpMatrixAReductionKernel>(); + _recurrent_to_cell_reduction = std::make_unique<cpu::kernels::CpuGemmLowpMatrixAReductionKernel>(); + _input_to_output_reduction = std::make_unique<cpu::kernels::CpuGemmLowpMatrixAReductionKernel>(); + _recurrent_to_output_reduction = std::make_unique<cpu::kernels::CpuGemmLowpMatrixAReductionKernel>(); + + _input_to_forget_reduction->configure(input_to_forget_weights->info(), _input_to_forget_eff_bias.info(), + GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); + _recurrent_to_forget_reduction->configure( + recurrent_to_forget_weights->info(), _recurrent_to_forget_eff_bias.info(), + GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); + _input_to_cell_reduction->configure(input_to_cell_weights->info(), _input_to_cell_eff_bias.info(), + GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); + _recurrent_to_cell_reduction->configure( + recurrent_to_cell_weights->info(), _recurrent_to_cell_eff_bias.info(), + GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); + 
_input_to_output_reduction->configure(input_to_output_weights->info(), _input_to_output_eff_bias.info(), + GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); + _recurrent_to_output_reduction->configure( + recurrent_to_output_weights->info(), _recurrent_to_output_eff_bias.info(), + GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); + if (_has_projection) { - _projection_reduction.configure(_projection_weights, &_projection_eff_bias, GEMMLowpReductionKernelInfo(output_size, false, lstm_params.hidden_state_zero(), true)); + _projection_reduction = std::make_unique<cpu::kernels::CpuGemmLowpMatrixAReductionKernel>(); + _projection_reduction->configure( + _projection_weights->info(), _projection_eff_bias.info(), + GEMMLowpReductionKernelInfo(output_size, false, lstm_params.hidden_state_zero(), true)); + if (_projection_bias != nullptr) + { + _projection_bias_add.configure(_projection_bias, &_projection_eff_bias, &_projection_eff_bias, + ConvertPolicy::SATURATE); + } } // Pre-transpose weights to be used in GEMM. _transpose_input_to_forget_weights.configure(input_to_forget_weights, &_input_to_forget_weights_transposed); _transpose_input_to_cell_weights.configure(input_to_cell_weights, &_input_to_cell_weights_transposed); _transpose_input_to_output_weights.configure(input_to_output_weights, &_input_to_output_weights_transposed); - _transpose_recurrent_to_forget_weights.configure(recurrent_to_forget_weights, &_recurrent_to_forget_weights_transposed); + _transpose_recurrent_to_forget_weights.configure(recurrent_to_forget_weights, + &_recurrent_to_forget_weights_transposed); _transpose_recurrent_to_cell_weights.configure(recurrent_to_cell_weights, &_recurrent_to_cell_weights_transposed); - _transpose_recurrent_to_output_weights.configure(recurrent_to_output_weights, &_recurrent_to_output_weights_transposed); - if(!_has_cifg) + _transpose_recurrent_to_output_weights.configure(recurrent_to_output_weights, + &_recurrent_to_output_weights_transposed); + if (!_has_cifg) { - _transpose_input_to_input_weights.configure(lstm_params.input_to_input_weights(), &_input_to_input_weights_transposed); - _transpose_recurrent_to_input_weights.configure(lstm_params.recurrent_to_input_weights(), &_recurrent_to_input_weights_transposed); + _transpose_input_to_input_weights.configure(lstm_params.input_to_input_weights(), + &_input_to_input_weights_transposed); + _transpose_recurrent_to_input_weights.configure(lstm_params.recurrent_to_input_weights(), + &_recurrent_to_input_weights_transposed); } - if(_has_projection) + if (_has_projection) { _transpose_projection_weights.configure(_projection_weights, &_projection_weights_transposed); } @@ -216,40 +427,52 @@ void NEQLSTMLayer::configure(const ITensor *input, const TensorInfo mm_out_info(TensorShape(num_units, batch_size), 1, DataType::S32); // Forget gate. 
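Every gate matmul below rescales from the raw product scale (weight scale times activation scale) to the gate's QSYMM16 intermediate scale, and quantization::calculate_quantized_multiplier() turns that float factor into a fixed-point multiplier plus shift for the output stage. A simplified re-derivation of that decomposition, with hypothetical scales (the library routine has extra handling this sketch omits):

    #include <cmath>
    #include <cstdint>
    #include <cstdio>

    int main()
    {
        const double w_scale      = 0.007;      // weights (illustrative)
        const double x_scale      = 1.0 / 255;  // activations (illustrative)
        const double intermediate = 1.0 / 4096; // gate intermediate scale (illustrative)

        const double s = w_scale * x_scale / intermediate; // cf. input_to_forget_scale below

        int shift = 0;
        const double q = std::frexp(s, &shift);    // s = q * 2^shift, q in [0.5, 1)
        int64_t m = std::llround(q * (1ll << 31)); // Q0.31 mantissa
        if (m == (1ll << 31)) { m /= 2; ++shift; } // rounding pushed q up to 1.0
        const int32_t multiplier = static_cast<int32_t>(m);

        std::printf("scale %.8f -> multiplier=%d, shift=%d\n", s, multiplier, shift);
        return 0;
    }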
- const TensorInfo forget_gate_outstage_info(mm_out_info.tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.forget_intermediate_scale(), 0)); - const float input_to_forget_scale = input_to_forget_weights->info()->quantization_info().uniform().scale * qinput.scale / lstm_params.forget_intermediate_scale(); - configure_mm(_mm_input_to_forget, _input_to_forget_outstage, gemmlowp_info, - input, &_input_to_forget_weights_transposed, &_input_to_forget_eff_bias, - &_mm_input_to_forget_res, &_input_to_forget_outstage_res, input_to_forget_scale, - mm_out_info, forget_gate_outstage_info); - - const float recurrent_to_forget_scale = recurrent_to_forget_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.forget_intermediate_scale(); - configure_mm(_mm_recurrent_to_forget, _recurrent_to_forget_outstage, gemmlowp_info, - output_state_in, &_recurrent_to_forget_weights_transposed, &_recurrent_to_forget_eff_bias, - &_mm_recurrent_to_forget_res, &_recurrent_to_forget_outstage_res, recurrent_to_forget_scale, - mm_out_info, forget_gate_outstage_info); - - _accumulate_input_recurrent_forget.configure(&_input_to_forget_outstage_res, &_recurrent_to_forget_outstage_res, &_recurrent_to_forget_outstage_res, ConvertPolicy::SATURATE); + const TensorInfo forget_gate_outstage_info(mm_out_info.tensor_shape(), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.forget_intermediate_scale(), 0)); + const float input_to_forget_scale = input_to_forget_weights->info()->quantization_info().uniform().scale * + qinput.scale / lstm_params.forget_intermediate_scale(); + configure_mm(_mm_input_to_forget, _input_to_forget_outstage, gemmlowp_info, input, + &_input_to_forget_weights_transposed, &_input_to_forget_eff_bias, &_mm_input_to_forget_res, + &_input_to_forget_outstage_res, input_to_forget_scale, mm_out_info, forget_gate_outstage_info); + + const float recurrent_to_forget_scale = recurrent_to_forget_weights->info()->quantization_info().uniform().scale * + qoutput_state_in.scale / lstm_params.forget_intermediate_scale(); + configure_mm(_mm_recurrent_to_forget, _recurrent_to_forget_outstage, gemmlowp_info, output_state_in, + &_recurrent_to_forget_weights_transposed, &_recurrent_to_forget_eff_bias, &_mm_recurrent_to_forget_res, + &_recurrent_to_forget_outstage_res, recurrent_to_forget_scale, mm_out_info, forget_gate_outstage_info); + + _accumulate_input_recurrent_forget.configure(&_input_to_forget_outstage_res, &_recurrent_to_forget_outstage_res, + &_recurrent_to_forget_outstage_res, ConvertPolicy::SATURATE); _input_to_forget_outstage_res.allocator()->allocate(); - if(_has_peephole) + if (_has_peephole) { _mul_cell_to_forget_res.allocator()->init(TensorInfo(cell_state_in->info()->tensor_shape(), 1, DataType::S32)); _memory_group.manage(&_mul_cell_to_forget_res); - _pixelwise_mul_cell_to_forget.configure(cell_state_in, lstm_params.cell_to_forget_weights(), &_mul_cell_to_forget_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); - _cell_to_forget_outstage_res.allocator()->init(TensorInfo(_mul_cell_to_forget_res.info()->tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.forget_intermediate_scale(), 0))); + _pixelwise_mul_cell_to_forget.configure(cell_state_in, lstm_params.cell_to_forget_weights(), + &_mul_cell_to_forget_res, 1.f, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); + _cell_to_forget_outstage_res.allocator()->init( + TensorInfo(_mul_cell_to_forget_res.info()->tensor_shape(), 1, DataType::QSYMM16, + 
QuantizationInfo(lstm_params.forget_intermediate_scale(), 0))); _memory_group.manage(&_cell_to_forget_outstage_res); - const float cell_to_forget_scale = std::pow(2, cell_shift) * lstm_params.cell_to_forget_weights()->info()->quantization_info().uniform().scale / lstm_params.forget_intermediate_scale(); - quantization::calculate_quantized_multiplier(cell_to_forget_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift); - _cell_to_forget_outstage.configure(&_mul_cell_to_forget_res, nullptr, &_cell_to_forget_outstage_res, gemmlowp_info); + const float cell_to_forget_scale = + std::pow(2, cell_shift) * + lstm_params.cell_to_forget_weights()->info()->quantization_info().uniform().scale / + lstm_params.forget_intermediate_scale(); + quantization::calculate_quantized_multiplier(cell_to_forget_scale, &gemmlowp_info.gemmlowp_multiplier, + &gemmlowp_info.gemmlowp_shift); + _cell_to_forget_outstage.configure(&_mul_cell_to_forget_res, nullptr, &_cell_to_forget_outstage_res, + gemmlowp_info); _mul_cell_to_forget_res.allocator()->allocate(); - _accumulate_cell_forget.configure(&_recurrent_to_forget_outstage_res, &_cell_to_forget_outstage_res, &_recurrent_to_forget_outstage_res, ConvertPolicy::SATURATE); + _accumulate_cell_forget.configure(&_recurrent_to_forget_outstage_res, &_cell_to_forget_outstage_res, + &_recurrent_to_forget_outstage_res, ConvertPolicy::SATURATE); _cell_to_forget_outstage_res.allocator()->allocate(); } Tensor *forget_activation_input = &_recurrent_to_forget_outstage_res; - if(_has_layer_norm) + if (_has_layer_norm) { configure_layer_norm(LayerNormGate::Forget, forget_activation_input); forget_activation_input->allocator()->allocate(); @@ -258,33 +481,36 @@ void NEQLSTMLayer::configure(const ITensor *input, // Output quantization info of Sigmoid and Tanh activations const QuantizationInfo sigmoid_tanh_outqinfo(1.f / 32768.f, 0); - const TensorInfo forget_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo); + const TensorInfo forget_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo); _memory_group.manage(&_forget_gate); _forget_gate.allocator()->init(forget_gate_info); - _forget_gate_sigmoid.configure(forget_activation_input, &_forget_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _forget_gate_sigmoid.configure(forget_activation_input, &_forget_gate, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); forget_activation_input->allocator()->allocate(); // Modulation gate. 
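The sigmoid_tanh_outqinfo used above pins all gate activations to QSYMM16 with scale 1/32768, i.e. Q0.15 fixed point, so a function whose range is (-1, 1) spans the full int16 range. A quick round-trip check:

    #include <algorithm>
    #include <cmath>
    #include <cstdint>
    #include <cstdio>

    int main()
    {
        const float scale = 1.f / 32768.f;             // Q0.15, cf. sigmoid_tanh_outqinfo above
        const float y = 1.f / (1.f + std::exp(-0.7f)); // a sigmoid output in (0, 1)

        const long    r = std::lround(y / scale);
        const int16_t q = static_cast<int16_t>(std::min(32767L, std::max(-32768L, r)));

        std::printf("%f -> %d -> %f (error %g)\n", y, q, q * scale, std::fabs(y - q * scale));
        return 0;
    }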
- const TensorInfo cell_outstage_info(mm_out_info.tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.cell_intermediate_scale(), 0)); - const float input_to_cell_scale = input_to_cell_weights->info()->quantization_info().uniform().scale * qinput.scale / lstm_params.cell_intermediate_scale(); - configure_mm(_mm_input_to_cell, _input_to_cell_outstage, gemmlowp_info, - input, &_input_to_cell_weights_transposed, &_input_to_cell_eff_bias, - &_mm_input_to_cell_res, &_input_to_cell_outstage_res, input_to_cell_scale, + const TensorInfo cell_outstage_info(mm_out_info.tensor_shape(), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.cell_intermediate_scale(), 0)); + const float input_to_cell_scale = input_to_cell_weights->info()->quantization_info().uniform().scale * + qinput.scale / lstm_params.cell_intermediate_scale(); + configure_mm(_mm_input_to_cell, _input_to_cell_outstage, gemmlowp_info, input, &_input_to_cell_weights_transposed, + &_input_to_cell_eff_bias, &_mm_input_to_cell_res, &_input_to_cell_outstage_res, input_to_cell_scale, mm_out_info, cell_outstage_info); - const float recurrent_to_cell_scale = recurrent_to_cell_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.cell_intermediate_scale(); - configure_mm(_mm_recurrent_to_cell, _recurrent_to_cell_outstage, gemmlowp_info, - output_state_in, &_recurrent_to_cell_weights_transposed, &_recurrent_to_cell_eff_bias, - &_mm_recurrent_to_cell_res, &_recurrent_to_cell_outstage_res, recurrent_to_cell_scale, - mm_out_info, cell_outstage_info); + const float recurrent_to_cell_scale = recurrent_to_cell_weights->info()->quantization_info().uniform().scale * + qoutput_state_in.scale / lstm_params.cell_intermediate_scale(); + configure_mm(_mm_recurrent_to_cell, _recurrent_to_cell_outstage, gemmlowp_info, output_state_in, + &_recurrent_to_cell_weights_transposed, &_recurrent_to_cell_eff_bias, &_mm_recurrent_to_cell_res, + &_recurrent_to_cell_outstage_res, recurrent_to_cell_scale, mm_out_info, cell_outstage_info); - _accumulate_input_recurrent_modulation.configure(&_input_to_cell_outstage_res, &_recurrent_to_cell_outstage_res, &_recurrent_to_cell_outstage_res, ConvertPolicy::SATURATE); + _accumulate_input_recurrent_modulation.configure(&_input_to_cell_outstage_res, &_recurrent_to_cell_outstage_res, + &_recurrent_to_cell_outstage_res, ConvertPolicy::SATURATE); _input_to_cell_outstage_res.allocator()->allocate(); Tensor *cell_activation_input = &_recurrent_to_cell_outstage_res; - if(_has_layer_norm) + if (_has_layer_norm) { configure_layer_norm(LayerNormGate::Cell, cell_activation_input); cell_activation_input->allocator()->allocate(); @@ -295,14 +521,15 @@ void NEQLSTMLayer::configure(const ITensor *input, _memory_group.manage(&_cell_gate); _cell_gate.allocator()->init(cell_gate_info); - _cell_gate_tanh.configure(cell_activation_input, &_cell_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)); + _cell_gate_tanh.configure(cell_activation_input, &_cell_gate, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)); cell_activation_input->allocator()->allocate(); // Input gate. 
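With CIFG enabled, the branch below derives the input gate as 1 - forget_gate by a saturating subtraction from a tensor of ones; in the Q0.15 gate format, "one" is approximated by the largest representable value. A minimal sketch of that arithmetic (values illustrative):

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>

    static int16_t sat_sub_q15(int16_t a, int16_t b)
    {
        const int32_t r = static_cast<int32_t>(a) - b;
        return static_cast<int16_t>(std::min(32767, std::max(-32768, r)));
    }

    int main()
    {
        const int16_t one    = 32767; // ~1.0 in Q0.15
        const int16_t forget = 26214; // ~0.8
        std::printf("input gate ~ %f\n", sat_sub_q15(one, forget) / 32768.f);
        return 0;
    }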
const TensorInfo input_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo); _input_gate.allocator()->init(input_gate_info); _memory_group.manage(&_input_gate); - if(_has_cifg) + if (_has_cifg) { _ones.allocator()->init(*_forget_gate.info()); _input_gate_sub.configure(&_ones, &_forget_gate, &_input_gate, ConvertPolicy::SATURATE); @@ -310,104 +537,137 @@ void NEQLSTMLayer::configure(const ITensor *input, } else { - const TensorInfo input_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.input_intermediate_scale(), 0)); - const float input_to_input_scale = _input_to_input_weights->info()->quantization_info().uniform().scale * qinput.scale / lstm_params.input_intermediate_scale(); - configure_mm(_mm_input_to_input, _input_to_input_outstage, gemmlowp_info, - input, &_input_to_input_weights_transposed, &_input_to_input_eff_bias, - &_mm_input_to_input_res, &_input_to_input_outstage_res, input_to_input_scale, - mm_out_info, input_outstage_info); - - const float recurrent_to_input_scale = _recurrent_to_input_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.input_intermediate_scale(); - configure_mm(_mm_recurrent_to_input, _recurrent_to_input_outstage, gemmlowp_info, - output_state_in, &_recurrent_to_input_weights_transposed, &_recurrent_to_input_eff_bias, + const TensorInfo input_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.input_intermediate_scale(), 0)); + const float input_to_input_scale = _input_to_input_weights->info()->quantization_info().uniform().scale * + qinput.scale / lstm_params.input_intermediate_scale(); + configure_mm(_mm_input_to_input, _input_to_input_outstage, gemmlowp_info, input, + &_input_to_input_weights_transposed, &_input_to_input_eff_bias, &_mm_input_to_input_res, + &_input_to_input_outstage_res, input_to_input_scale, mm_out_info, input_outstage_info); + + const float recurrent_to_input_scale = + _recurrent_to_input_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / + lstm_params.input_intermediate_scale(); + configure_mm(_mm_recurrent_to_input, _recurrent_to_input_outstage, gemmlowp_info, output_state_in, + &_recurrent_to_input_weights_transposed, &_recurrent_to_input_eff_bias, &_mm_recurrent_to_input_res, &_recurrent_to_input_outstage_res, recurrent_to_input_scale, mm_out_info, input_outstage_info); - _accumulate_input_recurrent_input.configure(&_input_to_input_outstage_res, &_recurrent_to_input_outstage_res, &_recurrent_to_input_outstage_res, ConvertPolicy::SATURATE); + _accumulate_input_recurrent_input.configure(&_input_to_input_outstage_res, &_recurrent_to_input_outstage_res, + &_recurrent_to_input_outstage_res, ConvertPolicy::SATURATE); _input_to_input_outstage_res.allocator()->allocate(); - if(_has_peephole) + if (_has_peephole) { - _mul_cell_to_input_res.allocator()->init(TensorInfo(cell_state_in->info()->tensor_shape(), 1, DataType::S32)); + _mul_cell_to_input_res.allocator()->init( + TensorInfo(cell_state_in->info()->tensor_shape(), 1, DataType::S32)); _memory_group.manage(&_mul_cell_to_input_res); - _pixelwise_mul_cell_to_input.configure(cell_state_in, lstm_params.cell_to_input_weights(), &_mul_cell_to_input_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); - const float cell_to_input_scale = std::pow(2, cell_shift) * lstm_params.cell_to_input_weights()->info()->quantization_info().uniform().scale / 
lstm_params.input_intermediate_scale(); - quantization::calculate_quantized_multiplier(cell_to_input_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift); - _cell_to_input_outstage_res.allocator()->init(TensorInfo(_mul_cell_to_input_res.info()->tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.input_intermediate_scale(), 0))); + _pixelwise_mul_cell_to_input.configure(cell_state_in, lstm_params.cell_to_input_weights(), + &_mul_cell_to_input_res, 1.f, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); + const float cell_to_input_scale = + std::pow(2, cell_shift) * + lstm_params.cell_to_input_weights()->info()->quantization_info().uniform().scale / + lstm_params.input_intermediate_scale(); + quantization::calculate_quantized_multiplier(cell_to_input_scale, &gemmlowp_info.gemmlowp_multiplier, + &gemmlowp_info.gemmlowp_shift); + _cell_to_input_outstage_res.allocator()->init( + TensorInfo(_mul_cell_to_input_res.info()->tensor_shape(), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.input_intermediate_scale(), 0))); _memory_group.manage(&_cell_to_input_outstage_res); - _cell_to_input_outstage.configure(&_mul_cell_to_input_res, nullptr, &_cell_to_input_outstage_res, gemmlowp_info); + _cell_to_input_outstage.configure(&_mul_cell_to_input_res, nullptr, &_cell_to_input_outstage_res, + gemmlowp_info); _mul_cell_to_input_res.allocator()->allocate(); - _accumulate_cell_input.configure(&_recurrent_to_input_outstage_res, &_cell_to_input_outstage_res, &_recurrent_to_input_outstage_res, ConvertPolicy::SATURATE); + _accumulate_cell_input.configure(&_recurrent_to_input_outstage_res, &_cell_to_input_outstage_res, + &_recurrent_to_input_outstage_res, ConvertPolicy::SATURATE); _cell_to_input_outstage_res.allocator()->allocate(); } Tensor *input_activation_input = &_recurrent_to_input_outstage_res; - if(_has_layer_norm) + if (_has_layer_norm) { configure_layer_norm(LayerNormGate::Input, input_activation_input); input_activation_input->allocator()->allocate(); input_activation_input = &get_layer_norm_output(LayerNormGate::Input); } - _input_gate_sigmoid.configure(input_activation_input, &_input_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _input_gate_sigmoid.configure(input_activation_input, &_input_gate, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); input_activation_input->allocator()->allocate(); } // Cell. 
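The cell block below computes c_t = f * c_{t-1} + i * g with saturating arithmetic, and the quantized_cell_clip computed earlier bounds it via LU_BOUNDED_RELU; that clip is simply lstm_params.cell_clip() quantized onto the cell state's QSYMM16 grid. Worked numbers, assuming a hypothetical power-of-two cell scale:

    #include <algorithm>
    #include <cmath>
    #include <cstdint>
    #include <cstdio>

    int main()
    {
        const float cell_scale = std::ldexp(1.f, -11); // hypothetical: cell state scale = 2^-11
        const float cell_clip  = 10.f;                 // illustrative lstm_params.cell_clip()

        const long    r = std::lround(cell_clip / cell_scale);
        const int16_t q = static_cast<int16_t>(std::min(32767L, std::max(-32768L, r)));

        // LU_BOUNDED_RELU with bounds (-q, q) then keeps c_t within +/-10 real units.
        std::printf("quantized_cell_clip = %d (~%f real units)\n", q, q * cell_scale);
        return 0;
    }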
- // TODO(COMPMID-3395): Perform multiplication in the quantized domain in NEPixelWiseMultiplicationKernel - _pixelwise_mul_forget_cell.configure(&_forget_gate, cell_state_in, &_forget_gate, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + // TODO(COMPMID-3395): Perform multiplication in the quantized domain in NEPixelWiseMultiplication + _pixelwise_mul_forget_cell.configure(&_forget_gate, cell_state_in, &_forget_gate, 1.f, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); const float cell_gate_scale = _cell_gate.info()->quantization_info().uniform().scale; const float mul_input_cell_scale = cell_gate_scale * std::pow(2, 15 + cell_shift); - const TensorInfo mul_input_cell_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(mul_input_cell_scale, 0)); + const TensorInfo mul_input_cell_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, + QuantizationInfo(mul_input_cell_scale, 0)); _memory_group.manage(&_mul_input_cell_res); _mul_input_cell_res.allocator()->init(mul_input_cell_info); - _pixelwise_mul_input_cell.configure(&_input_gate, &_cell_gate, &_mul_input_cell_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _pixelwise_mul_input_cell.configure(&_input_gate, &_cell_gate, &_mul_input_cell_res, 1.f, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); _cell_gate.allocator()->allocate(); _add_forget_cell.configure(&_forget_gate, &_mul_input_cell_res, cell_state_out, ConvertPolicy::SATURATE); _mul_input_cell_res.allocator()->allocate(); _forget_gate.allocator()->allocate(); - if(_has_cell_clipping) + if (_has_cell_clipping) { - _cell_clip.configure(cell_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_cell_clip, quantized_cell_clip)); + _cell_clip.configure(cell_state_out, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, + -quantized_cell_clip, quantized_cell_clip)); } // Output gate. 
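Both peephole paths (the forget gate above and the output gate below) rescale cell_state * cell_to_X_weights from the raw product scale to the gate's intermediate scale. The std::pow(2, cell_shift) factor in those expressions is the cell state's power-of-two scale, so the whole factor has the same w_scale * x_scale / intermediate form as the matmul rescales. A numeric sketch with hypothetical values:

    #include <cmath>
    #include <cstdio>

    int main()
    {
        const int    cell_shift   = -11;        // hypothetical: cell state scale = 2^-11
        const double w_scale      = 0.004;      // cell_to_output_weights scale (illustrative)
        const double intermediate = 1.0 / 4096; // output_intermediate_scale (illustrative)

        // cf. cell_to_forget_scale above and cell_to_output_scale below
        const double s = std::ldexp(1.0, cell_shift) * w_scale / intermediate;
        std::printf("cell-to-gate rescale factor = %.8f\n", s);
        return 0;
    }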
- const TensorInfo output_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.output_intermediate_scale(), 0)); - const float input_to_output_scale = input_to_output_weights->info()->quantization_info().uniform().scale * qinput.scale / lstm_params.output_intermediate_scale(); - configure_mm(_mm_input_to_output, _input_to_output_outstage, gemmlowp_info, - input, &_input_to_output_weights_transposed, &_input_to_output_eff_bias, - &_mm_input_to_output_res, &_input_to_output_outstage_res, input_to_output_scale, - mm_out_info, output_outstage_info); - - const float recurrent_to_output_scale = recurrent_to_output_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.output_intermediate_scale(); - configure_mm(_mm_recurrent_to_output, _recurrent_to_output_outstage, gemmlowp_info, - output_state_in, &_recurrent_to_output_weights_transposed, &_recurrent_to_output_eff_bias, - &_mm_recurrent_to_output_res, &_recurrent_to_output_outstage_res, recurrent_to_output_scale, - mm_out_info, output_outstage_info); - - _accumulate_input_recurrent_output.configure(&_recurrent_to_output_outstage_res, &_input_to_output_outstage_res, &_recurrent_to_output_outstage_res, ConvertPolicy::SATURATE); + const TensorInfo output_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.output_intermediate_scale(), 0)); + const float input_to_output_scale = input_to_output_weights->info()->quantization_info().uniform().scale * + qinput.scale / lstm_params.output_intermediate_scale(); + configure_mm(_mm_input_to_output, _input_to_output_outstage, gemmlowp_info, input, + &_input_to_output_weights_transposed, &_input_to_output_eff_bias, &_mm_input_to_output_res, + &_input_to_output_outstage_res, input_to_output_scale, mm_out_info, output_outstage_info); + + const float recurrent_to_output_scale = recurrent_to_output_weights->info()->quantization_info().uniform().scale * + qoutput_state_in.scale / lstm_params.output_intermediate_scale(); + configure_mm(_mm_recurrent_to_output, _recurrent_to_output_outstage, gemmlowp_info, output_state_in, + &_recurrent_to_output_weights_transposed, &_recurrent_to_output_eff_bias, &_mm_recurrent_to_output_res, + &_recurrent_to_output_outstage_res, recurrent_to_output_scale, mm_out_info, output_outstage_info); + + _accumulate_input_recurrent_output.configure(&_recurrent_to_output_outstage_res, &_input_to_output_outstage_res, + &_recurrent_to_output_outstage_res, ConvertPolicy::SATURATE); _input_to_output_outstage_res.allocator()->allocate(); - if(_has_peephole) + if (_has_peephole) { - // TODO(COMPMID-3395): Perform multiplication in the quantized domain in NEPixelWiseMultiplicationKernel + // TODO(COMPMID-3395): Perform multiplication in the quantized domain in NEPixelWiseMultiplication // Here we are not using the output stage because all operations are done in float _mul_cell_to_output_res.allocator()->init(TensorInfo(cell_state_out->info()->tensor_shape(), 1, DataType::S32)); _memory_group.manage(&_mul_cell_to_output_res); - _pixelwise_mul_cell_to_output.configure(cell_state_out, lstm_params.cell_to_output_weights(), &_mul_cell_to_output_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); - - const float cell_to_output_scale = std::pow(2, cell_shift) * lstm_params.cell_to_output_weights()->info()->quantization_info().uniform().scale / lstm_params.output_intermediate_scale(); - quantization::calculate_quantized_multiplier(cell_to_output_scale, 
&gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift); - _cell_to_output_outstage_res.allocator()->init(TensorInfo(_mul_cell_to_output_res.info()->tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.output_intermediate_scale(), 0))); + _pixelwise_mul_cell_to_output.configure(cell_state_out, lstm_params.cell_to_output_weights(), + &_mul_cell_to_output_res, 1.f, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); + + const float cell_to_output_scale = + std::pow(2, cell_shift) * + lstm_params.cell_to_output_weights()->info()->quantization_info().uniform().scale / + lstm_params.output_intermediate_scale(); + quantization::calculate_quantized_multiplier(cell_to_output_scale, &gemmlowp_info.gemmlowp_multiplier, + &gemmlowp_info.gemmlowp_shift); + _cell_to_output_outstage_res.allocator()->init( + TensorInfo(_mul_cell_to_output_res.info()->tensor_shape(), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.output_intermediate_scale(), 0))); _memory_group.manage(&_cell_to_output_outstage_res); - _cell_to_output_outstage.configure(&_mul_cell_to_output_res, nullptr, &_cell_to_output_outstage_res, gemmlowp_info); + _cell_to_output_outstage.configure(&_mul_cell_to_output_res, nullptr, &_cell_to_output_outstage_res, + gemmlowp_info); _mul_cell_to_output_res.allocator()->allocate(); - _accumulate_cell_to_output.configure(&_recurrent_to_output_outstage_res, &_cell_to_output_outstage_res, &_recurrent_to_output_outstage_res, ConvertPolicy::SATURATE); + _accumulate_cell_to_output.configure(&_recurrent_to_output_outstage_res, &_cell_to_output_outstage_res, + &_recurrent_to_output_outstage_res, ConvertPolicy::SATURATE); _cell_to_output_outstage_res.allocator()->allocate(); } Tensor *output_activation_input = &_recurrent_to_output_outstage_res; - if(_has_layer_norm) + if (_has_layer_norm) { configure_layer_norm(LayerNormGate::Output, output_activation_input); output_activation_input->allocator()->allocate(); @@ -417,20 +677,24 @@ void NEQLSTMLayer::configure(const ITensor *input, _memory_group.manage(&_output_gate); _output_gate.allocator()->init(output_gate_info); - _output_gate_sigmoid.configure(output_activation_input, &_output_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _output_gate_sigmoid.configure(output_activation_input, &_output_gate, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); output_activation_input->allocator()->allocate(); // Hidden. 
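The hidden block below multiplies two Q0.15 tensors (the output gate and tanh(c_t)), so the raw int32 product carries scale 2^-15 * 2^-15 = 2^-30; dividing by the target hidden-state scale gives exactly the hidden_state_scale expression in the code. A quick equivalence check (hidden scale illustrative):

    #include <cmath>
    #include <cstdio>

    int main()
    {
        const double hidden_scale = 0.007; // illustrative lstm_params.hidden_state_scale()

        // Product of two Q0.15 values has scale 2^-30 ...
        const double product_scale = std::ldexp(1.0, -15) * std::ldexp(1.0, -15);
        // ... so the requantization factor matches the code's pow(2,-15)/scale*pow(2,-15):
        const double rescale = std::ldexp(1.0, -15) / hidden_scale * std::ldexp(1.0, -15);

        std::printf("product scale %.3g, rescale %.3g (= %.3g / hidden scale)\n",
                    product_scale, rescale, product_scale);
        return 0;
    }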
- _hidden_tanh.configure(cell_state_out, &_input_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)); - // TODO(COMPMID-3395): Perform multiplication in the quantized domain in NEPixelWiseMultiplicationKernel + _hidden_tanh.configure(cell_state_out, &_input_gate, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)); + // TODO(COMPMID-3395): Perform multiplication in the quantized domain in NEPixelWiseMultiplication _memory_group.manage(&_hidden_mul_res); const TensorInfo hidden_mul_res(_input_gate.info()->tensor_shape(), 1, DataType::S32); _hidden_mul_res.allocator()->init(hidden_mul_res); - _pixelwise_mul_hidden.configure(&_output_gate, &_input_gate, &_hidden_mul_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _pixelwise_mul_hidden.configure(&_output_gate, &_input_gate, &_hidden_mul_res, 1.f, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); _output_gate.allocator()->allocate(); _input_gate.allocator()->allocate(); const float hidden_state_scale = std::pow(2, -15) / lstm_params.hidden_state_scale() * std::pow(2, -15); - quantization::calculate_quantized_multiplier(hidden_state_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift, /* ignore_epsilon */ true); + quantization::calculate_quantized_multiplier(hidden_state_scale, &gemmlowp_info.gemmlowp_multiplier, + &gemmlowp_info.gemmlowp_shift, /* ignore_epsilon */ true); gemmlowp_info.gemmlowp_offset = lstm_params.hidden_state_zero(); gemmlowp_info.output_data_type = output_state_in->info()->data_type(); @@ -439,7 +703,7 @@ void NEQLSTMLayer::configure(const ITensor *input, _memory_group.manage(&_hidden_gate); - if(_projection_tensor_copy_required) + if (_projection_tensor_copy_required) { _hidden_gate.allocator()->init(*output_state_out->info()); _hidden_gate.info()->set_tensor_shape(_hidden_mul_res.info()->tensor_shape()); @@ -450,59 +714,62 @@ void NEQLSTMLayer::configure(const ITensor *input, _hidden_mul_res.allocator()->allocate(); // Projection. 
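The projection block below chains the same ideas once more: a GEMMLowp rescale of qprojection.scale * hidden_state_scale / qoutput_state_in.scale into QASYMM8_SIGNED output bounds, plus an optional clip whose int8 value is the real clip divided by the projection scale and clamped to [-128, 127]. A numeric sketch with purely illustrative quantization parameters:

    #include <algorithm>
    #include <cmath>
    #include <cstdint>
    #include <cstdio>

    int main()
    {
        const double qprojection_scale = 0.005;
        const double hidden_scale      = 0.007;
        const double qoutput_scale     = 0.012;
        const float  projection_clip   = 0.5f; // illustrative lstm_params.projection_clip()

        // cf. projection_scale below
        const double projection_scale = qprojection_scale * hidden_scale / qoutput_scale;

        // cf. quantized_projection_clip = clamp<int8_t>(clip / qprojection.scale, -128, 127)
        const long   r     = std::lround(projection_clip / qprojection_scale);
        const int8_t qclip = static_cast<int8_t>(std::min(127L, std::max(-128L, r)));

        std::printf("projection rescale %.8f, quantized clip %d\n", projection_scale, qclip);
        return 0;
    }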
- if(_has_projection) + if (_has_projection) { const TensorInfo projection_outstage_info(*output_state_out->info()); - const UniformQuantizationInfo qprojection = _projection_weights->info()->quantization_info().uniform(); - const float projection_scale = qprojection.scale * lstm_params.hidden_state_scale() / qoutput_state_in.scale; - gemmlowp_info.gemmlowp_offset = qoutput_state_in.offset; - gemmlowp_info.gemmlowp_min_bound = std::numeric_limits<int8_t>::lowest(); - gemmlowp_info.gemmlowp_max_bound = std::numeric_limits<int8_t>::max(); - gemmlowp_info.output_data_type = DataType::QASYMM8_SIGNED; - - TensorInfo projection_mm_out_info{ mm_out_info }; + const UniformQuantizationInfo qprojection = _projection_weights->info()->quantization_info().uniform(); + const float projection_scale = qprojection.scale * lstm_params.hidden_state_scale() / qoutput_state_in.scale; + gemmlowp_info.gemmlowp_offset = qoutput_state_in.offset; + gemmlowp_info.gemmlowp_min_bound = std::numeric_limits<int8_t>::lowest(); + gemmlowp_info.gemmlowp_max_bound = std::numeric_limits<int8_t>::max(); + gemmlowp_info.output_data_type = DataType::QASYMM8_SIGNED; + + TensorInfo projection_mm_out_info{mm_out_info}; projection_mm_out_info.set_tensor_shape(TensorShape(output_size, batch_size)); - configure_mm(_mm_projection, _projection_outstage, gemmlowp_info, - hidden_gate_result, &_projection_weights_transposed, &_projection_eff_bias, - &_mm_projection_res, &_projection_outstage_res, projection_scale, - projection_mm_out_info, projection_outstage_info); + configure_mm(_mm_projection, _projection_outstage, gemmlowp_info, hidden_gate_result, + &_projection_weights_transposed, &_projection_eff_bias, &_mm_projection_res, + &_projection_outstage_res, projection_scale, projection_mm_out_info, projection_outstage_info); ITensor *accumulate_destination = output_state_out; - if(_projection_tensor_copy_required) + if (_projection_tensor_copy_required) { _hidden_gate.allocator()->allocate(); - _projection_accumulate_res.allocator()->init(*output_state_out->info()); + _projection_accumulate_res.allocator()->init(*output_state_in->info()); _projection_accumulate_res.info()->set_tensor_shape(_projection_outstage_res.info()->tensor_shape()); - _projection_output_to_accumulate_copy.configure(*output_state_out, _projection_accumulate_res); + _projection_output_to_accumulate_copy.configure(*output_state_in, _projection_accumulate_res); accumulate_destination = &_projection_accumulate_res; } - _accumulate_projection.configure(&_projection_outstage_res, accumulate_destination, accumulate_destination, ConvertPolicy::SATURATE); + _accumulate_projection.configure(&_projection_outstage_res, accumulate_destination, accumulate_destination, + ConvertPolicy::SATURATE); _projection_outstage_res.allocator()->allocate(); - if(_projection_tensor_copy_required) + if (_projection_tensor_copy_required) { _projection_accumulate_to_output_copy.configure(_projection_accumulate_res, *output_state_out); _projection_accumulate_res.allocator()->allocate(); } - int8_t quantized_projection_clip{ 0 }; - if(lstm_params.projection_clip() > 0.0f) + int8_t quantized_projection_clip{0}; + if (lstm_params.projection_clip() > 0.0f) { - quantized_projection_clip = utility::clamp<int8_t>(lstm_params.projection_clip() / qprojection.scale, -128, 127); + quantized_projection_clip = + utility::clamp<int8_t>(lstm_params.projection_clip() / qprojection.scale, -128, 127); } - if(quantized_projection_clip > 0) + if (quantized_projection_clip > 0) { - 
_projection_clip.configure(output_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_projection_clip, quantized_projection_clip)); + _projection_clip.configure(output_state_out, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, + -quantized_projection_clip, quantized_projection_clip)); _has_projection_clipping = true; } } else { - if(_projection_tensor_copy_required) + if (_projection_tensor_copy_required) { _hidden_to_output_copy.configure(_hidden_gate, *output_state_out); _hidden_gate.allocator()->allocate(); @@ -513,17 +780,27 @@ void NEQLSTMLayer::configure(const ITensor *input, _copy_output.configure(output_state_out, output); } -Status NEQLSTMLayer::validate(const ITensorInfo *input, - const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights, - const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights, - const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias, - const ITensorInfo *cell_state_in, const ITensorInfo *output_state_in, - const ITensorInfo *cell_state_out, const ITensorInfo *output_state_out, const ITensorInfo *output, +Status NEQLSTMLayer::validate(const ITensorInfo *input, + const ITensorInfo *input_to_forget_weights, + const ITensorInfo *input_to_cell_weights, + const ITensorInfo *input_to_output_weights, + const ITensorInfo *recurrent_to_forget_weights, + const ITensorInfo *recurrent_to_cell_weights, + const ITensorInfo *recurrent_to_output_weights, + const ITensorInfo *forget_gate_bias, + const ITensorInfo *cell_bias, + const ITensorInfo *output_gate_bias, + const ITensorInfo *cell_state_in, + const ITensorInfo *output_state_in, + const ITensorInfo *cell_state_out, + const ITensorInfo *output_state_out, + const ITensorInfo *output, const LSTMParams<ITensorInfo> &lstm_params) { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, - recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, - cell_state_out, output_state_out, output); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, + recurrent_to_forget_weights, recurrent_to_cell_weights, + recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, + cell_state_in, output_state_in, cell_state_out, output_state_out, output); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8_SIGNED); ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() != 2, "Input must have exactly 2 dimensions"); @@ -535,14 +812,28 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input, ARM_COMPUTE_RETURN_ERROR_ON(input_to_output_weights->num_dimensions() != 2); ARM_COMPUTE_RETURN_ERROR_ON(input_to_output_weights->dimension(0) != input_size); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input_to_output_weights, input_to_forget_weights, input_to_cell_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input_to_output_weights, input_to_forget_weights, + input_to_cell_weights); ARM_COMPUTE_RETURN_ERROR_ON(recurrent_to_output_weights->num_dimensions() != 2); ARM_COMPUTE_RETURN_ERROR_ON(recurrent_to_output_weights->dimension(1) != num_units); - 
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(recurrent_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_to_forget_weights, 1, DataType::QSYMM8); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_to_forget_weights, input_to_cell_weights, input_to_output_weights, - recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(recurrent_to_output_weights, recurrent_to_forget_weights, + recurrent_to_cell_weights); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_to_forget_weights, 1, DataType::QASYMM8_SIGNED, + DataType::QSYMM8); + // If the input_to_forget_weights data type is DataType::QSYMM8 then it can never match the other weights as they are all DataType::QASYMM8_SIGNED + if (input_to_forget_weights->data_type() == DataType::QSYMM8) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_to_cell_weights, input_to_output_weights, + recurrent_to_forget_weights, recurrent_to_cell_weights, + recurrent_to_output_weights); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_to_forget_weights, input_to_cell_weights, + input_to_output_weights, recurrent_to_forget_weights, + recurrent_to_cell_weights, recurrent_to_output_weights); + } ARM_COMPUTE_RETURN_ERROR_ON(forget_gate_bias->num_dimensions() != 1); ARM_COMPUTE_RETURN_ERROR_ON(forget_gate_bias->dimension(0) != num_units); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(forget_gate_bias, cell_bias, output_gate_bias); @@ -560,20 +851,25 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input, ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output_state_in); // Check whether peephole weights are all there or none - if(lstm_params.has_peephole_opt()) + if (lstm_params.has_peephole_opt()) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_output_weights()); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_forget_weights(), 1, DataType::QSYMM16); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_forget_weights(), 1, + DataType::QSYMM16); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_forget_weights()->num_dimensions() != 1); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_forget_weights()->dimension(0) != num_units); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_output_weights()); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_output_weights()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lstm_params.cell_to_forget_weights(), + lstm_params.cell_to_output_weights()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(lstm_params.cell_to_forget_weights(), + lstm_params.cell_to_output_weights()); - if(!lstm_params.has_cifg_opt()) + if (!lstm_params.has_cifg_opt()) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_input_weights()); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_input_weights()); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_input_weights()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lstm_params.cell_to_forget_weights(), + lstm_params.cell_to_input_weights()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(lstm_params.cell_to_forget_weights(), + 
lstm_params.cell_to_input_weights()); } } @@ -587,7 +883,7 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input, // Calculate quantized parameters for clipping. int16_t quantized_cell_clip = 0; - if(lstm_params.cell_clip() > 0.0f) + if (lstm_params.cell_clip() > 0.0f) { quantized_cell_clip = quantize_qsymm16(lstm_params.cell_clip(), qcell_state_in); } @@ -595,44 +891,90 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input, // Precompute effective bias for optimizing the matmul computations. const TensorInfo eff_bias_info(TensorShape(num_units), 1, DataType::S32); const TensorInfo projection_eff_bias_info(TensorShape(output_size), 1, DataType::S32); - if(!lstm_params.has_cifg_opt()) + if (!lstm_params.has_cifg_opt()) { - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(lstm_params.input_to_input_weights(), &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true))); - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(lstm_params.recurrent_to_input_weights(), &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, - true))); + ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate( + lstm_params.input_to_input_weights(), &eff_bias_info, + GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true))); + ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate( + lstm_params.recurrent_to_input_weights(), &eff_bias_info, + GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true))); } - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(input_to_forget_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true))); - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(recurrent_to_forget_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true))); - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(input_to_cell_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true))); - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(recurrent_to_cell_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true))); - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(input_to_output_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true))); - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(recurrent_to_output_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true))); - if(lstm_params.has_projection()) + ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate( + input_to_forget_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true))); + ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate( + recurrent_to_forget_weights, &eff_bias_info, + GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true))); + ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate( + input_to_cell_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true))); + ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate( + recurrent_to_cell_weights, &eff_bias_info, + 
GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true))); + ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate( + input_to_output_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true))); + ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate( + recurrent_to_output_weights, &eff_bias_info, + GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true))); + if (lstm_params.has_projection()) { - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(lstm_params.projection_weights(), &projection_eff_bias_info, GEMMLowpReductionKernelInfo(output_size, false, - lstm_params.hidden_state_zero(), - true))); + ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate( + lstm_params.projection_weights(), &projection_eff_bias_info, + GEMMLowpReductionKernelInfo(output_size, false, lstm_params.hidden_state_zero(), true))); + if (lstm_params.projection_bias() != nullptr) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.projection_bias(), 1, DataType::S32); + ARM_COMPUTE_RETURN_ON_ERROR( + NEArithmeticAddition::validate(lstm_params.projection_bias(), &projection_eff_bias_info, + &projection_eff_bias_info, ConvertPolicy::SATURATE)); + } } - const TensorInfo input_weights_transposed(TensorShape(num_units, input_size), 1, input_to_forget_weights->data_type(), input_to_forget_weights->quantization_info()); - const TensorInfo recurrent_weights_transposed(TensorShape(num_units, output_size), 1, recurrent_to_forget_weights->data_type(), recurrent_to_forget_weights->quantization_info()); + const TensorInfo input_weights_transposed(TensorShape(num_units, input_size), 1, input_to_cell_weights->data_type(), + input_to_cell_weights->quantization_info()); + const TensorInfo input_to_output_weights_transposed(TensorShape(num_units, input_size), 1, + input_to_output_weights->data_type(), + input_to_output_weights->quantization_info()); + const TensorInfo recurrent_to_forget_weights_transposed(TensorShape(num_units, output_size), 1, + recurrent_to_forget_weights->data_type(), + recurrent_to_forget_weights->quantization_info()); + const TensorInfo recurrent_to_cell_weights_transposed(TensorShape(num_units, output_size), 1, + recurrent_to_cell_weights->data_type(), + recurrent_to_cell_weights->quantization_info()); + const TensorInfo recurrent_to_output_weights_transposed(TensorShape(num_units, output_size), 1, + recurrent_to_output_weights->data_type(), + recurrent_to_output_weights->quantization_info()); + const TensorInfo recurrent_weights_transposed(TensorShape(num_units, output_size), 1, + recurrent_to_forget_weights->data_type(), + recurrent_to_forget_weights->quantization_info()); - // Validate weights transpose - ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(input_to_forget_weights, &input_weights_transposed)); ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(input_to_cell_weights, &input_weights_transposed)); - ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(input_to_output_weights, &input_weights_transposed)); - ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(recurrent_to_forget_weights, &recurrent_weights_transposed)); - ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(recurrent_to_cell_weights, &recurrent_weights_transposed)); - ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(recurrent_to_output_weights, &recurrent_weights_transposed)); - if(!lstm_params.has_cifg_opt()) + 
ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(input_to_output_weights, &input_to_output_weights_transposed)); + ARM_COMPUTE_RETURN_ON_ERROR( + NETranspose::validate(recurrent_to_forget_weights, &recurrent_to_forget_weights_transposed)); + ARM_COMPUTE_RETURN_ON_ERROR( + NETranspose::validate(recurrent_to_cell_weights, &recurrent_to_cell_weights_transposed)); + ARM_COMPUTE_RETURN_ON_ERROR( + NETranspose::validate(recurrent_to_output_weights, &recurrent_to_output_weights_transposed)); + if (!lstm_params.has_cifg_opt()) { - ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(lstm_params.input_to_input_weights(), &input_weights_transposed)); - ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(lstm_params.recurrent_to_input_weights(), &recurrent_weights_transposed)); + const TensorInfo recurrent_to_input_weights_transposed( + TensorShape(num_units, output_size), 1, recurrent_to_forget_weights->data_type(), + lstm_params.recurrent_to_input_weights()->quantization_info()); + const TensorInfo input_to_input_weights_transposed(TensorShape(num_units, input_size), 1, + lstm_params.input_to_input_weights()->data_type(), + lstm_params.input_to_input_weights()->quantization_info()); + ARM_COMPUTE_RETURN_ON_ERROR( + NETranspose::validate(lstm_params.input_to_input_weights(), &input_to_input_weights_transposed)); + ARM_COMPUTE_RETURN_ON_ERROR( + NETranspose::validate(lstm_params.recurrent_to_input_weights(), &recurrent_to_input_weights_transposed)); } - if(lstm_params.has_projection()) + if (lstm_params.has_projection()) { - const TensorInfo projection_weights_transposed(TensorShape(output_size, num_units), 1, lstm_params.projection_weights()->data_type(), lstm_params.projection_weights()->quantization_info()); - ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(lstm_params.projection_weights(), &projection_weights_transposed)); + const TensorInfo projection_weights_transposed(TensorShape(output_size, num_units), 1, + lstm_params.projection_weights()->data_type(), + lstm_params.projection_weights()->quantization_info()); + ARM_COMPUTE_RETURN_ON_ERROR( + NETranspose::validate(lstm_params.projection_weights(), &projection_weights_transposed)); } GEMMLowpOutputStageInfo gemmlowp_info; @@ -645,28 +987,42 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input, // Forget gate. 
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.forget_intermediate_scale() == 0); - const TensorInfo forget_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.forget_intermediate_scale(), 0)); + const TensorInfo forget_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.forget_intermediate_scale(), 0)); const TensorInfo mm_out_info(TensorShape(num_units, batch_size), 1, DataType::S32); - const float input_to_forget_scale = input_to_forget_weights->quantization_info().uniform().scale * qinput.scale / lstm_params.forget_intermediate_scale(); - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, input_to_forget_scale, &mm_out_info, &forget_outstage_info)); + const float input_to_forget_scale = input_to_forget_weights->quantization_info().uniform().scale * qinput.scale / + lstm_params.forget_intermediate_scale(); + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, + input_to_forget_scale, &mm_out_info, &forget_outstage_info)); - const float recurrent_to_forget_scale = recurrent_to_forget_weights->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.forget_intermediate_scale(); - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, &eff_bias_info, recurrent_to_forget_scale, &mm_out_info, &forget_outstage_info)); + const float recurrent_to_forget_scale = recurrent_to_forget_weights->quantization_info().uniform().scale * + qoutput_state_in.scale / lstm_params.forget_intermediate_scale(); + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, + &eff_bias_info, recurrent_to_forget_scale, &mm_out_info, + &forget_outstage_info)); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAdditionKernel::validate(&forget_outstage_info, &forget_outstage_info, &forget_outstage_info, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&forget_outstage_info, &forget_outstage_info, + &forget_outstage_info, ConvertPolicy::SATURATE)); - if(lstm_params.has_peephole_opt()) + if (lstm_params.has_peephole_opt()) { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_forget_weights(), 1, DataType::QSYMM16); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &mm_out_info, 1.f, ConvertPolicy::SATURATE, - RoundingPolicy::TO_ZERO)); - const float cell_to_forget_scale = std::pow(2, cell_shift) * lstm_params.cell_to_forget_weights()->quantization_info().uniform().scale / lstm_params.forget_intermediate_scale(); - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(cell_to_forget_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift)); - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOutputStage::validate(&mm_out_info, nullptr, &forget_outstage_info, gemmlowp_info)); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAdditionKernel::validate(&forget_outstage_info, &forget_outstage_info, &forget_outstage_info, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_forget_weights(), 1, + DataType::QSYMM16); + ARM_COMPUTE_RETURN_ON_ERROR( + NEPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &mm_out_info, 1.f, + ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + const float 
cell_to_forget_scale = std::pow(2, cell_shift) * + lstm_params.cell_to_forget_weights()->quantization_info().uniform().scale / + lstm_params.forget_intermediate_scale(); + ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier( + cell_to_forget_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEGEMMLowpOutputStage::validate(&mm_out_info, nullptr, &forget_outstage_info, gemmlowp_info)); + ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&forget_outstage_info, &forget_outstage_info, + &forget_outstage_info, ConvertPolicy::SATURATE)); } - if(has_layer_norm) + if (has_layer_norm) { const ITensorInfo *w_info = lstm_params.forget_layer_norm_weights(); const ITensorInfo *b_info = forget_gate_bias; @@ -675,22 +1031,31 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input, // Output quantization info of Sigmoid and Tanh activations const QuantizationInfo sigmoid_tanh_outqinfo(1.f / 32768.f, 0); - const TensorInfo forget_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo); + const TensorInfo forget_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo); - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&forget_outstage_info, &forget_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); + ARM_COMPUTE_RETURN_ON_ERROR( + NEActivationLayer::validate(&forget_outstage_info, &forget_gate_info, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); // Modulation gate. ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_intermediate_scale() == 0); - const TensorInfo cell_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.cell_intermediate_scale(), 0)); - const float input_to_cell_scale = input_to_cell_weights->quantization_info().uniform().scale * qinput.scale / lstm_params.cell_intermediate_scale(); - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, input_to_cell_scale, &mm_out_info, &cell_outstage_info)); - - const float recurrent_to_cell_scale = recurrent_to_cell_weights->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.cell_intermediate_scale(); - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, &eff_bias_info, recurrent_to_cell_scale, &mm_out_info, &cell_outstage_info)); - - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAdditionKernel::validate(&cell_outstage_info, &cell_outstage_info, &cell_outstage_info, ConvertPolicy::SATURATE)); - - if(has_layer_norm) + const TensorInfo cell_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.cell_intermediate_scale(), 0)); + const float input_to_cell_scale = input_to_cell_weights->quantization_info().uniform().scale * qinput.scale / + lstm_params.cell_intermediate_scale(); + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, + input_to_cell_scale, &mm_out_info, &cell_outstage_info)); + + const float recurrent_to_cell_scale = recurrent_to_cell_weights->quantization_info().uniform().scale * + qoutput_state_in.scale / lstm_params.cell_intermediate_scale(); + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, + &eff_bias_info, recurrent_to_cell_scale, &mm_out_info, + &cell_outstage_info)); + + 
ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&cell_outstage_info, &cell_outstage_info, + &cell_outstage_info, ConvertPolicy::SATURATE)); + + if (has_layer_norm) { const ITensorInfo *w_info = lstm_params.cell_layer_norm_weights(); const ITensorInfo *b_info = cell_bias; @@ -698,85 +1063,134 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input, } const TensorInfo cell_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo); - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&cell_outstage_info, &cell_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f))); + ARM_COMPUTE_RETURN_ON_ERROR( + NEActivationLayer::validate(&cell_outstage_info, &cell_gate_info, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f))); // Input gate. const TensorInfo input_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo); - if(lstm_params.has_cifg_opt()) + if (lstm_params.has_cifg_opt()) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(lstm_params.input_gate_bias() != nullptr, "Input gate bias must not be present when CIFG is used"); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticSubtractionKernel::validate(&input_gate_info, &forget_gate_info, &forget_gate_info, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(lstm_params.input_gate_bias() != nullptr, + "Input gate bias must not be present when CIFG is used"); + ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticSubtraction::validate(&input_gate_info, &forget_gate_info, + &forget_gate_info, ConvertPolicy::SATURATE)); } else { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.input_to_input_weights(), lstm_params.recurrent_to_input_weights(), lstm_params.input_gate_bias()); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_to_forget_weights, lstm_params.input_to_input_weights(), lstm_params.recurrent_to_input_weights()); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.input_to_input_weights(), + lstm_params.recurrent_to_input_weights(), lstm_params.input_gate_bias()); + + // If the input_to_forget_weights data type is DataType::QSYMM8 then it can never match the other weights as they are all DataType::QASYMM8_SIGNED + if (input_to_forget_weights->data_type() == DataType::QSYMM8) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lstm_params.input_to_input_weights(), + lstm_params.recurrent_to_input_weights()); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_to_forget_weights, + lstm_params.input_to_input_weights(), + lstm_params.recurrent_to_input_weights()); + } ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input_to_forget_weights, lstm_params.input_to_input_weights()); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(recurrent_to_forget_weights, lstm_params.recurrent_to_input_weights()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(recurrent_to_forget_weights, + lstm_params.recurrent_to_input_weights()); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(forget_gate_bias, lstm_params.input_gate_bias()); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(forget_gate_bias, lstm_params.input_gate_bias()); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_intermediate_scale() == 0); - const TensorInfo input_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.input_intermediate_scale(), 0)); - const float input_to_input_scale = lstm_params.input_to_input_weights()->quantization_info().uniform().scale * qinput.scale / 
lstm_params.input_intermediate_scale(); - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, input_to_input_scale, &mm_out_info, &input_outstage_info)); - - const float recurrent_to_input_scale = lstm_params.recurrent_to_input_weights()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.input_intermediate_scale(); - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, &eff_bias_info, recurrent_to_input_scale, &mm_out_info, &input_outstage_info)); - - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAdditionKernel::validate(&input_outstage_info, &input_outstage_info, &input_outstage_info, ConvertPolicy::SATURATE)); - - if(lstm_params.has_peephole_opt()) + const TensorInfo input_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.input_intermediate_scale(), 0)); + const float input_to_input_scale = lstm_params.input_to_input_weights()->quantization_info().uniform().scale * + qinput.scale / lstm_params.input_intermediate_scale(); + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, + input_to_input_scale, &mm_out_info, &input_outstage_info)); + + const float recurrent_to_input_scale = + lstm_params.recurrent_to_input_weights()->quantization_info().uniform().scale * qoutput_state_in.scale / + lstm_params.input_intermediate_scale(); + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, + &eff_bias_info, recurrent_to_input_scale, &mm_out_info, + &input_outstage_info)); + + ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&input_outstage_info, &input_outstage_info, + &input_outstage_info, ConvertPolicy::SATURATE)); + + if (lstm_params.has_peephole_opt()) { - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(cell_state_in, lstm_params.cell_to_input_weights(), &mm_out_info, 1.f, ConvertPolicy::SATURATE, - RoundingPolicy::TO_ZERO)); - const float cell_to_input_scale = std::pow(2, cell_shift) * lstm_params.cell_to_input_weights()->quantization_info().uniform().scale / lstm_params.input_intermediate_scale(); - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(cell_to_input_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift)); - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOutputStage::validate(&mm_out_info, &eff_bias_info, &input_outstage_info, gemmlowp_info)); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAdditionKernel::validate(&input_outstage_info, &input_outstage_info, &input_outstage_info, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_input_weights(), &mm_out_info, + 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + const float cell_to_input_scale = std::pow(2, cell_shift) * + lstm_params.cell_to_input_weights()->quantization_info().uniform().scale / + lstm_params.input_intermediate_scale(); + ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier( + cell_to_input_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEGEMMLowpOutputStage::validate(&mm_out_info, &eff_bias_info, &input_outstage_info, gemmlowp_info)); + ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&input_outstage_info, &input_outstage_info, + &input_outstage_info, ConvertPolicy::SATURATE)); } - if(has_layer_norm) + if 
(has_layer_norm) { const ITensorInfo *w_info = lstm_params.input_layer_norm_weights(); const ITensorInfo *b_info = lstm_params.input_gate_bias(); ARM_COMPUTE_RETURN_ON_ERROR(validate_layer_norm(input_outstage_info, *w_info, *b_info)); } - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&input_outstage_info, &input_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f))); + ARM_COMPUTE_RETURN_ON_ERROR( + NEActivationLayer::validate(&input_outstage_info, &input_gate_info, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f))); } // Cell. - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(&forget_gate_info, cell_state_in, &forget_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(&input_gate_info, cell_state_in, &cell_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAdditionKernel::validate(&forget_gate_info, &cell_gate_info, cell_state_out, ConvertPolicy::SATURATE)); - if(quantized_cell_clip > 0) + ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate( + &forget_gate_info, cell_state_in, &forget_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate( + &input_gate_info, cell_state_in, &cell_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEArithmeticAddition::validate(&forget_gate_info, &cell_gate_info, cell_state_out, ConvertPolicy::SATURATE)); + if (quantized_cell_clip > 0) { - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(cell_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_cell_clip, - quantized_cell_clip))); + ARM_COMPUTE_RETURN_ON_ERROR( + NEActivationLayer::validate(cell_state_out, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, + -quantized_cell_clip, quantized_cell_clip))); } // Output gate. 
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.output_intermediate_scale() == 0); - const TensorInfo output_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.output_intermediate_scale(), 0)); - const float input_to_output_scale = input_to_output_weights->quantization_info().uniform().scale * qinput.scale / lstm_params.output_intermediate_scale(); - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, input_to_output_scale, &mm_out_info, &output_outstage_info)); - - const float recurrent_to_output_scale = recurrent_to_output_weights->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.output_intermediate_scale(); - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, &eff_bias_info, recurrent_to_output_scale, &mm_out_info, &output_outstage_info)); - - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAdditionKernel::validate(&output_outstage_info, &output_outstage_info, &output_outstage_info, ConvertPolicy::SATURATE)); - if(lstm_params.has_peephole_opt()) + const TensorInfo output_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.output_intermediate_scale(), 0)); + const float input_to_output_scale = input_to_output_weights->quantization_info().uniform().scale * qinput.scale / + lstm_params.output_intermediate_scale(); + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, + input_to_output_scale, &mm_out_info, &output_outstage_info)); + + const float recurrent_to_output_scale = recurrent_to_output_weights->quantization_info().uniform().scale * + qoutput_state_in.scale / lstm_params.output_intermediate_scale(); + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, + &eff_bias_info, recurrent_to_output_scale, &mm_out_info, + &output_outstage_info)); + + ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&output_outstage_info, &output_outstage_info, + &output_outstage_info, ConvertPolicy::SATURATE)); + if (lstm_params.has_peephole_opt()) { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_output_weights(), 1, DataType::QSYMM16); - // TODO(COMPMID-3395): Perform multiplication in the quantized domain in NEPixelWiseMultiplicationKernel + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_output_weights(), 1, + DataType::QSYMM16); + // TODO(COMPMID-3395): Perform multiplication in the quantized domain in NEPixelWiseMultiplication // Here we are not using the output stage because all operations are done in float // const float cell_to_output_scale = std::pow(2, cell_shift) * lstm_params.cell_to_output_weights()->quantization_info().uniform().scale / lstm_params.output_intermediate_scale(); // ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(cell_to_output_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift)); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(cell_state_out, lstm_params.cell_to_output_weights(), &output_outstage_info, 1.f, ConvertPolicy::SATURATE, - RoundingPolicy::TO_ZERO)); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAdditionKernel::validate(&output_outstage_info, &output_outstage_info, &output_outstage_info, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate( + cell_state_out, 
lstm_params.cell_to_output_weights(), &output_outstage_info, 1.f, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&output_outstage_info, &output_outstage_info, + &output_outstage_info, ConvertPolicy::SATURATE)); } - if(has_layer_norm) + if (has_layer_norm) { const ITensorInfo *w_info = lstm_params.output_layer_norm_weights(); const ITensorInfo *b_info = output_gate_bias; @@ -784,91 +1198,109 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input, } const TensorInfo output_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo); - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&output_outstage_info, &output_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); + ARM_COMPUTE_RETURN_ON_ERROR( + NEActivationLayer::validate(&output_outstage_info, &output_gate_info, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); // Hidden. - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(cell_state_out, &input_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f))); + ARM_COMPUTE_RETURN_ON_ERROR( + NEActivationLayer::validate(cell_state_out, &input_gate_info, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f))); const TensorInfo hidden_mul_res(TensorShape(num_units, batch_size), 1, DataType::S32); const TensorInfo hidden_out_info(TensorShape(num_units, batch_size), 1, DataType::QASYMM8_SIGNED); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(&output_gate_info, &input_gate_info, &hidden_mul_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate( + &output_gate_info, &input_gate_info, &hidden_mul_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.hidden_state_scale() == 0); const float hidden_state_scale = std::pow(2, -15) / lstm_params.hidden_state_scale() * std::pow(2, -15); - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(hidden_state_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift, /* ignore_epsilon */ true)); - gemmlowp_info.gemmlowp_offset = lstm_params.hidden_state_zero(); - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOutputStage::validate(&hidden_mul_res, nullptr, &hidden_out_info, gemmlowp_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + quantization::calculate_quantized_multiplier(hidden_state_scale, &gemmlowp_info.gemmlowp_multiplier, + &gemmlowp_info.gemmlowp_shift, /* ignore_epsilon */ true)); + gemmlowp_info.gemmlowp_offset = lstm_params.hidden_state_zero(); + gemmlowp_info.output_data_type = hidden_out_info.data_type(); + ARM_COMPUTE_RETURN_ON_ERROR( + NEGEMMLowpOutputStage::validate(&hidden_mul_res, nullptr, &hidden_out_info, gemmlowp_info)); const bool projection_tensor_copy_required = num_units != output_size; // Projection. 
- if(lstm_params.has_projection()) + if (lstm_params.has_projection()) { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(recurrent_to_forget_weights, lstm_params.projection_weights()); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(forget_gate_bias, lstm_params.projection_bias()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(recurrent_to_forget_weights, + lstm_params.projection_weights()); ARM_COMPUTE_RETURN_ERROR_ON(qoutput_state_in.scale == 0); - const UniformQuantizationInfo qprojection = lstm_params.projection_weights()->quantization_info().uniform(); - const float projection_scale = qprojection.scale * lstm_params.hidden_state_scale() / qoutput_state_in.scale; - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(projection_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift)); + const UniformQuantizationInfo qprojection = lstm_params.projection_weights()->quantization_info().uniform(); + const float projection_scale = qprojection.scale * lstm_params.hidden_state_scale() / qoutput_state_in.scale; + ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier( + projection_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift)); gemmlowp_info.gemmlowp_offset = qoutput_state_in.offset; gemmlowp_info.gemmlowp_min_bound = std::numeric_limits<int8_t>::lowest(); gemmlowp_info.gemmlowp_max_bound = std::numeric_limits<int8_t>::max(); gemmlowp_info.output_data_type = DataType::QASYMM8_SIGNED; const TensorInfo projection_outstage_info(*output_state_out); - const TensorInfo projection_weights_transposed(TensorShape(output_size, num_units), 1, lstm_params.projection_weights()->data_type(), lstm_params.projection_weights()->quantization_info()); + const TensorInfo projection_weights_transposed(TensorShape(output_size, num_units), 1, + lstm_params.projection_weights()->data_type(), + lstm_params.projection_weights()->quantization_info()); - TensorInfo projection_mm_out_info{ mm_out_info }; + TensorInfo projection_mm_out_info{mm_out_info}; projection_mm_out_info.set_tensor_shape(TensorShape(output_size, batch_size)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, &hidden_out_info, &projection_weights_transposed, &projection_eff_bias_info, projection_scale, &projection_mm_out_info, + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, &hidden_out_info, &projection_weights_transposed, + &projection_eff_bias_info, projection_scale, &projection_mm_out_info, &projection_outstage_info)); - if(projection_tensor_copy_required) + if (projection_tensor_copy_required) { - ARM_COMPUTE_RETURN_ON_ERROR(NEQLSTMLayer::TensorCopyKernel::validate(*output_state_out, projection_outstage_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEQLSTMLayer::TensorCopyKernel::validate(*output_state_in, projection_outstage_info)); } - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAdditionKernel::validate(output_state_out, output_state_out, output_state_out, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(output_state_out, output_state_out, output_state_out, + ConvertPolicy::SATURATE)); - if(projection_tensor_copy_required) + if (projection_tensor_copy_required) { - ARM_COMPUTE_RETURN_ON_ERROR(NEQLSTMLayer::TensorCopyKernel::validate(projection_outstage_info, *output_state_out)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEQLSTMLayer::TensorCopyKernel::validate(projection_outstage_info, *output_state_out)); } - int8_t quantized_projection_clip{ 0 }; - if(lstm_params.projection_clip() > 0.0f) + int8_t 
quantized_projection_clip{0}; + if (lstm_params.projection_clip() > 0.0f) { quantized_projection_clip = quantize_qasymm8_signed(lstm_params.projection_clip(), qprojection); } - if(quantized_projection_clip > 0) + if (quantized_projection_clip > 0) { - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(output_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_projection_clip, - quantized_projection_clip))); + ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate( + output_state_out, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, + -quantized_projection_clip, quantized_projection_clip))); } } else { - if(projection_tensor_copy_required) + if (projection_tensor_copy_required) { ARM_COMPUTE_RETURN_ON_ERROR(NEQLSTMLayer::TensorCopyKernel::validate(hidden_out_info, *output_state_out)); } } - if(cell_state_out->total_size() > 0) + if (cell_state_out->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(cell_state_in, cell_state_out); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(cell_state_in, cell_state_out); } - if(output_state_out->total_size() > 0) + if (output_state_out->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output_state_out); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output_state_in, output_state_out); } - ARM_COMPUTE_RETURN_ON_ERROR(NECopyKernel::validate(output_state_out, output)); + ARM_COMPUTE_RETURN_ON_ERROR(NECopy::validate(output_state_out, output)); return Status{}; } @@ -885,18 +1317,18 @@ void NEQLSTMLayer::run() _mm_recurrent_to_forget.run(); _recurrent_to_forget_outstage.run(); - NEScheduler::get().schedule(&_accumulate_input_recurrent_forget, Window::DimY); + _accumulate_input_recurrent_forget.run(); - if(_has_peephole) + if (_has_peephole) { - NEScheduler::get().schedule(&_pixelwise_mul_cell_to_forget, Window::DimY); + _pixelwise_mul_cell_to_forget.run(); _cell_to_forget_outstage.run(); - NEScheduler::get().schedule(&_accumulate_cell_forget, Window::DimY); + _accumulate_cell_forget.run(); } - if(_has_layer_norm) + if (_has_layer_norm) { - NEScheduler::get().schedule(&get_layer_norm(LayerNormGate::Forget), Window::DimY); + NEScheduler::get().schedule(get_layer_norm(LayerNormGate::Forget).get(), Window::DimY); } _forget_gate_sigmoid.run(); @@ -907,19 +1339,19 @@ void NEQLSTMLayer::run() _mm_recurrent_to_cell.run(); _recurrent_to_cell_outstage.run(); - NEScheduler::get().schedule(&_accumulate_input_recurrent_modulation, Window::DimY); + _accumulate_input_recurrent_modulation.run(); - if(_has_layer_norm) + if (_has_layer_norm) { - NEScheduler::get().schedule(&get_layer_norm(LayerNormGate::Cell), Window::DimY); + NEScheduler::get().schedule(get_layer_norm(LayerNormGate::Cell).get(), Window::DimY); } _cell_gate_tanh.run(); // Input gate - if(_has_cifg) + if (_has_cifg) { - NEScheduler::get().schedule(&_input_gate_sub, Window::DimY); + _input_gate_sub.run(); } else { @@ -927,28 +1359,29 @@ void NEQLSTMLayer::run() _input_to_input_outstage.run(); _mm_recurrent_to_input.run(); _recurrent_to_input_outstage.run(); - NEScheduler::get().schedule(&_accumulate_input_recurrent_input, Window::DimY); + _accumulate_input_recurrent_input.run(); - if(_has_peephole) + if (_has_peephole) { - NEScheduler::get().schedule(&_pixelwise_mul_cell_to_input, Window::DimY); + _pixelwise_mul_cell_to_input.run(); _cell_to_input_outstage.run(); - NEScheduler::get().schedule(&_accumulate_cell_input, Window::DimY); + _accumulate_cell_input.run(); 
} - if(_has_layer_norm) + if (_has_layer_norm) { - NEScheduler::get().schedule(&get_layer_norm(LayerNormGate::Input), Window::DimY); + NEScheduler::get().schedule(get_layer_norm(LayerNormGate::Input).get(), Window::DimY); } _input_gate_sigmoid.run(); } // Cell. - NEScheduler::get().schedule(&_pixelwise_mul_forget_cell, Window::DimY); - NEScheduler::get().schedule(&_pixelwise_mul_input_cell, Window::DimY); - NEScheduler::get().schedule(&_add_forget_cell, Window::DimY); - if(_has_cell_clipping) + _pixelwise_mul_forget_cell.run(); + _pixelwise_mul_input_cell.run(); + _add_forget_cell.run(); + + if (_has_cell_clipping) { _cell_clip.run(); } @@ -958,65 +1391,73 @@ void NEQLSTMLayer::run() _input_to_output_outstage.run(); _mm_recurrent_to_output.run(); _recurrent_to_output_outstage.run(); - NEScheduler::get().schedule(&_accumulate_input_recurrent_output, Window::DimY); - if(_has_peephole) + _accumulate_input_recurrent_output.run(); + if (_has_peephole) { - NEScheduler::get().schedule(&_pixelwise_mul_cell_to_output, Window::DimY); + _pixelwise_mul_cell_to_output.run(); _cell_to_output_outstage.run(); - NEScheduler::get().schedule(&_accumulate_cell_to_output, Window::DimY); + _accumulate_cell_to_output.run(); } - if(_has_layer_norm) + if (_has_layer_norm) { - NEScheduler::get().schedule(&get_layer_norm(LayerNormGate::Output), Window::DimY); + NEScheduler::get().schedule(get_layer_norm(LayerNormGate::Output).get(), Window::DimY); } _output_gate_sigmoid.run(); // Hidden. _hidden_tanh.run(); - NEScheduler::get().schedule(&_pixelwise_mul_hidden, Window::DimY); + _pixelwise_mul_hidden.run(); _hidden_outstage.run(); // Projection. - if(_has_projection) + if (_has_projection) { _mm_projection.run(); _projection_outstage.run(); - if(_projection_tensor_copy_required) + if (_projection_tensor_copy_required) { _projection_output_to_accumulate_copy.run(); } - NEScheduler::get().schedule(&_accumulate_projection, Window::DimY); + _accumulate_projection.run(); - if(_projection_tensor_copy_required) + if (_projection_tensor_copy_required) { _projection_accumulate_to_output_copy.run(); } - if(_has_projection_clipping) + if (_has_projection_clipping) { _projection_clip.run(); } } else { - if(_projection_tensor_copy_required) + if (_projection_tensor_copy_required) { _hidden_to_output_copy.run(); } } // Copy output_state_out to output - NEScheduler::get().schedule(&_copy_output, Window::DimY); + _copy_output.run(); } void NEQLSTMLayer::prepare() { - if(!_is_prepared) + if (!_is_prepared) { + if (_convert_input_to_forget_weights_to_qsymm8) + { + _input_to_forget_weights_f32.allocator()->allocate(); + _input_to_forget_weights_symm8.allocator()->allocate(); + _dequantize_input_to_forget_weights.run(); + _quantize_input_to_forget_weights.run(); + } + // Pre-transpose weights to be used in GEMM. 
_input_to_forget_weights_transposed.allocator()->allocate(); _input_to_cell_weights_transposed.allocator()->allocate(); @@ -1032,16 +1473,25 @@ void NEQLSTMLayer::prepare() _transpose_recurrent_to_output_weights.run(); // Precompute effective biases - if(_has_cifg) + if (_has_cifg) { - std::fill_n(reinterpret_cast<int16_t *>(_ones.buffer()), _ones.info()->total_size() / _ones.info()->element_size(), 32767); + std::fill_n(reinterpret_cast<int16_t *>(_ones.buffer()), + _ones.info()->total_size() / _ones.info()->element_size(), 32767); } else { _input_to_input_eff_bias.allocator()->allocate(); _recurrent_to_input_eff_bias.allocator()->allocate(); - NEScheduler::get().schedule(&_input_to_input_reduction, Window::DimY); - NEScheduler::get().schedule(&_recurrent_to_input_reduction, Window::DimY); + + ITensorPack packII = {{TensorType::ACL_SRC, _input_to_input_weights}, + {TensorType::ACL_DST, &_input_to_input_eff_bias}}; + NEScheduler::get().schedule_op(_input_to_input_reduction.get(), Window::DimY, + _input_to_input_reduction->window(), packII); + + ITensorPack packRI = {{TensorType::ACL_SRC, _recurrent_to_input_weights}, + {TensorType::ACL_DST, &_recurrent_to_input_eff_bias}}; + NEScheduler::get().schedule_op(_recurrent_to_input_reduction.get(), Window::DimY, + _recurrent_to_input_reduction->window(), packRI); _input_to_input_weights_transposed.allocator()->allocate(); _recurrent_to_input_weights_transposed.allocator()->allocate(); @@ -1056,19 +1506,47 @@ void NEQLSTMLayer::prepare() _recurrent_to_cell_eff_bias.allocator()->allocate(); _input_to_output_eff_bias.allocator()->allocate(); _recurrent_to_output_eff_bias.allocator()->allocate(); - NEScheduler::get().schedule(&_input_to_forget_reduction, Window::DimY); - NEScheduler::get().schedule(&_recurrent_to_forget_reduction, Window::DimY); - NEScheduler::get().schedule(&_input_to_cell_reduction, Window::DimY); - NEScheduler::get().schedule(&_recurrent_to_cell_reduction, Window::DimY); - NEScheduler::get().schedule(&_input_to_output_reduction, Window::DimY); - NEScheduler::get().schedule(&_recurrent_to_output_reduction, Window::DimY); - - if(_has_projection) + + ITensorPack packIF = {{TensorType::ACL_SRC, _input_to_forget_weights}, + {TensorType::ACL_DST, &_input_to_forget_eff_bias}}; + NEScheduler::get().schedule_op(_input_to_forget_reduction.get(), Window::DimY, + _input_to_forget_reduction->window(), packIF); + + ITensorPack packRF = {{TensorType::ACL_SRC, _recurrent_to_forget_weights}, + {TensorType::ACL_DST, &_recurrent_to_forget_eff_bias}}; + NEScheduler::get().schedule_op(_recurrent_to_forget_reduction.get(), Window::DimY, + _recurrent_to_forget_reduction->window(), packRF); + + ITensorPack packIC = {{TensorType::ACL_SRC, _input_to_cell_weights}, + {TensorType::ACL_DST, &_input_to_cell_eff_bias}}; + NEScheduler::get().schedule_op(_input_to_cell_reduction.get(), Window::DimY, _input_to_cell_reduction->window(), + packIC); + + ITensorPack packRC = {{TensorType::ACL_SRC, _recurrent_to_cell_weights}, + {TensorType::ACL_DST, &_recurrent_to_cell_eff_bias}}; + NEScheduler::get().schedule_op(_recurrent_to_cell_reduction.get(), Window::DimY, + _recurrent_to_cell_reduction->window(), packRC); + + ITensorPack packIO = {{TensorType::ACL_SRC, _input_to_output_weights}, + {TensorType::ACL_DST, &_input_to_output_eff_bias}}; + NEScheduler::get().schedule_op(_input_to_output_reduction.get(), Window::DimY, + _input_to_output_reduction->window(), packIO); + + ITensorPack packRO = {{TensorType::ACL_SRC, _recurrent_to_output_weights}, + {TensorType::ACL_DST, 
&_recurrent_to_output_eff_bias}}; + NEScheduler::get().schedule_op(_recurrent_to_output_reduction.get(), Window::DimY, + _recurrent_to_output_reduction->window(), packRO); + + if (_has_projection) { - if(_projection_bias != nullptr) + _projection_eff_bias.allocator()->allocate(); + ITensorPack pack = {{TensorType::ACL_SRC, _projection_weights}, + {TensorType::ACL_DST, &_projection_eff_bias}}; + NEScheduler::get().schedule_op(_projection_reduction.get(), Window::DimY, _projection_reduction->window(), + pack); + if (_projection_bias != nullptr) { - _projection_eff_bias.allocator()->allocate(); - NEScheduler::get().schedule(&_projection_reduction, Window::DimY); + _projection_bias_add.run(); _projection_bias->mark_as_unused(); } @@ -1076,7 +1554,7 @@ void NEQLSTMLayer::prepare() _transpose_projection_weights.run(); _projection_weights->mark_as_unused(); - if(!_projection_tensor_copy_required) + if (!_projection_tensor_copy_required) { _hidden_gate.mark_as_unused(); _projection_accumulate_res.mark_as_unused(); @@ -1094,5 +1572,4 @@ void NEQLSTMLayer::prepare() _is_prepared = true; } } - } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEQuantizationLayer.cpp b/src/runtime/NEON/functions/NEQuantizationLayer.cpp index 47cc8b05c1..9b72783c97 100644 --- a/src/runtime/NEON/functions/NEQuantizationLayer.cpp +++ b/src/runtime/NEON/functions/NEQuantizationLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 ARM Limited. + * Copyright (c) 2017-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,27 +24,43 @@ #include "arm_compute/runtime/NEON/functions/NEQuantizationLayer.h" -#include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" -#include "support/MemorySupport.h" +#include "arm_compute/runtime/Tensor.h" + +#include "src/cpu/operators/CpuQuantize.h" namespace arm_compute { -Status NEQuantizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output) +struct NEQuantizationLayer::Impl { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_RETURN_ON_ERROR(NEQuantizationLayerKernel::validate(input, output)); + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr<cpu::CpuQuantize> op{nullptr}; +}; - return Status{}; +NEQuantizationLayer::NEQuantizationLayer() : _impl(std::make_unique<Impl>()) +{ +} +NEQuantizationLayer::~NEQuantizationLayer() = default; + +Status NEQuantizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output) +{ + return cpu::CpuQuantize::validate(input, output); } void NEQuantizationLayer::configure(const ITensor *input, ITensor *output) { - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + _impl->src = input; + _impl->dst = output; + _impl->op = std::make_unique<cpu::CpuQuantize>(); + _impl->op->configure(input->info(), output->info()); +} - // Configure quantize kernel - auto k = arm_compute::support::cpp14::make_unique<NEQuantizationLayerKernel>(); - k->configure(input, output); - _kernel = std::move(k); +void NEQuantizationLayer::run() +{ + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC, _impl->src); + pack.add_tensor(TensorType::ACL_DST, _impl->dst); + _impl->op->run(pack); } } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NERNNLayer.cpp b/src/runtime/NEON/functions/NERNNLayer.cpp index 154b060c3d..2824693800 100644 --- a/src/runtime/NEON/functions/NERNNLayer.cpp +++ b/src/runtime/NEON/functions/NERNNLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 ARM Limited. + * Copyright (c) 2018-2021 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -27,22 +27,40 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/NEON/NEScheduler.h" +#include "src/common/utils/Log.h" + namespace arm_compute { +NERNNLayer::~NERNNLayer() = default; + NERNNLayer::NERNNLayer(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _gemm_state_f(), _add_kernel(), _activation_kernel(), _fully_connected(memory_manager), _copy_kernel(), _fully_connected_out(), _gemm_output(), - _add_output(), _is_prepared(false) + : _memory_group(std::move(memory_manager)), + _gemm_state_f(), + _add_f(), + _activation(), + _fully_connected(memory_manager), + _copy_f(), + _fully_connected_out(), + _gemm_output(), + _add_output(), + _is_prepared(false) { } -Status NERNNLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *recurrent_weights, const ITensorInfo *bias, const ITensorInfo *hidden_state, - const ITensorInfo *output, const ActivationLayerInfo &info) +Status NERNNLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *recurrent_weights, + const ITensorInfo *bias, + const ITensorInfo *hidden_state, + const ITensorInfo *output, + const ActivationLayerInfo &info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state, output); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F16, DataType::F32); const int idx_width = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH); const int idx_height = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT); @@ -56,23 +74,34 @@ Status NERNNLayer::validate(const ITensorInfo *input, const ITensorInfo *weights ARM_COMPUTE_RETURN_ERROR_ON(hidden_state->dimension(idx_height) != input->dimension(idx_height)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), hidden_state->tensor_shape()); - auto shape_info = TensorInfo(misc::shape_calculator::compute_rnn_shape(recurrent_weights, hidden_state->dimension(idx_height)), 1, input->data_type()); + auto shape_info = + TensorInfo(misc::shape_calculator::compute_rnn_shape(recurrent_weights, hidden_state->dimension(idx_height)), 1, + input->data_type()); ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(input, weights, bias, &shape_info)); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAdditionKernel::validate(&shape_info, &shape_info, &shape_info, ConvertPolicy::SATURATE)); - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayerKernel::validate(&shape_info, &shape_info, info)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEArithmeticAddition::validate(&shape_info, &shape_info, &shape_info, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&shape_info, &shape_info, info)); return Status{}; } -void NERNNLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *recurrent_weights, const ITensor *bias, ITensor *hidden_state, ITensor *output, +void NERNNLayer::configure(const ITensor *input, + const ITensor *weights, + const ITensor *recurrent_weights, + const ITensor *bias, + ITensor *hidden_state, + ITensor *output, ActivationLayerInfo &info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state, output); - 
ARM_COMPUTE_ERROR_THROW_ON(NERNNLayer::validate(input->info(), weights->info(), recurrent_weights->info(), bias->info(), hidden_state->info(), output->info(), info)); + ARM_COMPUTE_ERROR_THROW_ON(NERNNLayer::validate(input->info(), weights->info(), recurrent_weights->info(), + bias->info(), hidden_state->info(), output->info(), info)); + ARM_COMPUTE_LOG_PARAMS(input, weights, recurrent_weights, bias, hidden_state, output, info); const int idx_height = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT); - TensorShape shape = misc::shape_calculator::compute_rnn_shape(recurrent_weights->info(), hidden_state->info()->dimension(idx_height)); + TensorShape shape = misc::shape_calculator::compute_rnn_shape(recurrent_weights->info(), + hidden_state->info()->dimension(idx_height)); _is_prepared = false; @@ -90,15 +119,15 @@ void NERNNLayer::configure(const ITensor *input, const ITensor *weights, const I _add_output.allocator()->init(TensorInfo(shape, 1, input->info()->data_type())); _memory_group.manage(&_add_output); - _add_kernel.configure(&_fully_connected_out, &_gemm_output, &_add_output, ConvertPolicy::SATURATE); + _add_f.configure(&_fully_connected_out, &_gemm_output, &_add_output, ConvertPolicy::SATURATE); _fully_connected_out.allocator()->allocate(); _gemm_output.allocator()->allocate(); - _activation_kernel.configure(&_add_output, hidden_state, info); + _activation.configure(&_add_output, hidden_state, info); _add_output.allocator()->allocate(); - _copy_kernel.configure(hidden_state, output); + _copy_f.configure(hidden_state, output); } void NERNNLayer::run() @@ -111,16 +140,16 @@ void NERNNLayer::run() _gemm_state_f.run(); - NEScheduler::get().schedule(&_add_kernel, Window::DimY); - NEScheduler::get().schedule(&_activation_kernel, Window::DimY); + _add_f.run(); + _activation.run(); // copy hidden out to output - NEScheduler::get().schedule(&_copy_kernel, Window::DimY); + _copy_f.run(); } void NERNNLayer::prepare() { - if(!_is_prepared) + if (!_is_prepared) { _fully_connected.prepare(); _gemm_state_f.prepare(); diff --git a/src/runtime/NEON/functions/NEROIAlignLayer.cpp b/src/runtime/NEON/functions/NEROIAlignLayer.cpp index 2299bf78a5..68bb5d5ef3 100644 --- a/src/runtime/NEON/functions/NEROIAlignLayer.cpp +++ b/src/runtime/NEON/functions/NEROIAlignLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 ARM Limited. + * Copyright (c) 2019-2021 Arm Limited. 
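// NERNNLayer above swaps its raw kernels for function members (_add_f, _activation,
// _copy_f), so run() invokes them directly instead of pushing kernels through
// NEScheduler. The step computed is hidden_state = act(W * input + R * hidden_state + b),
// with the result also copied to output. Configuration sketch; dimensions and the
// activation choice are illustrative assumptions:
// @code
// Tensor input, weights, recurrent_weights, bias, hidden_state, output;
// ActivationLayerInfo act(ActivationLayerInfo::ActivationFunction::TANH);
// NERNNLayer rnn; // F16/F32 only, per the new data-type check in validate()
// rnn.configure(&input, &weights, &recurrent_weights, &bias, &hidden_state, &output, act);
// rnn.run(); // fully connected -> GEMM on state -> add -> activation -> copy to output
// @endcode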
* * SPDX-License-Identifier: MIT * @@ -23,22 +23,31 @@ */ #include "arm_compute/runtime/NEON/functions/NEROIAlignLayer.h" -#include "arm_compute/core/NEON/kernels/NEROIAlignLayerKernel.h" -#include "support/MemorySupport.h" +#include "src/common/utils/Log.h" +#include "src/core/NEON/kernels/NEFillBorderKernel.h" +#include "src/core/NEON/kernels/NEROIAlignLayerKernel.h" namespace arm_compute { -Status NEROIAlignLayer::validate(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info) +Status NEROIAlignLayer::validate(const ITensorInfo *input, + const ITensorInfo *rois, + ITensorInfo *output, + const ROIPoolingLayerInfo &pool_info) { ARM_COMPUTE_RETURN_ON_ERROR(NEROIAlignLayerKernel::validate(input, rois, output, pool_info)); return Status{}; } -void NEROIAlignLayer::configure(const ITensor *input, const ITensor *rois, ITensor *output, const ROIPoolingLayerInfo &pool_info) +void NEROIAlignLayer::configure(const ITensor *input, + const ITensor *rois, + ITensor *output, + const ROIPoolingLayerInfo &pool_info) { + ARM_COMPUTE_LOG_PARAMS(input, rois, output, pool_info); + // Configure ROI pooling kernel - auto k = arm_compute::support::cpp14::make_unique<NEROIAlignLayerKernel>(); + auto k = std::make_unique<NEROIAlignLayerKernel>(); k->configure(input, rois, output, pool_info); _kernel = std::move(k); } diff --git a/src/runtime/NEON/functions/NEROIPoolingLayer.cpp b/src/runtime/NEON/functions/NEROIPoolingLayer.cpp index 3aca4b7b60..babec4aa92 100644 --- a/src/runtime/NEON/functions/NEROIPoolingLayer.cpp +++ b/src/runtime/NEON/functions/NEROIPoolingLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2017-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,23 +24,40 @@ #include "arm_compute/runtime/NEON/functions/NEROIPoolingLayer.h" #include "arm_compute/core/Helpers.h" -#include "arm_compute/core/NEON/kernels/NEROIPoolingLayerKernel.h" #include "arm_compute/runtime/NEON/NEScheduler.h" +#include "src/common/utils/Log.h" +#include "src/core/NEON/kernels/NEROIPoolingLayerKernel.h" + namespace arm_compute { -NEROIPoolingLayer::NEROIPoolingLayer() - : _roi_kernel() +NEROIPoolingLayer::~NEROIPoolingLayer() = default; + +NEROIPoolingLayer::NEROIPoolingLayer() : _roi_kernel() +{ +} + +Status NEROIPoolingLayer::validate(const ITensorInfo *input, + const ITensorInfo *rois, + const ITensorInfo *output, + const ROIPoolingLayerInfo &pool_info) { + return NEROIPoolingLayerKernel::validate(input, rois, output, pool_info); } -void NEROIPoolingLayer::configure(const ITensor *input, const ITensor *rois, ITensor *output, const ROIPoolingLayerInfo &pool_info) +void NEROIPoolingLayer::configure(const ITensor *input, + const ITensor *rois, + const ITensor *output, + const ROIPoolingLayerInfo &pool_info) { - _roi_kernel.configure(input, rois, output, pool_info); + ARM_COMPUTE_LOG_PARAMS(input, rois, output, pool_info); + + _roi_kernel = std::make_unique<NEROIPoolingLayerKernel>(); + _roi_kernel->configure(input, rois, output, pool_info); } void NEROIPoolingLayer::run() { - NEScheduler::get().schedule(&_roi_kernel, Window::DimX); + NEScheduler::get().schedule(_roi_kernel.get(), Window::DimX); } -} // namespace arm_compute
\ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/NEON/functions/NERange.cpp b/src/runtime/NEON/functions/NERange.cpp index 977d502286..95492df126 100644 --- a/src/runtime/NEON/functions/NERange.cpp +++ b/src/runtime/NEON/functions/NERange.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 ARM Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -25,16 +25,22 @@ #include "arm_compute/runtime/NEON/NEScheduler.h" +#include "src/common/utils/Log.h" +#include "src/core/NEON/kernels/NERangeKernel.h" + namespace arm_compute { -NERange::NERange() - : _kernel() +NERange::~NERange() = default; + +NERange::NERange() : _kernel() { } void NERange::configure(ITensor *output, const float start, const float end, const float step) { - _kernel.configure(output, start, end, step); + ARM_COMPUTE_LOG_PARAMS(output, start, end, step); + _kernel = std::make_unique<NERangeKernel>(); + _kernel->configure(output, start, end, step); } Status NERange::validate(const ITensorInfo *output, const float start, const float end, const float step) @@ -44,6 +50,6 @@ Status NERange::validate(const ITensorInfo *output, const float start, const flo void NERange::run() { - NEScheduler::get().schedule(&_kernel, Window::DimX); + NEScheduler::get().schedule(_kernel.get(), Window::DimX); } -} // namespace arm_compute
\ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEReduceMean.cpp b/src/runtime/NEON/functions/NEReduceMean.cpp index d53ed31645..a23db87059 100644 --- a/src/runtime/NEON/functions/NEReduceMean.cpp +++ b/src/runtime/NEON/functions/NEReduceMean.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 ARM Limited. + * Copyright (c) 2018-2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,29 +23,26 @@ */ #include "arm_compute/runtime/NEON/functions/NEReduceMean.h" -#include "arm_compute/core/CPP/Validate.h" #include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" + +#include "src/common/utils/Log.h" +#include "src/core/CPP/Validate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/NEON/kernels/NEReductionOperationKernel.h" namespace arm_compute { namespace { -} // namespace - -NEReduceMean::NEReduceMean(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _reduction_kernels(), _reduced_outs(), _reshape(), _reduction_ops(), _keep_dims() -{ -} - -Status validate_config(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output) +Status +validate_config(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output) { ARM_COMPUTE_UNUSED(keep_dims); ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, + DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() < 1); ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() > input->num_dimensions()); @@ -53,29 +50,36 @@ Status validate_config(const ITensorInfo *input, const Coordinates &reduction_ax const int input_dims = input->num_dimensions(); Coordinates axis_local = reduction_axis; - for(unsigned int i = 0; i < axis_local.num_dimensions(); ++i) + for (unsigned int i = 0; i < axis_local.num_dimensions(); ++i) { //axis: The dimensions to reduce. Must be in the range [-rank(input_tensor), rank(input_tensor)). 
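// For example, with a rank-4 input the accepted axes are [-4, 3]; convert_negative_axis()
// below is assumed to wrap each negative entry by adding the rank, so -1 maps to 3.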
ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] < (-static_cast<int>(input->num_dimensions()))); ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] >= static_cast<int>(input->num_dimensions())); } - if(output->tensor_shape().total_size() != 0) + if (output->tensor_shape().total_size() != 0) { // Only validate if not using auto_init for the output tensor TensorShape out_shape = input->tensor_shape(); // Validate output_shape only if not using auto_init convert_negative_axis(axis_local, input_dims); + +// Suppress warning produced by a compiler bug in GCC +// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=104165 +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Warray-bounds" std::sort(axis_local.begin(), axis_local.begin() + reduction_ops); - for(unsigned int i = 0; i < reduction_ops; ++i) +#pragma GCC diagnostic pop + + for (unsigned int i = 0; i < reduction_ops; ++i) { ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] > 3); ARM_COMPUTE_RETURN_ERROR_ON(static_cast<unsigned int>(axis_local[i]) > input->num_dimensions() - 1); - if(output->total_size() > 0 && keep_dims) + if (output->total_size() > 0 && keep_dims) { ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(axis_local[i]) != 1); } - if(keep_dims) + if (keep_dims) { out_shape.set(axis_local[i], 1); } @@ -84,27 +88,45 @@ Status validate_config(const ITensorInfo *input, const Coordinates &reduction_ax ARM_COMPUTE_RETURN_ERROR_ON(i > static_cast<unsigned int>(axis_local[i])); const unsigned int remove_index = axis_local[i] - i; ARM_COMPUTE_RETURN_ERROR_ON(remove_index >= out_shape.num_dimensions()); - out_shape.remove_dimension(remove_index); + out_shape.remove_dimension(remove_index, false); } } const TensorInfo out_info = input->clone()->set_tensor_shape(out_shape); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &out_info); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output); } return Status{}; } +} // namespace + +NEReduceMean::~NEReduceMean() = default; -Status NEReduceMean::validate(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output) +NEReduceMean::NEReduceMean(std::shared_ptr<IMemoryManager> memory_manager) + : _memory_group(std::move(memory_manager)), + _reduction_kernels(), + _reduced_outs(), + _reshape(), + _reduction_ops(), + _keep_dims() +{ +} + +Status NEReduceMean::validate(const ITensorInfo *input, + const Coordinates &reduction_axis, + bool keep_dims, + const ITensorInfo *output) { return validate_config(input, reduction_axis, keep_dims, output); } void NEReduceMean::configure(ITensor *input, const Coordinates &reduction_axis, bool keep_dims, ITensor *output) { + ARM_COMPUTE_LOG_PARAMS(input, reduction_axis, keep_dims, output); + // Perform validate step ARM_COMPUTE_ERROR_THROW_ON(NEReduceMean::validate(input->info(), reduction_axis, keep_dims, output->info())); // Output auto inizialitation if not yet initialized - const TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_reduce_mean_shape(input, reduction_axis, keep_dims); + const TensorShape output_shape = + arm_compute::misc::shape_calculator::calculate_reduce_mean_shape(input->info(), reduction_axis, keep_dims); auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape)); _reduction_ops = reduction_axis.num_dimensions(); @@ -112,61 +134,72 @@ void NEReduceMean::configure(ITensor *input, const Coordinates &reduction_axis, _reduced_outs.resize(_reduction_ops - (keep_dims ? 
1 : 0)); _keep_dims = keep_dims; + ITensor *tmp_input = input; + ITensor *tmp_output = output; + Coordinates axis_local = reduction_axis; - const int input_dims = input->info()->num_dimensions(); + const int input_dims = tmp_input->info()->num_dimensions(); convert_negative_axis(axis_local, input_dims); // Perform reduction for every axis - for(int i = 0; i < _reduction_ops; ++i) + for (int i = 0; i < _reduction_ops; ++i) { - TensorShape out_shape = i == 0 ? input->info()->tensor_shape() : (&_reduced_outs[i - 1])->info()->tensor_shape(); + TensorShape out_shape = + i == 0 ? tmp_input->info()->tensor_shape() : (&_reduced_outs[i - 1])->info()->tensor_shape(); out_shape.set(axis_local[i], 1); - auto in = (i == 0) ? input : (&_reduced_outs[i - 1]); + auto in = (i == 0) ? tmp_input : (&_reduced_outs[i - 1]); - if(i == _reduction_ops - 1 && keep_dims) + if (i == _reduction_ops - 1 && keep_dims) { - _reduction_kernels[i].configure(in, output, axis_local[i], ReductionOperation::MEAN_SUM); + _reduction_kernels[i].configure(in, tmp_output, axis_local[i], ReductionOperation::MEAN_SUM); } else { - _reduced_outs[i].allocator()->init(TensorInfo(out_shape, input->info()->num_channels(), input->info()->data_type(), input->info()->quantization_info())); + _reduced_outs[i].allocator()->init(TensorInfo(out_shape, tmp_output->info()->num_channels(), + tmp_output->info()->data_type(), + tmp_output->info()->quantization_info())); _memory_group.manage(&_reduced_outs[i]); _reduction_kernels[i].configure(in, &_reduced_outs[i], axis_local[i], ReductionOperation::MEAN_SUM); } } // Allocate intermediate tensors - for(int i = 0; i < _reduction_ops - (keep_dims ? 1 : 0); ++i) + for (int i = 0; i < _reduction_ops - (keep_dims ? 1 : 0); ++i) { _reduced_outs[i].allocator()->allocate(); } - // Configure reshape layer if we want to drop the dimensions - if(!keep_dims) + if (!keep_dims) { - TensorShape out_shape = input->info()->tensor_shape(); + TensorShape out_shape = tmp_input->info()->tensor_shape(); // We have to sort the reduction axis vectors in order for remove_dimension // to work properly + +// Suppress warning produced by a compiler bug in GCC +// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=104165 +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Warray-bounds" std::sort(axis_local.begin(), axis_local.begin() + _reduction_ops); - for(int i = 0; i < _reduction_ops; ++i) +#pragma GCC diagnostic pop + + for (int i = 0; i < _reduction_ops; ++i) { - out_shape.remove_dimension(axis_local[i] - i); + out_shape.remove_dimension(axis_local[i] - i, false); } - auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(out_shape)); - _reshape.configure(&_reduced_outs[_reduction_ops - 1], output); + auto_init_if_empty(*tmp_output->info(), tmp_input->info()->clone()->set_tensor_shape(out_shape)); + _reshape.configure(&_reduced_outs[_reduction_ops - 1], tmp_output); } } void NEReduceMean::run() { MemoryGroupResourceScope scope_mg(_memory_group); - for(auto &kernel : _reduction_kernels) + for (auto &kernel : _reduction_kernels) { kernel.run(); } - - if(!_keep_dims) + if (!_keep_dims) { _reshape.run(); } diff --git a/src/runtime/NEON/functions/NEReductionOperation.cpp b/src/runtime/NEON/functions/NEReductionOperation.cpp index 80ebe6731a..8540d750fc 100644 --- a/src/runtime/NEON/functions/NEReductionOperation.cpp +++ b/src/runtime/NEON/functions/NEReductionOperation.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 ARM Limited. + * Copyright (c) 2017-2021 Arm Limited. 
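// NEReduceMean chains one MEAN_SUM reduction per requested axis and, when keep_dims is
// false, finishes with a reshape that drops the reduced dimensions. Usage sketch; the
// shape and axes are illustrative assumptions:
// @code
// Tensor src, dst;
// src.allocator()->init(TensorInfo(TensorShape(14U, 14U, 64U), 1, DataType::F32));
// NEReduceMean mean;
// mean.configure(&src, Coordinates(0, 1), /* keep_dims = */ false, &dst); // mean over W,H -> (64)
// src.allocator()->allocate();
// dst.allocator()->allocate(); // dst info was auto-initialised by configure()
// mean.run();
// @endcode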
* * SPDX-License-Identifier: MIT * @@ -27,6 +27,10 @@ #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/NEON/NEScheduler.h" +#include "src/common/utils/Log.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/NEON/kernels/NEReductionOperationKernel.h" + namespace arm_compute { namespace @@ -39,7 +43,7 @@ namespace */ size_t reduction_window_split_dimension(unsigned int axis) { - switch(axis) + switch (axis) { case 0: return Window::DimY; @@ -53,14 +57,24 @@ size_t reduction_window_split_dimension(unsigned int axis) } } // namespace +NEReductionOperation::~NEReductionOperation() = default; + NEReductionOperation::NEReductionOperation(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(memory_manager), _reduction_kernel(), _fill_border_kernel(), _reshape_kernel(), _output_internal(), _window_split(0), _reduction_axis(), _is_reshape_required(false) + : _memory_group(memory_manager), + _reduction_kernel(), + _reshape(), + _output_internal(), + _window_split(0), + _reduction_axis(), + _is_reshape_required(false) { } -Status NEReductionOperation::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op, bool keep_dims) +Status NEReductionOperation::validate( + const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op, bool keep_dims) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, + "Reduction axis greater than max number of dimensions"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis"); const auto is_reshape_required = !keep_dims; @@ -69,9 +83,10 @@ Status NEReductionOperation::validate(const ITensorInfo *input, const ITensorInf TensorInfo info_before_reshape; - if(is_reshape_required) + if (is_reshape_required) { - const TensorInfo expected_output_shape = output->clone()->set_tensor_shape(arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis, keep_dims)); + const TensorInfo expected_output_shape = output->clone()->set_tensor_shape( + arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis, keep_dims)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&expected_output_shape, output); auto shape_before_reshape = input->tensor_shape(); @@ -79,113 +94,88 @@ Status NEReductionOperation::validate(const ITensorInfo *input, const ITensorInf const auto input_num_channles = input->num_channels(); const auto input_qinfo = input->quantization_info(); - const auto is_arg_min_max = (op == ReductionOperation::ARG_IDX_MAX) || (op == ReductionOperation::ARG_IDX_MIN); - const auto output_data_type = is_arg_min_max ? DataType::S32 : output->data_type(); + const auto is_arg_min_max = (op == ReductionOperation::ARG_IDX_MAX) || (op == ReductionOperation::ARG_IDX_MIN); + const auto output_data_type = is_arg_min_max ? 
DataType::S32 : output->data_type(); - info_before_reshape.set_data_type(output_data_type).set_tensor_shape(shape_before_reshape).set_num_channels(input_num_channles).set_quantization_info(input_qinfo); + info_before_reshape.set_data_type(output_data_type) + .set_tensor_shape(shape_before_reshape) + .set_num_channels(input_num_channles) + .set_quantization_info(input_qinfo); output_internal = &info_before_reshape; } ARM_COMPUTE_RETURN_ON_ERROR(NEReductionOperationKernel::validate(input, output_internal, axis, op)); - if(is_reshape_required) + if (is_reshape_required) { - ARM_COMPUTE_RETURN_ON_ERROR(NEReshapeLayerKernel::validate(output_internal, output)); + ARM_COMPUTE_RETURN_ON_ERROR(NEReshapeLayer::validate(output_internal, output)); } return Status{}; } -void NEReductionOperation::configure(ITensor *input, ITensor *output, unsigned int axis, ReductionOperation op, bool keep_dims) +void NEReductionOperation::configure( + ITensor *input, ITensor *output, unsigned int axis, ReductionOperation op, bool keep_dims) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_LOG_PARAMS(input, output, axis, op, keep_dims); _is_reshape_required = !keep_dims; auto *output_internal = output; const auto is_arg_min_max = (op == ReductionOperation::ARG_IDX_MAX) || (op == ReductionOperation::ARG_IDX_MIN); - if(_is_reshape_required) + if (_is_reshape_required) { - const auto output_internal_shape = arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis); - const auto output_external_shape = arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis, false); - const auto output_data_type = is_arg_min_max ? DataType::S32 : input->info()->data_type(); - const auto num_channels = input->info()->num_channels(); - const auto qinfo = input->info()->quantization_info(); - - _output_internal.allocator()->init(input->info()->clone()->set_data_type(output_data_type).set_tensor_shape(output_internal_shape).reset_padding().set_is_resizable(true).set_num_channels( - num_channels).set_quantization_info(qinfo)); + const auto output_internal_shape = + arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis); + const auto output_external_shape = + arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis, false); + const auto output_data_type = is_arg_min_max ? 
DataType::S32 : input->info()->data_type(); + const auto num_channels = input->info()->num_channels(); + const auto qinfo = input->info()->quantization_info(); + + _output_internal.allocator()->init(input->info() + ->clone() + ->set_data_type(output_data_type) + .set_tensor_shape(output_internal_shape) + .reset_padding() + .set_is_resizable(true) + .set_num_channels(num_channels) + .set_quantization_info(qinfo)); _memory_group.manage(&_output_internal); output_internal = &_output_internal; - auto_init_if_empty(*output->info(), input->info()->clone()->set_data_type(output_data_type).set_tensor_shape(output_external_shape).reset_padding().set_is_resizable(true)); + auto_init_if_empty(*output->info(), input->info() + ->clone() + ->set_data_type(output_data_type) + .set_tensor_shape(output_external_shape) + .reset_padding() + .set_is_resizable(true)); } ARM_COMPUTE_ERROR_THROW_ON(NEReductionOperation::validate(input->info(), output->info(), axis, op, keep_dims)); // Configure reduction kernel - _reduction_kernel.configure(input, output_internal, axis, op); + _reduction_kernel = std::make_unique<NEReductionOperationKernel>(); + _reduction_kernel->configure(input, output_internal, axis, op); _window_split = reduction_window_split_dimension(axis); _reduction_axis = axis; - if(axis == 0) + if (_is_reshape_required) { - // Configure fill border kernel - const BorderSize fill_border_size = _reduction_kernel.border_size(); - PixelValue pixelValue; - switch(op) - { - case ReductionOperation::PROD: - { - pixelValue = PixelValue(1, input->info()->data_type(), input->info()->quantization_info()); - break; - } - case ReductionOperation::MIN: - { - pixelValue = std::get<1>(get_min_max(input->info()->data_type())); - break; - } - case ReductionOperation::MAX: - { - pixelValue = std::get<0>(get_min_max(input->info()->data_type())); - break; - } - case ReductionOperation::ARG_IDX_MAX: - case ReductionOperation::ARG_IDX_MIN: - { - pixelValue = PixelValue(0, input->info()->data_type(), input->info()->quantization_info()); - break; - } - case ReductionOperation::MEAN_SUM: - case ReductionOperation::SUM_SQUARE: - case ReductionOperation::SUM: - { - pixelValue = PixelValue(static_cast<uint32_t>(0)); - break; - } - default: - ARM_COMPUTE_ERROR("Reduction Operation unsupported"); - } - _fill_border_kernel.configure(input, fill_border_size, (is_arg_min_max ? BorderMode::REPLICATE : BorderMode::CONSTANT), pixelValue); - } - - if(_is_reshape_required) - { - _reshape_kernel.configure(output_internal, output); + _reshape.configure(output_internal, output); _output_internal.allocator()->allocate(); } } void NEReductionOperation::run() { - if(_reduction_axis == 0) - { - NEScheduler::get().schedule(&_fill_border_kernel, Window::DimY); - } - NEScheduler::get().schedule(&_reduction_kernel, _window_split); - if(_is_reshape_required) + MemoryGroupResourceScope scope_mg(_memory_group); + NEScheduler::get().schedule(_reduction_kernel.get(), _window_split); + if (_is_reshape_required) { - NEScheduler::get().schedule(&_reshape_kernel, Window::DimY); + _reshape.run(); } } } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NERemap.cpp b/src/runtime/NEON/functions/NERemap.cpp deleted file mode 100644 index 12c9f7b4e8..0000000000 --- a/src/runtime/NEON/functions/NERemap.cpp +++ /dev/null @@ -1,53 +0,0 @@ -/* - * Copyright (c) 2016-2020 ARM Limited. 
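// Two behavioural changes in the NEReductionOperation rewrite above: the fill-border
// pre-pass for axis-0 reductions is removed along with its per-operation border values,
// and run() now opens a MemoryGroupResourceScope so the internal tensor used before the
// reshape is acquired and released per invocation. Sketch of an axis-0 sum; shapes are
// illustrative assumptions:
// @code
// Tensor src, dst;
// src.allocator()->init(TensorInfo(TensorShape(128U, 24U), 1, DataType::F32));
// dst.allocator()->init(TensorInfo(TensorShape(24U), 1, DataType::F32));
// NEReductionOperation red;
// red.configure(&src, &dst, /* axis = */ 0U, ReductionOperation::SUM, /* keep_dims = */ false);
// src.allocator()->allocate();
// dst.allocator()->allocate();
// red.run(); // axis 0 splits work over Window::DimY, per reduction_window_split_dimension()
// @endcode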
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/NEON/functions/NERemap.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/kernels/NERemapKernel.h" -#include "arm_compute/core/PixelValue.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/runtime/TensorAllocator.h" -#include "support/MemorySupport.h" - -#include <utility> - -using namespace arm_compute; - -void NERemap::configure(ITensor *input, const ITensor *map_x, const ITensor *map_y, ITensor *output, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(map_x, 1, DataType::F32); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(map_y, 1, DataType::F32); - ARM_COMPUTE_ERROR_ON_MSG(policy == InterpolationPolicy::AREA, "Area interpolation is not supported"); - - auto k = arm_compute::support::cpp14::make_unique<NERemapKernel>(); - - k->configure(input, map_x, map_y, output, policy); - - _kernel = std::move(k); - _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); -} diff --git a/src/runtime/NEON/functions/NEIm2Col.cpp b/src/runtime/NEON/functions/NEReorderLayer.cpp index 3cb0dc1762..89cf575f38 100644 --- a/src/runtime/NEON/functions/NEIm2Col.cpp +++ b/src/runtime/NEON/functions/NEReorderLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 ARM Limited. + * Copyright (c) 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,33 +21,46 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#include "arm_compute/runtime/NEON/functions/NEIm2Col.h" +#if defined(__aarch64__) + +#include "arm_compute/runtime/NEON/functions/NEReorderLayer.h" -#include "arm_compute/core/TensorInfo.h" #include "arm_compute/runtime/NEON/NEScheduler.h" +#include "src/core/NEON/kernels/NEReorderKernel.h" + namespace arm_compute { -NEIm2Col::NEIm2Col() - : _kernel(), _y_dim(1) +NEReorderLayer::~NEReorderLayer() = default; + +NEReorderLayer::NEReorderLayer() : _reorder_kernel(std::make_unique<NEReorderKernel>()) { } -void NEIm2Col::configure(const ITensor *input, ITensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation, unsigned int num_groups) +void NEReorderLayer::configure(const ITensor *input, + ITensor *output, + arm_compute::WeightFormat input_wf, + arm_compute::WeightFormat output_wf) { - _y_dim = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT); - - _kernel.configure(input, output, kernel_dims, conv_info, has_bias, dilation, num_groups); + auto k = std::make_unique<NEReorderKernel>(); + k->configure(input, output, input_wf, output_wf); + _reorder_kernel = std::move(k); } -Status NEIm2Col::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation, - unsigned int num_groups) +void NEReorderLayer::run() { - return NEIm2ColKernel::validate(input, output, kernel_dims, conv_info, has_bias, dilation, num_groups); + // Run Reorder + NEScheduler::get().schedule(_reorder_kernel.get(), Window::DimX); } -void NEIm2Col::run() +Status NEReorderLayer::validate(const ITensorInfo *input, + const ITensorInfo *output, + arm_compute::WeightFormat input_wf, + arm_compute::WeightFormat output_wf) { - NEScheduler::get().schedule(&_kernel, _y_dim); + return NEReorderKernel::validate(input, output, input_wf, output_wf); } + } // namespace arm_compute + +#endif // defined(__aarch64__) diff --git a/src/runtime/NEON/functions/NEReorgLayer.cpp b/src/runtime/NEON/functions/NEReorgLayer.cpp index dc8f5f1f66..14e41d6df4 100644 --- a/src/runtime/NEON/functions/NEReorgLayer.cpp +++ b/src/runtime/NEON/functions/NEReorgLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 ARM Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,14 +23,16 @@ */ #include "arm_compute/runtime/NEON/functions/NEReorgLayer.h" -#include "arm_compute/core/NEON/kernels/NEReorgLayerKernel.h" -#include "support/MemorySupport.h" +#include "src/common/utils/Log.h" +#include "src/core/NEON/kernels/NEReorgLayerKernel.h" namespace arm_compute { void NEReorgLayer::configure(const ITensor *input, ITensor *output, int32_t stride) { - auto k = arm_compute::support::cpp14::make_unique<NEReorgLayerKernel>(); + ARM_COMPUTE_LOG_PARAMS(input, output, stride); + + auto k = std::make_unique<NEReorgLayerKernel>(); k->configure(input, output, stride); _kernel = std::move(k); } diff --git a/src/runtime/NEON/functions/NEReshapeLayer.cpp b/src/runtime/NEON/functions/NEReshapeLayer.cpp index 0a9f42d510..bed70ff66c 100644 --- a/src/runtime/NEON/functions/NEReshapeLayer.cpp +++ b/src/runtime/NEON/functions/NEReshapeLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 ARM Limited. + * Copyright (c) 2017-2021 Arm Limited. 
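// NEReorderLayer is a new, aarch64-only function: it owns a single NEReorderKernel and
// run() schedules it over Window::DimX to rewrite weight tensors from one WeightFormat
// blocking to another. Usage sketch; the concrete WeightFormat pair (and that this pair
// suits a given kernel) is an illustrative assumption:
// @code
// #if defined(__aarch64__)
// Tensor src_w, dst_w; // initialised and allocated elsewhere
// NEReorderLayer reorder;
// reorder.configure(&src_w, &dst_w, WeightFormat::OHWI, WeightFormat::OHWIo4);
// reorder.run();
// #endif // __aarch64__
// @endcode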
* * SPDX-License-Identifier: MIT * @@ -23,26 +23,51 @@ */ #include "arm_compute/runtime/NEON/functions/NEReshapeLayer.h" -#include "arm_compute/core/NEON/kernels/NEReshapeLayerKernel.h" #include "arm_compute/core/Validate.h" -#include "support/MemorySupport.h" + +#include "src/cpu/operators/CpuReshape.h" #include <utility> namespace arm_compute { +struct NEReshapeLayer::Impl +{ + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr<cpu::CpuReshape> op{nullptr}; +}; + +NEReshapeLayer::NEReshapeLayer() : _impl(std::make_unique<Impl>()) +{ +} +NEReshapeLayer::NEReshapeLayer(NEReshapeLayer &&) = default; +NEReshapeLayer &NEReshapeLayer::operator=(NEReshapeLayer &&) = default; +NEReshapeLayer::~NEReshapeLayer() = default; + void NEReshapeLayer::configure(const ITensor *input, ITensor *output) { - auto k = arm_compute::support::cpp14::make_unique<NEReshapeLayerKernel>(); - k->configure(input, output); - _kernel = std::move(k); + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + + _impl->src = input; + _impl->dst = output; + _impl->op = std::make_unique<cpu::CpuReshape>(); + _impl->op->configure(input->info(), output->info()); } Status NEReshapeLayer::validate(const ITensorInfo *input, const ITensorInfo *output) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_RETURN_ON_ERROR(NEReshapeLayerKernel::validate(input, output)); + ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuReshape::validate(input, output)); return Status{}; } + +void NEReshapeLayer::run() +{ + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC, _impl->src); + pack.add_tensor(TensorType::ACL_DST, _impl->dst); + _impl->op->run(pack); +} } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEReverse.cpp b/src/runtime/NEON/functions/NEReverse.cpp index a950826270..a90f8d2e76 100644 --- a/src/runtime/NEON/functions/NEReverse.cpp +++ b/src/runtime/NEON/functions/NEReverse.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 ARM Limited. + * Copyright (c) 2018-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,20 +23,25 @@ */ #include "arm_compute/runtime/NEON/functions/NEReverse.h" -#include "arm_compute/core/NEON/kernels/NEReverseKernel.h" -#include "support/MemorySupport.h" +#include "src/common/utils/Log.h" +#include "src/core/NEON/kernels/NEReverseKernel.h" namespace arm_compute { -void NEReverse::configure(const ITensor *input, ITensor *output, const ITensor *axis) +void NEReverse::configure(const ITensor *input, ITensor *output, const ITensor *axis, bool use_inverted_axis) { - auto k = arm_compute::support::cpp14::make_unique<NEReverseKernel>(); - k->configure(input, output, axis); + ARM_COMPUTE_LOG_PARAMS(input, output, axis); + + auto k = std::make_unique<NEReverseKernel>(); + k->configure(input, output, axis, use_inverted_axis); _kernel = std::move(k); } -Status NEReverse::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis) +Status NEReverse::validate(const ITensorInfo *input, + const ITensorInfo *output, + const ITensorInfo *axis, + bool use_inverted_axis) { - return NEReverseKernel::validate(input, output, axis); + return NEReverseKernel::validate(input, output, axis, use_inverted_axis); } } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEScale.cpp b/src/runtime/NEON/functions/NEScale.cpp index acde0cfcc5..0d011064f6 100644 --- a/src/runtime/NEON/functions/NEScale.cpp +++ b/src/runtime/NEON/functions/NEScale.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020 ARM Limited. + * Copyright (c) 2016-2023 Arm Limited. 
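// NEReverse above gains a use_inverted_axis flag, forwarded unchanged to NEReverseKernel,
// which selects how the entries of the axis tensor are interpreted. Sketch; shapes, the
// axis data type and the flag value are illustrative assumptions:
// @code
// Tensor src, dst, axis; // axis: 1-D tensor listing the dimensions to reverse
// axis.allocator()->init(TensorInfo(TensorShape(1U), 1, DataType::U32));
// NEReverse rev;
// rev.configure(&src, &dst, &axis, /* use_inverted_axis = */ false);
// @endcode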
* * SPDX-License-Identifier: MIT * @@ -23,220 +23,122 @@ */ #include "arm_compute/runtime/NEON/functions/NEScale.h" -#include "arm_compute/core/Coordinates.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/PixelValue.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Window.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" -#include "arm_compute/runtime/TensorAllocator.h" +#include "arm_compute/runtime/Tensor.h" -#include <cmath> -#include <cstddef> -#include <utility> +#include "src/common/utils/Log.h" +#include "src/core/utils/ScaleUtils.h" +#include "src/cpu/operators/CpuScale.h" -using namespace arm_compute; - -namespace +namespace arm_compute { -void precompute_dx_dy_offsets(ITensor *dx, ITensor *dy, ITensor *offsets, float wr, float hr, size_t input_element_size, SamplingPolicy sampling_policy) +struct NEScale::Impl { - ARM_COMPUTE_ERROR_ON(nullptr == offsets); - ARM_COMPUTE_UNUSED(sampling_policy); - float sampling_offset = 0.0f; - if(sampling_policy == SamplingPolicy::CENTER) - { - sampling_offset = 0.5f; - } - - Window win; - win.set(Window::DimX, Window::Dimension(0, offsets->info()->dimension(0), 1)); - win.set(Window::DimY, Window::Dimension(0, offsets->info()->dimension(1), 1)); - - if(dx != nullptr && dy != nullptr) - { - // Pre-compute the offset and pixel's distance for BILINEAR interpolation - Iterator offsets_it(offsets, win); - Iterator dx_it(dx, win); - Iterator dy_it(dy, win); - - execute_window_loop(win, [&](const Coordinates & id) - { - const float in_x = (id.x() + sampling_offset) * wr - sampling_offset; - const float in_y = (id.y() + sampling_offset) * hr - sampling_offset; - const int in_xi = std::floor(in_x); - const int in_yi = std::floor(in_y); - - *reinterpret_cast<int32_t *>(offsets_it.ptr()) = in_xi * static_cast<int>(input_element_size); - *reinterpret_cast<float *>(dx_it.ptr()) = in_x - in_xi; - *reinterpret_cast<float *>(dy_it.ptr()) = in_y - in_yi; - }, - offsets_it, dx_it, dy_it); - } - else - { - // Pre-compute the offset for NEAREST interpolation - Iterator offsets_it(offsets, win); - - execute_window_loop(win, [&](const Coordinates & id) - { - const size_t in_xi = std::floor((id.x() + sampling_offset) * wr); - - *reinterpret_cast<int32_t *>(offsets_it.ptr()) = in_xi * input_element_size; - }, - offsets_it); - } -} -} // namespace - -NEScale::NEScale() // NOLINT - : _offsets(), - _dx(), - _dy(), - _scale_kernel(), - _border_handler(), - _use_padding(true), - _align_corners(false) + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + Tensor dx{nullptr}; /**< Element's distance between the X real coordinate and the smallest X following integer */ + Tensor dy{nullptr}; /**< Element's distance between the Y real coordinate and the smallest Y following integer */ + Tensor offsets{ + nullptr}; /**< Offset to access the element with NEAREST interpolation or the top-left element with BILINEAR interpolation in the input tensor */ + std::unique_ptr<cpu::CpuScale> op{nullptr}; +}; + +NEScale::NEScale() : _impl(std::make_unique<Impl>()) { } +NEScale::~NEScale() = default; void NEScale::configure(ITensor *input, ITensor *output, const ScaleKernelInfo &info) { - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_ERROR_THROW_ON(NEScale::validate(input->info(), output->info(), info)); + ARM_COMPUTE_LOG_PARAMS(input, output, info); - _use_padding = info.use_padding; - _align_corners = info.interpolation_policy == 
InterpolationPolicy::BILINEAR - && info.sampling_policy == SamplingPolicy::TOP_LEFT - && info.align_corners; + _impl->src = input; + _impl->dst = output; + _impl->op = std::make_unique<cpu::CpuScale>(); + _impl->op->configure(input->info(), output->info(), info); + // Configure for size of allocation of internal tensors // Get data layout and width/height indices - const DataLayout data_layout = input->info()->data_layout(); - const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); - - // Get the tensor shape - const TensorShape shape(output->info()->dimension(idx_width), output->info()->dimension(idx_height)); + const DataLayout data_layout = + info.data_layout == DataLayout::UNKNOWN ? input->info()->data_layout() : info.data_layout; + const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); // Compute the ratio between source width/height and destination width/height - const auto wr = arm_compute::calculate_resize_ratio(input->info()->dimension(idx_width), output->info()->dimension(idx_width), _align_corners); - const auto hr = arm_compute::calculate_resize_ratio(input->info()->dimension(idx_height), output->info()->dimension(idx_height), _align_corners); - - // Get the element size of the input image - const size_t input_element_size = input->info()->element_size(); + const bool is_align_corners_used = + info.align_corners && arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(info.sampling_policy); + const auto wr = arm_compute::scale_utils::calculate_resize_ratio( + input->info()->dimension(idx_width), output->info()->dimension(idx_width), is_align_corners_used); + const auto hr = arm_compute::scale_utils::calculate_resize_ratio( + input->info()->dimension(idx_height), output->info()->dimension(idx_height), is_align_corners_used); // Area interpolation behaves as Nearest Neighbour in case of up-sampling - const auto policy_to_use = (info.interpolation_policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f) ? InterpolationPolicy::NEAREST_NEIGHBOR : info.interpolation_policy; - - switch(policy_to_use) - { - case InterpolationPolicy::NEAREST_NEIGHBOR: - { - TensorInfo tensor_info_offsets(shape, Format::S32); - _offsets.allocator()->init(tensor_info_offsets); - - _scale_kernel.configure(input, nullptr, nullptr, &_offsets, output, info); - - // Allocate once the configure methods have been called - _offsets.allocator()->allocate(); - - // Pre-compute offsets for nearest interpolation - precompute_dx_dy_offsets(nullptr, nullptr, &_offsets, wr, hr, input_element_size, info.sampling_policy); - break; - } - case InterpolationPolicy::BILINEAR: - { - TensorInfo tensor_info_offsets(shape, Format::S32); - TensorInfo tensor_info_dxdy(shape, Format::F32); + InterpolationPolicy policy_to_use = + (info.interpolation_policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f) + ? 
InterpolationPolicy::NEAREST_NEIGHBOR + : info.interpolation_policy; - _offsets.allocator()->init(tensor_info_offsets); - _dx.allocator()->init(tensor_info_dxdy); - _dy.allocator()->init(tensor_info_dxdy); + // Get the tensor shape + TensorShape shape(output->info()->dimension(idx_width)); + shape.set(1, output->info()->dimension(idx_height), false); - _scale_kernel.configure(input, &_dx, &_dy, &_offsets, output, info); + bool precompute_indices_weights = arm_compute::scale_utils::is_precomputation_required( + data_layout, input->info()->data_type(), policy_to_use, info.border_mode); - // Allocate once the configure methods have been called - _offsets.allocator()->allocate(); - _dx.allocator()->allocate(); - _dy.allocator()->allocate(); + if (precompute_indices_weights) + { + const TensorInfo tensor_info_dxdy(shape, Format::F32); + const TensorInfo tensor_info_offsets(shape, Format::S32); - // Pre-compute dx, dy and offsets for bilinear interpolation - precompute_dx_dy_offsets(&_dx, &_dy, &_offsets, wr, hr, input_element_size, info.sampling_policy); - break; - } - case InterpolationPolicy::AREA: + _impl->dx.allocator()->init(tensor_info_dxdy); + _impl->dy.allocator()->init(tensor_info_dxdy); + _impl->offsets.allocator()->init(tensor_info_offsets); + switch (policy_to_use) { - _scale_kernel.configure(input, nullptr, nullptr, nullptr, output, info); - break; + case InterpolationPolicy::NEAREST_NEIGHBOR: + { + // Allocate once the configure methods have been called + _impl->offsets.allocator()->allocate(); + break; + } + case InterpolationPolicy::BILINEAR: + { + // Allocate once the configure methods have been called + _impl->dx.allocator()->allocate(); + _impl->dy.allocator()->allocate(); + _impl->offsets.allocator()->allocate(); + break; + } + case InterpolationPolicy::AREA: + { + break; + } + default: + ARM_COMPUTE_ERROR("Unsupported interpolation mode"); } - default: - ARM_COMPUTE_ERROR("Unsupported interpolation mode"); } - if(info.use_padding) + else { - _border_handler.configure(input, _scale_kernel.border_size(), info.border_mode, info.constant_border_value); + if (policy_to_use != InterpolationPolicy::NEAREST_NEIGHBOR && policy_to_use != InterpolationPolicy::BILINEAR && + policy_to_use != InterpolationPolicy::AREA) + { + ARM_COMPUTE_ERROR("Unsupported interpolation mode"); + } } } -void NEScale::configure(ITensor *input, ITensor *output, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, SamplingPolicy sampling_policy, bool use_padding, - bool align_corners) -{ - configure(input, output, ScaleKernelInfo{ policy, border_mode, constant_border_value, sampling_policy, use_padding, align_corners }); -} - Status NEScale::validate(const ITensorInfo *input, const ITensorInfo *output, const ScaleKernelInfo &info) { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_RETURN_ERROR_ON(info.sampling_policy != SamplingPolicy::CENTER && info.sampling_policy != SamplingPolicy::TOP_LEFT); - - ITensorInfo *offsets = nullptr; - ITensorInfo *dx = nullptr; - ITensorInfo *dy = nullptr; - - // Get data layout and width/height indices - const DataLayout data_layout = input->data_layout(); - const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); - - // Get the tensor shape of auxilary buffers - const TensorShape shape(output->dimension(idx_width), output->dimension(idx_height)); - - TensorInfo tensor_info_offsets(shape, 
Format::S32); - TensorInfo tensor_info_dx(shape, Format::F32); - TensorInfo tensor_info_dy(shape, Format::F32); - - switch(info.interpolation_policy) - { - case InterpolationPolicy::NEAREST_NEIGHBOR: - offsets = &tensor_info_offsets; - break; - case InterpolationPolicy::BILINEAR: - offsets = &tensor_info_offsets; - dx = &tensor_info_dx; - dy = &tensor_info_dy; - break; - default: - break; - } - - ARM_COMPUTE_RETURN_ON_ERROR(NEScaleKernel::validate(input->clone().get(), dx, dy, offsets, output->clone().get(), info)); - return Status{}; -} - -Status NEScale::validate(const ITensorInfo *input, const ITensorInfo *output, InterpolationPolicy policy, - BorderMode border_mode, PixelValue constant_border_value, SamplingPolicy sampling_policy, bool use_padding, bool align_corners) -{ - ARM_COMPUTE_RETURN_ON_ERROR(NEScale::validate(input, output, ScaleKernelInfo{ policy, border_mode, constant_border_value, sampling_policy, use_padding, align_corners })); - return Status{}; + return cpu::CpuScale::validate(input, output, info); } void NEScale::run() { - if(_use_padding) - { - NEScheduler::get().schedule(&_border_handler, Window::DimZ); - } - NEScheduler::get().schedule(&_scale_kernel, Window::DimY); + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC, _impl->src); + pack.add_tensor(TensorType::ACL_DST, _impl->dst); + pack.add_tensor(TensorType::ACL_INT_0, &_impl->dx); + pack.add_tensor(TensorType::ACL_INT_1, &_impl->dy); + pack.add_tensor(TensorType::ACL_INT_2, &_impl->offsets); + _impl->op->run(pack); } +} // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEScharr3x3.cpp b/src/runtime/NEON/functions/NEScharr3x3.cpp deleted file mode 100644 index b7a99ff55a..0000000000 --- a/src/runtime/NEON/functions/NEScharr3x3.cpp +++ /dev/null @@ -1,40 +0,0 @@ -/* - * Copyright (c) 2016-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "arm_compute/runtime/NEON/functions/NEScharr3x3.h" - -#include "arm_compute/core/NEON/kernels/NEScharr3x3Kernel.h" -#include "arm_compute/core/PixelValue.h" -#include "support/MemorySupport.h" - -#include <utility> - -using namespace arm_compute; - -void NEScharr3x3::configure(ITensor *input, ITensor *output_x, ITensor *output_y, BorderMode border_mode, uint8_t constant_border_value) -{ - auto k = arm_compute::support::cpp14::make_unique<NEScharr3x3Kernel>(); - k->configure(input, output_x, output_y, border_mode == BorderMode::UNDEFINED); - _kernel = std::move(k); - _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); -} diff --git a/src/runtime/NEON/functions/NESelect.cpp b/src/runtime/NEON/functions/NESelect.cpp index 8587f7f28e..55cad2202b 100644 --- a/src/runtime/NEON/functions/NESelect.cpp +++ b/src/runtime/NEON/functions/NESelect.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 ARM Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,15 +23,18 @@ */ #include "arm_compute/runtime/NEON/functions/NESelect.h" -#include "arm_compute/core/NEON/kernels/NESelectKernel.h" #include "arm_compute/core/Types.h" -#include "support/MemorySupport.h" + +#include "src/common/utils/Log.h" +#include "src/core/NEON/kernels/NESelectKernel.h" namespace arm_compute { void NESelect::configure(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output) { - auto k = arm_compute::support::cpp14::make_unique<NESelectKernel>(); + ARM_COMPUTE_LOG_PARAMS(c, x, y, output); + + auto k = std::make_unique<NESelectKernel>(); k->configure(c, x, y, output); _kernel = std::move(k); } diff --git a/src/runtime/NEON/functions/NESimpleAssemblyFunction.cpp b/src/runtime/NEON/functions/NESimpleAssemblyFunction.cpp deleted file mode 100644 index a4b0dfffaa..0000000000 --- a/src/runtime/NEON/functions/NESimpleAssemblyFunction.cpp +++ /dev/null @@ -1,46 +0,0 @@ -/* - * Copyright (c) 2018 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "arm_compute/runtime/NEON/functions/NESimpleAssemblyFunction.h" - -#include "arm_compute/core/Validate.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" - -using namespace arm_compute; - -NESimpleAssemblyFunction::NESimpleAssemblyFunction() // NOLINT - : _kernel() -{ -} - -void NESimpleAssemblyFunction::run() -{ - NEScheduler::get().schedule(_kernel.get(), Window::DimX); -} - -void NESimpleAssemblyFunction::configure(std::unique_ptr<INEGEMMWrapperKernel> kernel) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(kernel.get()); - _kernel = std::move(kernel); - ARM_COMPUTE_ERROR_ON_WINDOW_DIMENSIONS_GTE(_kernel->window(), 1); -} diff --git a/src/runtime/NEON/functions/NESlice.cpp b/src/runtime/NEON/functions/NESlice.cpp index 5da8896d6f..12d43adc84 100644 --- a/src/runtime/NEON/functions/NESlice.cpp +++ b/src/runtime/NEON/functions/NESlice.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 ARM Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,40 +24,87 @@ #include "arm_compute/runtime/NEON/functions/NESlice.h" #include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/kernels/NEStridedSliceKernel.h" #include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/helpers/tensor_transform.h" +#include "arm_compute/core/Validate.h" -#include "support/MemorySupport.h" +#include "src/common/utils/Log.h" +#include "src/core/NEON/kernels/NEStridedSliceKernel.h" namespace arm_compute { -void NESlice::configure(const ITensor *input, ITensor *output, const Coordinates &starts, const Coordinates &ends) +namespace experimental +{ +void NESlice::configure(const ITensorInfo *input, + ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends) { ARM_COMPUTE_ERROR_ON_NULLPTR(input); + ARM_COMPUTE_LOG_PARAMS(input, output, starts, ends); // Get absolute end coordinates const int32_t slice_end_mask = arm_compute::helpers::tensor_transform::construct_slice_end_mask(ends); - auto k = arm_compute::support::cpp14::make_unique<NEStridedSliceKernel>(); + auto k = std::make_unique<NEStridedSliceKernel>(); k->configure(input, output, starts, ends, BiStrides(), 0, slice_end_mask, 0); _kernel = std::move(k); } -Status NESlice::validate(const ITensorInfo *input, const ITensorInfo *output, const Coordinates &starts, const Coordinates &ends) +Status NESlice::validate(const ITensorInfo *input, + const ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input); // Check start dimensions for being non-negative - ARM_COMPUTE_RETURN_ERROR_ON(std::any_of(starts.cbegin(), starts.cbegin() + starts.num_dimensions(), [](int i) - { - return i < 0; - })); + ARM_COMPUTE_RETURN_ERROR_ON( + std::any_of(starts.cbegin(), starts.cbegin() + starts.num_dimensions(), [](int i) { return i < 0; })); // Get absolute end coordinates const int32_t slice_end_mask = arm_compute::helpers::tensor_transform::construct_slice_end_mask(ends); return NEStridedSliceKernel::validate(input, output, starts, ends, BiStrides(), 0, slice_end_mask, 0); } +} // namespace experimental + +struct NESlice::Impl +{ + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr<experimental::NESlice> op{nullptr}; +}; + +NESlice::NESlice() : _impl(std::make_unique<Impl>()) +{ +} +NESlice::NESlice(NESlice &&) = default; +NESlice &NESlice::operator=(NESlice &&) = default; +NESlice::~NESlice() = default; + +Status NESlice::validate(const ITensorInfo *input, + const ITensorInfo *output, 
+ const Coordinates &starts, + const Coordinates &ends) +{ + return experimental::NESlice::validate(input, output, starts, ends); +} + +void NESlice::configure(const ITensor *input, ITensor *output, const Coordinates &starts, const Coordinates &ends) +{ + _impl->src = input; + _impl->dst = output; + _impl->op = std::make_unique<experimental::NESlice>(); + _impl->op->configure(input->info(), output->info(), starts, ends); +} + +void NESlice::run() +{ + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC, _impl->src); + pack.add_tensor(TensorType::ACL_DST, _impl->dst); + _impl->op->run(pack); +} + } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NESobel3x3.cpp b/src/runtime/NEON/functions/NESobel3x3.cpp deleted file mode 100644 index ca80ccd82e..0000000000 --- a/src/runtime/NEON/functions/NESobel3x3.cpp +++ /dev/null @@ -1,40 +0,0 @@ -/* - * Copyright (c) 2016-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/NEON/functions/NESobel3x3.h" - -#include "arm_compute/core/NEON/kernels/NESobel3x3Kernel.h" -#include "arm_compute/core/PixelValue.h" -#include "support/MemorySupport.h" - -#include <utility> - -using namespace arm_compute; - -void NESobel3x3::configure(ITensor *input, ITensor *output_x, ITensor *output_y, BorderMode border_mode, uint8_t constant_border_value) -{ - auto k = arm_compute::support::cpp14::make_unique<NESobel3x3Kernel>(); - k->configure(input, output_x, output_y, border_mode == BorderMode::UNDEFINED); - _kernel = std::move(k); - _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); -} diff --git a/src/runtime/NEON/functions/NESobel5x5.cpp b/src/runtime/NEON/functions/NESobel5x5.cpp deleted file mode 100644 index 2ddfee5028..0000000000 --- a/src/runtime/NEON/functions/NESobel5x5.cpp +++ /dev/null @@ -1,88 +0,0 @@ -/* - * Copyright (c) 2016-2019 ARM Limited. 
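// NESlice above is now split in two: experimental::NESlice works on ITensorInfo and owns
// the NEStridedSliceKernel (strides fixed to 1, open ends encoded through
// construct_slice_end_mask(), negative starts rejected), while the public NESlice keeps
// src/dst in an Impl and replays them through an ITensorPack in run(). Usage sketch;
// shapes and coordinates are illustrative assumptions:
// @code
// Tensor src, dst;
// src.allocator()->init(TensorInfo(TensorShape(32U, 32U), 1, DataType::F32));
// dst.allocator()->init(TensorInfo(TensorShape(16U, 16U), 1, DataType::F32));
// NESlice slice;
// slice.configure(&src, &dst, Coordinates(8, 8), Coordinates(24, 24)); // [8,24) on both dims
// src.allocator()->allocate();
// dst.allocator()->allocate();
// slice.run();
// @endcode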
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/NEON/functions/NESobel5x5.h" - -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/PixelValue.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" -#include "arm_compute/runtime/TensorAllocator.h" - -using namespace arm_compute; - -NESobel5x5::NESobel5x5(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _sobel_hor(), _sobel_vert(), _tmp_x(), _tmp_y(), _border_handler() -{ -} - -void NESobel5x5::configure(ITensor *input, ITensor *output_x, ITensor *output_y, BorderMode border_mode, uint8_t constant_border_value) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); - - const bool run_sobel_x = output_x != nullptr; - const bool run_sobel_y = output_y != nullptr; - - TensorInfo tensor_info(input->info()->tensor_shape(), Format::S16); - - if(run_sobel_x && run_sobel_y) - { - _tmp_x.allocator()->init(tensor_info); - _tmp_y.allocator()->init(tensor_info); - _memory_group.manage(&_tmp_x); - _memory_group.manage(&_tmp_y); - _sobel_hor.configure(input, &_tmp_x, &_tmp_y, border_mode == BorderMode::UNDEFINED); - _sobel_vert.configure(&_tmp_x, &_tmp_y, output_x, output_y, border_mode == BorderMode::UNDEFINED); - _tmp_x.allocator()->allocate(); - _tmp_y.allocator()->allocate(); - } - else if(run_sobel_x) - { - _tmp_x.allocator()->init(tensor_info); - _memory_group.manage(&_tmp_x); - _sobel_hor.configure(input, &_tmp_x, nullptr, border_mode == BorderMode::UNDEFINED); - _sobel_vert.configure(&_tmp_x, nullptr, output_x, nullptr, border_mode == BorderMode::UNDEFINED); - _tmp_x.allocator()->allocate(); - } - else if(run_sobel_y) - { - _tmp_y.allocator()->init(tensor_info); - _memory_group.manage(&_tmp_y); - _sobel_hor.configure(input, nullptr, &_tmp_y, border_mode == BorderMode::UNDEFINED); - _sobel_vert.configure(nullptr, &_tmp_y, nullptr, output_y, border_mode == BorderMode::UNDEFINED); - _tmp_y.allocator()->allocate(); - } - - _border_handler.configure(input, _sobel_hor.border_size(), border_mode, PixelValue(constant_border_value)); -} - -void NESobel5x5::run() -{ - NEScheduler::get().schedule(&_border_handler, Window::DimZ); - - MemoryGroupResourceScope scope_mg(_memory_group); - - NEScheduler::get().schedule(&_sobel_hor, Window::DimY); - NEScheduler::get().schedule(&_sobel_vert, Window::DimY); -} 
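Note on the deletions above and below: NESobel5x5 (above) and NESobel7x7 (below) shared the same structure. A horizontal pass writes intermediate tensors (S16 for the 5x5 filter, S32 for 7x7) that a vertical pass then consumes, and the intermediates' backing memory is owned by a MemoryGroup that is acquired only while run() executes. A minimal sketch of that lifetime pattern follows, with a made-up TwoPassFunction standing in for the deleted classes and the actual kernel configuration elided:

// Sketch only: "TwoPassFunction" is a hypothetical stand-in for the deleted
// Sobel functions; the horizontal/vertical kernel calls are elided.
#include <memory>
#include <utility>

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/IMemoryManager.h"
#include "arm_compute/runtime/MemoryGroup.h"
#include "arm_compute/runtime/Tensor.h"

namespace example
{
using namespace arm_compute;

class TwoPassFunction
{
public:
    explicit TwoPassFunction(std::shared_ptr<IMemoryManager> memory_manager)
        : _memory_group(std::move(memory_manager))
    {
    }

    void configure(const TensorInfo &intermediate_info)
    {
        _tmp.allocator()->init(intermediate_info); // Describe the temporary
        _memory_group.manage(&_tmp);               // Hand its lifetime to the group
        // ... configure the two kernel passes against _tmp here ...
        _tmp.allocator()->allocate();              // Finalize; no memory bound yet
    }

    void run()
    {
        // RAII guard: the group's memory is acquired on entry and released on
        // exit, so the temporary only occupies memory while run() executes.
        MemoryGroupResourceScope scope_mg(_memory_group);
        // ... schedule the horizontal then the vertical pass here ...
    }

private:
    MemoryGroup _memory_group;
    Tensor      _tmp{};
};
} // namespace example

The same MemoryGroupResourceScope idiom survives the migration: the operator-based functions later in this diff (NESoftmaxLayerGeneric, NEWinogradConvolutionLayer) use it to guard their workspace tensors in run().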
diff --git a/src/runtime/NEON/functions/NESobel7x7.cpp b/src/runtime/NEON/functions/NESobel7x7.cpp deleted file mode 100644 index b47a37aedb..0000000000 --- a/src/runtime/NEON/functions/NESobel7x7.cpp +++ /dev/null @@ -1,88 +0,0 @@ -/* - * Copyright (c) 2016-2019 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/NEON/functions/NESobel7x7.h" - -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/PixelValue.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" -#include "arm_compute/runtime/TensorAllocator.h" - -using namespace arm_compute; - -NESobel7x7::NESobel7x7(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _sobel_hor(), _sobel_vert(), _tmp_x(), _tmp_y(), _border_handler() -{ -} - -void NESobel7x7::configure(ITensor *input, ITensor *output_x, ITensor *output_y, BorderMode border_mode, uint8_t constant_border_value) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); - - const bool run_sobel_x = output_x != nullptr; - const bool run_sobel_y = output_y != nullptr; - - TensorInfo tensor_info(input->info()->tensor_shape(), Format::S32); - - if(run_sobel_x && run_sobel_y) - { - _tmp_x.allocator()->init(tensor_info); - _tmp_y.allocator()->init(tensor_info); - _memory_group.manage(&_tmp_x); - _memory_group.manage(&_tmp_y); - _sobel_hor.configure(input, &_tmp_x, &_tmp_y, border_mode == BorderMode::UNDEFINED); - _sobel_vert.configure(&_tmp_x, &_tmp_y, output_x, output_y, border_mode == BorderMode::UNDEFINED); - _tmp_x.allocator()->allocate(); - _tmp_y.allocator()->allocate(); - } - else if(run_sobel_x) - { - _tmp_x.allocator()->init(tensor_info); - _memory_group.manage(&_tmp_x); - _sobel_hor.configure(input, &_tmp_x, nullptr, border_mode == BorderMode::UNDEFINED); - _sobel_vert.configure(&_tmp_x, nullptr, output_x, nullptr, border_mode == BorderMode::UNDEFINED); - _tmp_x.allocator()->allocate(); - } - else if(run_sobel_y) - { - _tmp_y.allocator()->init(tensor_info); - _memory_group.manage(&_tmp_y); - _sobel_hor.configure(input, nullptr, &_tmp_y, border_mode == BorderMode::UNDEFINED); - _sobel_vert.configure(nullptr, &_tmp_y, nullptr, output_y, border_mode == BorderMode::UNDEFINED); - _tmp_y.allocator()->allocate(); - } - - _border_handler.configure(input, _sobel_hor.border_size(), border_mode, 
PixelValue(constant_border_value)); -} - -void NESobel7x7::run() -{ - NEScheduler::get().schedule(&_border_handler, Window::DimZ); - - MemoryGroupResourceScope scope_mg(_memory_group); - - NEScheduler::get().schedule(&_sobel_hor, Window::DimY); - NEScheduler::get().schedule(&_sobel_vert, Window::DimY); -} diff --git a/src/runtime/NEON/functions/NESoftmaxLayer.cpp b/src/runtime/NEON/functions/NESoftmaxLayer.cpp index 57d75af779..be588c5b52 100644 --- a/src/runtime/NEON/functions/NESoftmaxLayer.cpp +++ b/src/runtime/NEON/functions/NESoftmaxLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 ARM Limited. + * Copyright (c) 2017-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,192 +23,74 @@ */ #include "arm_compute/runtime/NEON/functions/NESoftmaxLayer.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" -#include "utils/TypePrinter.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/MemoryGroup.h" +#include "arm_compute/runtime/Tensor.h" -#include <cfloat> +#include "src/core/helpers/MemoryHelpers.h" +#include "src/core/helpers/SoftmaxHelpers.h" +#include "src/cpu/operators/CpuSoftmax.h" namespace arm_compute { template <bool IS_LOG> -NESoftmaxLayerGeneric<IS_LOG>::NESoftmaxLayerGeneric(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _max_kernel(), _softmax_kernel(), _flat_or_reshape_kernel_ptr(nullptr), _fill_border_kernel(), _reshape_kernel(), _max(), _tmp(), _input_flattened(), - _output_flattened(), _needs_flattening(false) +struct NESoftmaxLayerGeneric<IS_LOG>::Impl { -} + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr<cpu::CpuSoftmaxGeneric> op{nullptr}; + MemoryGroup memory_group{}; + ITensorPack run_pack{}; + WorkspaceData<Tensor> workspace_tensors{}; +}; template <bool IS_LOG> -void NESoftmaxLayerGeneric<IS_LOG>::configure_reshape_input_kernel(const ITensor *input, const ITensor *output, int32_t axis) +NESoftmaxLayerGeneric<IS_LOG>::NESoftmaxLayerGeneric(std::shared_ptr<IMemoryManager> memory_manager) + : _impl(std::make_unique<Impl>()) { - // Flatten the input - const TensorShape shape_flatten = misc::shape_calculator::compute_softmax_shape(input->info(), axis); - - // Initialize the flat input - _input_flattened.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_flatten)); - - // If we need to flatten the input, we can use NEFlattenKernel or NEReshapeKernel - // If flattening on the third axes, we use NEFlattenKernel. - // In all other cases we have to use NEReshapeKernel - if(axis != 3) - { - auto reshape_kernel_ptr = support::cpp14::make_unique<NEReshapeLayerKernel>(); - reshape_kernel_ptr->configure(input, &_input_flattened); - _flat_or_reshape_kernel_ptr = std::move(reshape_kernel_ptr); - } - else - { - auto flatten_kernel_ptr = support::cpp14::make_unique<NEFlattenLayerKernel>(); - flatten_kernel_ptr->configure(input, &_input_flattened); - _flat_or_reshape_kernel_ptr = std::move(flatten_kernel_ptr); - } - - // We need to init the output tensor here. 
Indeed, the reshape kernel expects - // both tensors to be already initialized - auto_init_if_empty(*output->info(), *input->info()->clone()); + _impl->memory_group = MemoryGroup(std::move(memory_manager)); } template <bool IS_LOG> +NESoftmaxLayerGeneric<IS_LOG>::NESoftmaxLayerGeneric(NESoftmaxLayerGeneric &&) = default; +template <bool IS_LOG> +NESoftmaxLayerGeneric<IS_LOG> &NESoftmaxLayerGeneric<IS_LOG>::operator=(NESoftmaxLayerGeneric &&) = default; +template <bool IS_LOG> +NESoftmaxLayerGeneric<IS_LOG>::~NESoftmaxLayerGeneric() = default; + +template <bool IS_LOG> void NESoftmaxLayerGeneric<IS_LOG>::configure(ITensor *input, ITensor *output, float beta, int32_t axis) { - // Perform validation step ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_ERROR_THROW_ON(NESoftmaxLayerGeneric::validate(input->info(), output->info(), beta, axis)); - - // Handle negative axis, negative index is used to specify axis from the end (e.g. -1 for the last axis). - axis = wrap_around(axis, static_cast<int32_t>(input->info()->num_dimensions())); - - // We don't need flattening only in the case the input is 2D and axis is 1 - _needs_flattening = axis != 1; - - // If we are dealing with a 4D tensor, we will: - // - Flatten the input, so that we end up with a [width*height*depth] * batches 2D tensor - // - Execute all the pipeline (reduction + normalization) on the flattened tensor - // - Reshape the flattened output into the real output - if(_needs_flattening) - { - // Add to the memory manager _input_flattened - _memory_group.manage(&_input_flattened); - - // Configure _flatten_kernel and _input_flattened - configure_reshape_input_kernel(input, output, axis); - } - - // We want to deal with a 2D input. Either it is the flattened version of the original input (4D case) - // or it is the original input case (2D case) - ITensor *input_2D = (_needs_flattening ? &_input_flattened : input); - - // Create intermediate tensors shapes - const TensorInfo input_info = input_2D->info()->clone()->reset_padding().set_is_resizable(true); - DataType tmp_data_type = is_data_type_quantized_asymmetric(input_2D->info()->data_type()) ? 
DataType::F32 : input_2D->info()->data_type(); - TensorInfo tensor_info_tmp(input_info.clone()->set_data_type(tmp_data_type)); - // Init intermediate tensors - TensorShape max_sum_shape = input_2D->info()->tensor_shape(); - max_sum_shape.set(0, 1); - _max.allocator()->init(input_info.clone()->set_tensor_shape(max_sum_shape)); - _tmp.allocator()->init(tensor_info_tmp); + _impl->src = input; + _impl->dst = output; + _impl->op = std::make_unique<cpu::CpuSoftmaxGeneric>(); + _impl->op->configure(input->info(), output->info(), beta, axis, IS_LOG); - // Manage intermediate buffers - _memory_group.manage(&_max); - _memory_group.manage(&_tmp); - - // Configure Kernels - _max_kernel.configure(input_2D, &_max); - if(_needs_flattening) - { - // Add to the memory manager _output_flattened - _memory_group.manage(&_output_flattened); - - // The normalization kernel stores the result in a flat output tensor - _softmax_kernel.configure(input_2D, &_max, &_output_flattened, beta, &_tmp); - _input_flattened.allocator()->allocate(); - - // Reshape the flat output into the requested (4D) output - _reshape_kernel.configure(&_output_flattened, output); - - // Allocate the intermediate flat tensors - _output_flattened.allocator()->allocate(); - } - else - { - // Softmax 2D case - _fill_border_kernel.configure(input_2D, _max_kernel.border_size(), BorderMode::REPLICATE); - _softmax_kernel.configure(input_2D, &_max, output, beta, &_tmp); - } - - // Allocate intermediate buffers - _max.allocator()->allocate(); - _tmp.allocator()->allocate(); + _impl->run_pack = {{TensorType::ACL_SRC, _impl->src}, {TensorType::ACL_DST, _impl->dst}}; + _impl->workspace_tensors = manage_workspace<Tensor>(_impl->op->workspace(), _impl->memory_group, _impl->run_pack); } template <bool IS_LOG> -Status NESoftmaxLayerGeneric<IS_LOG>::validate(const ITensorInfo *input, const ITensorInfo *output, float beta, int32_t axis) +Status +NESoftmaxLayerGeneric<IS_LOG>::validate(const ITensorInfo *input, const ITensorInfo *output, float beta, int32_t axis) { - // Perform validation step ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() > 4, "Only up to 4 dimensions are supported"); - ARM_COMPUTE_UNUSED(beta); - ARM_COMPUTE_RETURN_ERROR_ON(axis < static_cast<int32_t>(-input->num_dimensions()) || static_cast<int32_t>(input->num_dimensions()) <= axis); - - // Handle negative axis, negative index is used to specify axis from the end (e.g. -1 for the last axis). 
- axis = wrap_around(axis, static_cast<int32_t>(input->num_dimensions())); - - // Create intermediate tensor info - DataType tmp_data_type = input->data_type(); - const TensorInfo tensor_info_tmp(input->clone()->set_data_type(tmp_data_type).set_is_resizable(true)); - - TensorShape max_sum_shape = input->tensor_shape(); - max_sum_shape.set(0, 1); - const TensorInfo tensor_info_max_sum(input->clone()->set_tensor_shape(max_sum_shape).set_data_type(tmp_data_type).set_quantization_info(input->quantization_info()).set_is_resizable(true)); - const TensorInfo dont_care; - - const bool needs_flattening = (axis != 1); - - if(needs_flattening) - { - const TensorShape shape_flatten = misc::shape_calculator::compute_softmax_shape(input, axis); - TensorInfo tensor_info_flat(input->clone()->set_tensor_shape(shape_flatten).set_is_resizable(true)); - - if(axis != 3) - { - ARM_COMPUTE_RETURN_ON_ERROR(NEReshapeLayerKernel::validate(input, &tensor_info_flat)); - } - else - { - ARM_COMPUTE_RETURN_ON_ERROR(NEFlattenLayerKernel::validate(input, &tensor_info_flat)); - } - } - - ARM_COMPUTE_RETURN_ON_ERROR(NELogits1DMaxKernel::validate(input, &tensor_info_max_sum)); - ARM_COMPUTE_RETURN_ON_ERROR(NELogits1DSoftmaxKernel<IS_LOG>::validate(&tensor_info_tmp, &tensor_info_max_sum, output, beta, &dont_care)); - + ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuSoftmaxGeneric::validate(input, output, beta, axis, IS_LOG)); return Status{}; } template <bool IS_LOG> -void NESoftmaxLayerGeneric<IS_LOG>::run() +void NESoftmaxLayerGeneric<IS_LOG>::run() { - MemoryGroupResourceScope scope_mg(_memory_group); - - if(_needs_flattening) - { - NEScheduler::get().schedule(_flat_or_reshape_kernel_ptr.get(), Window::DimY); - } - - NEScheduler::get().schedule(&_fill_border_kernel, Window::DimY); - NEScheduler::get().schedule(&_max_kernel, Window::DimY); - NEScheduler::get().schedule(&_softmax_kernel, Window::DimY); - - if(_needs_flattening) - { - NEScheduler::get().schedule(&_reshape_kernel, Window::DimY); - } + // Acquire all the temporaries + MemoryGroupResourceScope scope_mg(_impl->memory_group); + ARM_COMPUTE_ERROR_ON_NULLPTR(_impl->src, _impl->dst); + _impl->op->run(_impl->run_pack); } template class NESoftmaxLayerGeneric<false>; template class NESoftmaxLayerGeneric<true>; -} // namespace arm_compute
\ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/NEON/functions/NESpaceToBatchLayer.cpp b/src/runtime/NEON/functions/NESpaceToBatchLayer.cpp index 205bc910a5..556ebdd800 100644 --- a/src/runtime/NEON/functions/NESpaceToBatchLayer.cpp +++ b/src/runtime/NEON/functions/NESpaceToBatchLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 ARM Limited. + * Copyright (c) 2019-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -28,50 +28,76 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/NEON/functions/NEFill.h" #include "arm_compute/runtime/NEON/NEScheduler.h" +#include "src/common/utils/Log.h" +#include "src/core/NEON/kernels/NESpaceToBatchLayerKernel.h" + namespace arm_compute { -NESpaceToBatchLayer::NESpaceToBatchLayer() - : _space_to_batch_kernel(), _memset_kernel(), _has_padding(false) +NESpaceToBatchLayer::~NESpaceToBatchLayer() = default; + +NESpaceToBatchLayer::NESpaceToBatchLayer() : _space_to_batch_kernel(), _fill_f(), _has_padding(false) { } -void NESpaceToBatchLayer::configure(const ITensor *input, const ITensor *block_shape, const ITensor *paddings, ITensor *output) +void NESpaceToBatchLayer::configure(const ITensor *input, + const ITensor *block_shape, + const ITensor *paddings, + ITensor *output) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, block_shape, paddings, output); + ARM_COMPUTE_LOG_PARAMS(input, block_shape, paddings, output); - if(input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size()) + if (input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size()) { _has_padding = true; - _memset_kernel.configure(output, PixelValue(0, input->info()->data_type(), input->info()->quantization_info())); + _fill_f = std::make_unique<NEFill>(); + _fill_f->configure(output, PixelValue(0, input->info()->data_type(), input->info()->quantization_info())); } - _space_to_batch_kernel.configure(input, block_shape, paddings, output); + _space_to_batch_kernel = std::make_unique<NESpaceToBatchLayerKernel>(); + _space_to_batch_kernel->configure(input, block_shape, paddings, output); } -void NESpaceToBatchLayer::configure(const ITensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, ITensor *output) +void NESpaceToBatchLayer::configure(const ITensor *input, + const int block_shape_x, + const int block_shape_y, + const Size2D &padding_left, + const Size2D &padding_right, + ITensor *output) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - if(input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size()) + if (input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size()) { _has_padding = true; - _memset_kernel.configure(output, PixelValue(0, input->info()->data_type(), input->info()->quantization_info())); + _fill_f = std::make_unique<NEFill>(); + _fill_f->configure(output, PixelValue(0, input->info()->data_type(), input->info()->quantization_info())); } - _space_to_batch_kernel.configure(input, block_shape_x, block_shape_y, padding_left, padding_right, output); + _space_to_batch_kernel = std::make_unique<NESpaceToBatchLayerKernel>(); + _space_to_batch_kernel->configure(input, block_shape_x, block_shape_y, padding_left, padding_right, output); } -Status NESpaceToBatchLayer::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *paddings, const ITensorInfo *output) 
+Status NESpaceToBatchLayer::validate(const ITensorInfo *input, + const ITensorInfo *block_shape, + const ITensorInfo *paddings, + const ITensorInfo *output) { ARM_COMPUTE_RETURN_ON_ERROR(NESpaceToBatchLayerKernel::validate(input, block_shape, paddings, output)); return Status{}; } -Status NESpaceToBatchLayer::validate(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, +Status NESpaceToBatchLayer::validate(const ITensorInfo *input, + const int block_shape_x, + const int block_shape_y, + const Size2D &padding_left, + const Size2D &padding_right, const ITensorInfo *output) { - ARM_COMPUTE_RETURN_ON_ERROR(NESpaceToBatchLayerKernel::validate(input, block_shape_x, block_shape_y, padding_left, padding_right, output)); + ARM_COMPUTE_RETURN_ON_ERROR( + NESpaceToBatchLayerKernel::validate(input, block_shape_x, block_shape_y, padding_left, padding_right, output)); return Status{}; } @@ -79,10 +105,10 @@ Status NESpaceToBatchLayer::validate(const ITensorInfo *input, const int block_s void NESpaceToBatchLayer::run() { // Zero out output only if we have paddings - if(_has_padding) + if (_has_padding) { - NEScheduler::get().schedule(&_memset_kernel, Window::DimY); + _fill_f->run(); } - NEScheduler::get().schedule(&_space_to_batch_kernel, Window::DimY); + NEScheduler::get().schedule(_space_to_batch_kernel.get(), Window::DimY); } } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NESpaceToDepthLayer.cpp b/src/runtime/NEON/functions/NESpaceToDepthLayer.cpp index 18d82918c7..846b619429 100644 --- a/src/runtime/NEON/functions/NESpaceToDepthLayer.cpp +++ b/src/runtime/NEON/functions/NESpaceToDepthLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 ARM Limited. + * Copyright (c) 2019-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -30,17 +30,24 @@ #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/NEON/NEScheduler.h" +#include "src/common/utils/Log.h" +#include "src/core/NEON/kernels/NESpaceToDepthLayerKernel.h" + namespace arm_compute { -NESpaceToDepthLayer::NESpaceToDepthLayer() - : _space_to_depth_kernel() +NESpaceToDepthLayer::~NESpaceToDepthLayer() = default; + +NESpaceToDepthLayer::NESpaceToDepthLayer() : _space_to_depth_kernel() { } void NESpaceToDepthLayer::configure(const ITensor *input, ITensor *output, int32_t block_shape) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - _space_to_depth_kernel.configure(input, output, block_shape); + ARM_COMPUTE_LOG_PARAMS(input, output, block_shape); + + _space_to_depth_kernel = std::make_unique<NESpaceToDepthLayerKernel>(); + _space_to_depth_kernel->configure(input, output, block_shape); } Status NESpaceToDepthLayer::validate(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape) @@ -51,6 +58,6 @@ Status NESpaceToDepthLayer::validate(const ITensorInfo *input, const ITensorInfo void NESpaceToDepthLayer::run() { - NEScheduler::get().schedule(&_space_to_depth_kernel, Window::DimY); + NEScheduler::get().schedule(_space_to_depth_kernel.get(), Window::DimY); } } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NESplit.cpp b/src/runtime/NEON/functions/NESplit.cpp index 8131e47e3f..53b09e9ae5 100644 --- a/src/runtime/NEON/functions/NESplit.cpp +++ b/src/runtime/NEON/functions/NESplit.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 ARM Limited. + * Copyright (c) 2018-2020 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -34,7 +34,7 @@ namespace arm_compute { void NESplit::run() { - for(unsigned i = 0; i < _num_outputs; ++i) + for (unsigned i = 0; i < _num_outputs; ++i) { _slice_functions[i].run(); } diff --git a/src/runtime/NEON/functions/NEStackLayer.cpp b/src/runtime/NEON/functions/NEStackLayer.cpp index 351497c96e..2f88ffca2a 100644 --- a/src/runtime/NEON/functions/NEStackLayer.cpp +++ b/src/runtime/NEON/functions/NEStackLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 ARM Limited. + * Copyright (c) 2018-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -31,27 +31,26 @@ #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/NEON/NEScheduler.h" +#include "src/common/utils/Log.h" +#include "src/core/NEON/kernels/NEStackLayerKernel.h" + namespace arm_compute { +NEStackLayer::~NEStackLayer() = default; + NEStackLayer::NEStackLayer() // NOLINT - : _input(), - _stack_kernels(), - _num_inputs(0) + : _stack_kernel(std::make_unique<NEStackLayerKernel>()), _is_prepared(false) { } void NEStackLayer::configure(const std::vector<ITensor *> &input, int axis, ITensor *output) { - _num_inputs = input.size(); - _stack_kernels.resize(_num_inputs); + ARM_COMPUTE_LOG_PARAMS(input, axis, output); // Wrap around negative values const unsigned int axis_u = wrap_around(axis, static_cast<int>(input[0]->info()->num_dimensions() + 1)); - for(unsigned int i = 0; i < _num_inputs; i++) - { - _stack_kernels[i].configure(input[i], axis_u, i, _num_inputs, output); - } + _stack_kernel->configure(input, axis_u, output); } Status NEStackLayer::validate(const std::vector<ITensorInfo *> &input, int axis, const ITensorInfo *output) @@ -63,24 +62,20 @@ Status NEStackLayer::validate(const std::vector<ITensorInfo *> &input, int axis, const size_t rank = input[0]->num_dimensions(); const unsigned int axis_u = wrap_around(axis, static_cast<int>(rank + 1)); - const unsigned int num_inputs = input.size(); - - for(unsigned int i = 0; i < num_inputs; i++) - { - // All the tensors must have the same rank - ARM_COMPUTE_RETURN_ERROR_ON(input[i]->num_dimensions() != rank); - // Validate Kernel - ARM_COMPUTE_RETURN_ON_ERROR(NEStackLayerKernel::validate(input[i], axis_u, i, num_inputs, output)); - } + // Validate Kernel + ARM_COMPUTE_RETURN_ON_ERROR(NEStackLayerKernel::validate(input, axis_u, output)); return Status{}; } void NEStackLayer::run() { - for(unsigned i = 0; i < _num_inputs; i++) + if (!_is_prepared) { - NEScheduler::get().schedule(&_stack_kernels[i], Window::DimY); + _stack_kernel->prepare(); + _is_prepared = true; } + + NEScheduler::get().schedule(_stack_kernel.get(), _stack_kernel->get_split_dimension()); } } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEStridedSlice.cpp b/src/runtime/NEON/functions/NEStridedSlice.cpp index c9be563e17..6a3ac8be05 100644 --- a/src/runtime/NEON/functions/NEStridedSlice.cpp +++ b/src/runtime/NEON/functions/NEStridedSlice.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 ARM Limited. + * Copyright (c) 2018-2021 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -23,25 +23,92 @@ */ #include "arm_compute/runtime/NEON/functions/NEStridedSlice.h" -#include "arm_compute/core/NEON/kernels/NEStridedSliceKernel.h" +#include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" -#include "support/MemorySupport.h" + +#include "src/common/utils/Log.h" +#include "src/core/NEON/kernels/NEStridedSliceKernel.h" namespace arm_compute { -void NEStridedSlice::configure(const ITensor *input, ITensor *output, - const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, - int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask) +namespace experimental +{ +void NEStridedSlice::configure(const ITensorInfo *input, + ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + int32_t begin_mask, + int32_t end_mask, + int32_t shrink_axis_mask) { - auto k = arm_compute::support::cpp14::make_unique<NEStridedSliceKernel>(); + ARM_COMPUTE_LOG_PARAMS(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask); + + auto k = std::make_unique<NEStridedSliceKernel>(); k->configure(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask); _kernel = std::move(k); } -Status NEStridedSlice::validate(const ITensorInfo *input, const ITensorInfo *output, - const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, - int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask) +Status NEStridedSlice::validate(const ITensorInfo *input, + const ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + int32_t begin_mask, + int32_t end_mask, + int32_t shrink_axis_mask) { return NEStridedSliceKernel::validate(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask); } +} // namespace experimental + +struct NEStridedSlice::Impl +{ + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr<experimental::NEStridedSlice> op{nullptr}; +}; + +NEStridedSlice::NEStridedSlice() : _impl(std::make_unique<Impl>()) +{ +} +NEStridedSlice::NEStridedSlice(NEStridedSlice &&) = default; +NEStridedSlice &NEStridedSlice::operator=(NEStridedSlice &&) = default; +NEStridedSlice::~NEStridedSlice() = default; + +void NEStridedSlice::configure(const ITensor *input, + ITensor *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + int32_t begin_mask, + int32_t end_mask, + int32_t shrink_axis_mask) +{ + _impl->src = input; + _impl->dst = output; + _impl->op = std::make_unique<experimental::NEStridedSlice>(); + _impl->op->configure(input->info(), output->info(), starts, ends, strides, begin_mask, end_mask, shrink_axis_mask); +} + +void NEStridedSlice::run() +{ + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC, _impl->src); + pack.add_tensor(TensorType::ACL_DST, _impl->dst); + _impl->op->run(pack); +} + +Status NEStridedSlice::validate(const ITensorInfo *input, + const ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + int32_t begin_mask, + int32_t end_mask, + int32_t shrink_axis_mask) +{ + return experimental::NEStridedSlice::validate(input, output, starts, ends, strides, begin_mask, end_mask, + shrink_axis_mask); +} } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NETableLookup.cpp b/src/runtime/NEON/functions/NETableLookup.cpp deleted file mode 100644 index 44cbbc8416..0000000000 --- a/src/runtime/NEON/functions/NETableLookup.cpp +++ 
/dev/null @@ -1,38 +0,0 @@ -/* - * Copyright (c) 2016-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/NEON/functions/NETableLookup.h" - -#include "arm_compute/core/NEON/kernels/NETableLookupKernel.h" -#include "support/MemorySupport.h" - -#include <utility> - -using namespace arm_compute; - -void NETableLookup::configure(const ITensor *input, const ILut *lut, ITensor *output) -{ - auto k = arm_compute::support::cpp14::make_unique<NETableLookupKernel>(); - k->configure(input, lut, output); - _kernel = std::move(k); -} diff --git a/src/runtime/NEON/functions/NEThreshold.cpp b/src/runtime/NEON/functions/NEThreshold.cpp deleted file mode 100644 index f4fd85722c..0000000000 --- a/src/runtime/NEON/functions/NEThreshold.cpp +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Copyright (c) 2016-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "arm_compute/runtime/NEON/functions/NEThreshold.h" - -#include "arm_compute/core/NEON/kernels/NEThresholdKernel.h" -#include "support/MemorySupport.h" - -#include <utility> - -using namespace arm_compute; - -void NEThreshold::configure(const ITensor *input, ITensor *output, uint8_t threshold, uint8_t false_value, uint8_t true_value, ThresholdType type, uint8_t upper) -{ - auto k = arm_compute::support::cpp14::make_unique<NEThresholdKernel>(); - k->configure(input, output, threshold, false_value, true_value, type, upper); - _kernel = std::move(k); -} diff --git a/src/runtime/NEON/functions/NETile.cpp b/src/runtime/NEON/functions/NETile.cpp index 6bf8eaad20..d10b1c8e95 100644 --- a/src/runtime/NEON/functions/NETile.cpp +++ b/src/runtime/NEON/functions/NETile.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 ARM Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,14 +23,16 @@ */ #include "arm_compute/runtime/NEON/functions/NETile.h" -#include "arm_compute/core/NEON/kernels/NETileKernel.h" -#include "support/MemorySupport.h" +#include "src/common/utils/Log.h" +#include "src/core/NEON/kernels/NETileKernel.h" namespace arm_compute { void NETile::configure(const ITensor *input, ITensor *output, const Multiples &multiples) { - auto k = arm_compute::support::cpp14::make_unique<NETileKernel>(); + ARM_COMPUTE_LOG_PARAMS(input, output, multiples); + + auto k = std::make_unique<NETileKernel>(); k->configure(input, output, multiples); _kernel = std::move(k); } diff --git a/src/runtime/NEON/functions/NETranspose.cpp b/src/runtime/NEON/functions/NETranspose.cpp index 21d496300a..0144a85e8c 100644 --- a/src/runtime/NEON/functions/NETranspose.cpp +++ b/src/runtime/NEON/functions/NETranspose.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 ARM Limited. + * Copyright (c) 2017-2021 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -23,22 +23,50 @@ */ #include "arm_compute/runtime/NEON/functions/NETranspose.h" -#include "arm_compute/core/NEON/kernels/NETransposeKernel.h" -#include "support/MemorySupport.h" +#include "arm_compute/core/Validate.h" -#include <utility> +#include "src/common/utils/Log.h" +#include "src/cpu/operators/CpuTranspose.h" namespace arm_compute { +struct NETranspose::Impl +{ + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr<cpu::CpuTranspose> op{nullptr}; +}; + +NETranspose::NETranspose() : _impl(std::make_unique<Impl>()) +{ +} + +NETranspose::~NETranspose() = default; + void NETranspose::configure(const ITensor *input, ITensor *output) { - auto k = arm_compute::support::cpp14::make_unique<NETransposeKernel>(); - k->configure(input, output); - _kernel = std::move(k); + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_LOG_PARAMS(input, output); + + _impl->src = input; + _impl->dst = output; + _impl->op = std::make_unique<cpu::CpuTranspose>(); + _impl->op->configure(input->info(), output->info()); } Status NETranspose::validate(const ITensorInfo *input, const ITensorInfo *output) { - return NETransposeKernel::validate(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuTranspose::validate(input, output)); + return Status{}; } + +void NETranspose::run() +{ + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC, _impl->src); + pack.add_tensor(TensorType::ACL_DST, _impl->dst); + _impl->op->run(pack); +} + } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEUnstack.cpp b/src/runtime/NEON/functions/NEUnstack.cpp index 21f35f8312..2f7ed2bb1f 100644 --- a/src/runtime/NEON/functions/NEUnstack.cpp +++ b/src/runtime/NEON/functions/NEUnstack.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 ARM Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -29,6 +29,8 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/common/utils/Log.h" + namespace arm_compute { namespace @@ -38,13 +40,15 @@ inline unsigned int wrap_axis(int axis, const ITensorInfo *const tensor) return wrap_around(axis, static_cast<int>(tensor->num_dimensions())); } -inline void setup_slice_coordinates_and_mask(Coordinates &slice_start, int32_t &slice_end_mask, const unsigned int input_num_dimensions) +inline void setup_slice_coordinates_and_mask(Coordinates &slice_start, + int32_t &slice_end_mask, + const unsigned int input_num_dimensions) { // Setups up coordinates to slice the input tensor: start coordinates to all 0s and the unstacking axis of both Start/End to slice just one 2d tensor at a time. 
Coordinates slice_end; slice_start.set_num_dimensions(input_num_dimensions); slice_end.set_num_dimensions(input_num_dimensions); - for(size_t k = 0; k < input_num_dimensions; ++k) + for (size_t k = 0; k < input_num_dimensions; ++k) { slice_start.set(k, 0); slice_end.set(k, -1); @@ -54,22 +58,23 @@ inline void setup_slice_coordinates_and_mask(Coordinates &slice_start, int32_t & } // namespace NEUnstack::NEUnstack() // NOLINT - : _num_slices(0), - _strided_slice_vector() + : _num_slices(0), _strided_slice_vector() { } void NEUnstack::configure(const ITensor *input, const std::vector<ITensor *> &output_vector, int axis) { std::vector<ITensorInfo *> outputs_vector_info(output_vector.size()); - std::transform(output_vector.begin(), output_vector.end(), outputs_vector_info.begin(), [](ITensor * t) - { - ARM_COMPUTE_ERROR_ON_NULLPTR(t); - return t->info(); - }); + std::transform(output_vector.begin(), output_vector.end(), outputs_vector_info.begin(), + [](ITensor *t) + { + ARM_COMPUTE_ERROR_ON_NULLPTR(t); + return t->info(); + }); ARM_COMPUTE_ERROR_ON_NULLPTR(input); ARM_COMPUTE_ERROR_THROW_ON(NEUnstack::validate(input->info(), outputs_vector_info, axis)); + ARM_COMPUTE_LOG_PARAMS(input, output_vector, axis); // Wrap around negative values const unsigned int axis_u = wrap_axis(axis, input->info()); @@ -79,11 +84,12 @@ void NEUnstack::configure(const ITensor *input, const std::vector<ITensor *> &ou Coordinates slice_start; int32_t slice_end_mask; setup_slice_coordinates_and_mask(slice_start, slice_end_mask, input->info()->tensor_shape().num_dimensions()); - for(unsigned int slice = 0; slice < _num_slices; ++slice) + for (unsigned int slice = 0; slice < _num_slices; ++slice) { // Adjusts start and end coordinates to take a 2D slice at a time slice_start.set(axis_u, slice); - _strided_slice_vector[slice].configure(input, output_vector[slice], slice_start, Coordinates(), BiStrides(), 0, slice_end_mask, (1 << axis_u)); + _strided_slice_vector[slice].configure(input, output_vector[slice], slice_start, Coordinates(), BiStrides(), 0, + slice_end_mask, (1 << axis_u)); } } @@ -100,18 +106,20 @@ Status NEUnstack::validate(const ITensorInfo *input, const std::vector<ITensorIn Coordinates slice_start; int32_t slice_end_mask; - for(size_t k = 0; k < num_slices; ++k) + for (size_t k = 0; k < num_slices; ++k) { slice_start.set(wrap_axis(axis, input), k); setup_slice_coordinates_and_mask(slice_start, slice_end_mask, input->tensor_shape().num_dimensions()); - ARM_COMPUTE_RETURN_ON_ERROR(NEStridedSlice::validate(input, output_vector[k], slice_start, Coordinates(), BiStrides(), 0, slice_end_mask, (1 << wrap_axis(axis, input)))); + ARM_COMPUTE_RETURN_ON_ERROR(NEStridedSlice::validate(input, output_vector[k], slice_start, Coordinates(), + BiStrides(), 0, slice_end_mask, + (1 << wrap_axis(axis, input)))); } return Status{}; } void NEUnstack::run() { - for(unsigned i = 0; i < _num_slices; ++i) + for (unsigned i = 0; i < _num_slices; ++i) { _strided_slice_vector[i].run(); } diff --git a/src/runtime/NEON/functions/NEWarpAffine.cpp b/src/runtime/NEON/functions/NEWarpAffine.cpp deleted file mode 100644 index 469ca65bd9..0000000000 --- a/src/runtime/NEON/functions/NEWarpAffine.cpp +++ /dev/null @@ -1,62 +0,0 @@ -/* - * Copyright (c) 2016-2020 ARM Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/NEON/functions/NEWarpAffine.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/NEON/kernels/NEWarpKernel.h" -#include "arm_compute/core/Validate.h" -#include "support/MemorySupport.h" - -#include <utility> - -using namespace arm_compute; - -void NEWarpAffine::configure(ITensor *input, ITensor *output, const std::array<float, 9> &matrix, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8); - - switch(policy) - { - case InterpolationPolicy::NEAREST_NEIGHBOR: - { - auto k = arm_compute::support::cpp14::make_unique<NEWarpAffineKernel<InterpolationPolicy::NEAREST_NEIGHBOR>>(); - k->configure(input, output, matrix, border_mode, constant_border_value); - _kernel = std::move(k); - break; - } - case InterpolationPolicy::BILINEAR: - { - auto k = arm_compute::support::cpp14::make_unique<NEWarpAffineKernel<InterpolationPolicy::BILINEAR>>(); - k->configure(input, output, matrix, border_mode, constant_border_value); - _kernel = std::move(k); - break; - } - case InterpolationPolicy::AREA: - default: - ARM_COMPUTE_ERROR("Interpolation type not supported"); - } - - _border_handler.configure(input, _kernel->border_size(), border_mode, constant_border_value); -} diff --git a/src/runtime/NEON/functions/NEWarpPerspective.cpp b/src/runtime/NEON/functions/NEWarpPerspective.cpp deleted file mode 100644 index ac5edca125..0000000000 --- a/src/runtime/NEON/functions/NEWarpPerspective.cpp +++ /dev/null @@ -1,62 +0,0 @@ -/* - * Copyright (c) 2016-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/NEON/functions/NEWarpPerspective.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/NEON/kernels/NEWarpKernel.h" -#include "arm_compute/core/Validate.h" -#include "support/MemorySupport.h" - -#include <utility> - -using namespace arm_compute; - -void NEWarpPerspective::configure(ITensor *input, ITensor *output, const std::array<float, 9> &matrix, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8); - - switch(policy) - { - case InterpolationPolicy::NEAREST_NEIGHBOR: - { - auto k = arm_compute::support::cpp14::make_unique<NEWarpPerspectiveKernel<InterpolationPolicy::NEAREST_NEIGHBOR>>(); - k->configure(input, output, matrix, border_mode, constant_border_value); - _kernel = std::move(k); - break; - } - case InterpolationPolicy::BILINEAR: - { - auto k = arm_compute::support::cpp14::make_unique<NEWarpPerspectiveKernel<InterpolationPolicy::BILINEAR>>(); - k->configure(input, output, matrix, border_mode, constant_border_value); - _kernel = std::move(k); - break; - } - case InterpolationPolicy::AREA: - default: - ARM_COMPUTE_ERROR("Interpolation type not supported"); - } - - _border_handler.configure(input, _kernel->border_size(), border_mode, constant_border_value); -} diff --git a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp index d567a18709..7334be8456 100644 --- a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp +++ b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 ARM Limited. + * Copyright (c) 2017-2022, 2024 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -23,749 +23,94 @@ */ #include "arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h" -#include "arm_compute/core/CPP/Validate.h" #include "arm_compute/core/Error.h" -#include "arm_compute/core/NEON/kernels/NEWinogradConvolutionLayerKernel.h" +#include "arm_compute/core/ITensorPack.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" -#include "arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h" -#include "support/MemorySupport.h" +#include "arm_compute/core/Validate.h" -#include "arm_compute/core/NEON/kernels/convolution/common/utils.hpp" -#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd.hpp" +#include "src/core/CPP/Validate.h" +#include "src/core/helpers/MemoryHelpers.h" +#include "src/core/NEON/kernels/convolution/common/utils.hpp" +#include "src/cpu/kernels/CpuWinogradConv2dKernel.h" +#include "src/cpu/operators/CpuWinogradConv2d.h" namespace arm_compute { -namespace -{ -inline Status validate_kernel_3x3(const Size2D input_dims, const ITensorInfo *input, const TensorInfo *input0, const TensorInfo *input1, const TensorInfo *batched_mm_output, - const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); - - if(input->data_type() == DataType::F32) - { - if(input_dims.width > 4 && input_dims.height > 4) - { - ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformInputKernel<float, 4, 4, 3, 3>::validate(input, input0, winograd_info))); - ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformWeightsKernel<float, 4, 4, 3, 3>::validate(weights, input1, winograd_info))); - ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformOutputKernel<float, 4, 4, 3, 3>::validate(batched_mm_output, biases, output, winograd_info))); - } - else - { - ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformInputKernel<float, 2, 2, 3, 3>::validate(input, input0, winograd_info))); - ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformWeightsKernel<float, 2, 2, 3, 3>::validate(weights, input1, winograd_info))); - ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformOutputKernel<float, 2, 2, 3, 3>::validate(batched_mm_output, biases, output, winograd_info))); - } - } -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - else if(input->data_type() == DataType::F16) - { - ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformInputKernel<__fp16, 4, 4, 3, 3>::validate(input, input0, winograd_info))); - ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformWeightsKernel<__fp16, 4, 4, 3, 3>::validate(weights, input1, winograd_info))); - ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformOutputKernel<__fp16, 4, 4, 3, 3>::validate(batched_mm_output, biases, output, winograd_info))); - } -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - - if(act_info.enabled()) - { - NEActivationLayer::validate(output, nullptr, act_info); - } - return Status{}; -} - -inline Status validate_kernel_5x5(const ITensorInfo *input, const TensorInfo *input0, const TensorInfo *input1, const TensorInfo *batched_mm_output, - const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info) -{ -
ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformInputKernel<float, 2, 2, 5, 5>::validate(input, input0, winograd_info))); - ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformWeightsKernel<float, 2, 2, 5, 5>::validate(weights, input1, winograd_info))); - ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformOutputKernel<float, 2, 2, 5, 5>::validate(batched_mm_output, biases, output, winograd_info))); - if(act_info.enabled()) - { - NEActivationLayer::validate(output, nullptr, act_info); - } - return Status{}; -} - -inline Status validate_kernel_3x1(const ITensorInfo *input, const TensorInfo *input0, const TensorInfo *input1, const TensorInfo *batched_mm_output, - const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32); - ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformInputKernel<float, 1, 6, 1, 3>::validate(input, input0, winograd_info))); - ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformWeightsKernel<float, 1, 6, 1, 3>::validate(weights, input1, winograd_info))); - ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformOutputKernel<float, 1, 6, 1, 3>::validate(batched_mm_output, biases, output, winograd_info))); - if(act_info.enabled()) - { - NEActivationLayer::validate(output, nullptr, act_info); - } - return Status{}; -} - -inline Status validate_kernel_1x3(const ITensorInfo *input, const TensorInfo *input0, const TensorInfo *input1, const TensorInfo *batched_mm_output, - const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32); - ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformInputKernel<float, 6, 1, 3, 1>::validate(input, input0, winograd_info))); - ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformWeightsKernel<float, 6, 1, 3, 1>::validate(weights, input1, winograd_info))); - ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformOutputKernel<float, 6, 1, 3, 1>::validate(batched_mm_output, biases, output, winograd_info))); - - if(act_info.enabled()) - { - NEActivationLayer::validate(output, nullptr, act_info); - } - return Status{}; -} +using namespace arm_compute::experimental; -inline Status validate_kernel_5x1(const ITensorInfo *input, const TensorInfo *input0, const TensorInfo *input1, const TensorInfo *batched_mm_output, - const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info) +struct NEWinogradConvolutionLayer::Impl { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32); - ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformInputKernel<float, 1, 4, 1, 5>::validate(input, input0, winograd_info))); - ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformWeightsKernel<float, 1, 4, 1, 5>::validate(weights, input1, winograd_info))); - ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformOutputKernel<float, 1, 4, 1, 5>::validate(batched_mm_output, biases, output, winograd_info))); - if(act_info.enabled()) - { - NEActivationLayer::validate(output, nullptr, act_info); - } - return Status{}; -} -inline Status validate_kernel_1x5(const ITensorInfo *input, const TensorInfo *input0, const TensorInfo *input1, const TensorInfo *batched_mm_output, - const ITensorInfo *weights, const 
ITensorInfo *biases, const ITensorInfo *output, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32); - ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformInputKernel<float, 4, 1, 5, 1>::validate(input, input0, winograd_info))); - ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformWeightsKernel<float, 4, 1, 5, 1>::validate(weights, input1, winograd_info))); - ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformOutputKernel<float, 4, 1, 5, 1>::validate(batched_mm_output, biases, output, winograd_info))); - if(act_info.enabled()) - { - NEActivationLayer::validate(output, nullptr, act_info); - } - return Status{}; -} - -inline Status validate_kernel_7x1(const ITensorInfo *input, const TensorInfo *input0, const TensorInfo *input1, const TensorInfo *batched_mm_output, - const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32); - ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformInputKernel<float, 1, 2, 1, 7>::validate(input, input0, winograd_info))); - ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformWeightsKernel<float, 1, 2, 1, 7>::validate(weights, input1, winograd_info))); - ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformOutputKernel<float, 1, 2, 1, 7>::validate(batched_mm_output, biases, output, winograd_info))); - if(act_info.enabled()) - { - NEActivationLayer::validate(output, nullptr, act_info); - } - return Status{}; -} - -inline Status validate_kernel_1x7(const ITensorInfo *input, const TensorInfo *input0, const TensorInfo *input1, const TensorInfo *batched_mm_output, - const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32); - ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformInputKernel<float, 2, 1, 7, 1>::validate(input, input0, winograd_info))); - ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformWeightsKernel<float, 2, 1, 7, 1>::validate(weights, input1, winograd_info))); - ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformOutputKernel<float, 2, 1, 7, 1>::validate(batched_mm_output, biases, output, winograd_info))); - - if(act_info.enabled()) - { - NEActivationLayer::validate(output, nullptr, act_info); - } - return Status{}; -} - -inline Tensor4DShape internal_get_input_shape(const arm_compute::ITensor *input) -{ - const DataLayout data_layout = input->info()->data_layout(); - const int in_width = input->info()->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH)); - const int in_height = input->info()->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT)); - const int in_channels = input->info()->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL)); - const int in_batches = input->info()->dimension(3); - - return Tensor4DShape{ in_batches, in_height, in_width, in_channels }; -} - -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info) -{ - ARM_COMPUTE_UNUSED(output); - ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.stride().first != 1 || 
conv_info.stride().second != 1, "Winograd layer only supports unit strides."); - if(biases != nullptr) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases); - ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1); - } - return INEWinogradLayerTransformWeightsKernel::validate(input, weights); -} - -Size2D winograd_output_tile(const Size2D &input_dims, const Size2D &kernel_dims, DataType data_type) -{ - Size2D output_tile = Size2D{}; - if(kernel_dims == Size2D(3U, 3U)) - { - output_tile = (input_dims.width <= 4 || input_dims.height <= 4) ? Size2D(2U, 2U) : Size2D(4U, 4U); - if(data_type == DataType::F16) - { - output_tile = Size2D(4U, 4U); - } - } - else if(kernel_dims == Size2D(5U, 5U)) - { - output_tile = Size2D(2U, 2U); - } - else if(kernel_dims == Size2D(1U, 3U)) - { - output_tile = Size2D(1U, 6U); - } - else if(kernel_dims == Size2D(3U, 1U)) - { - output_tile = Size2D(6U, 1U); - } - else if(kernel_dims == Size2D(1U, 5U)) - { - output_tile = Size2D(1U, 4U); - } - else if(kernel_dims == Size2D(5U, 1U)) - { - output_tile = Size2D(4U, 1U); - } - else if(kernel_dims == Size2D(7U, 1U)) - { - output_tile = Size2D(2U, 1U); - } - else if(kernel_dims == Size2D(1U, 7U)) - { - output_tile = Size2D(1U, 2U); - } - return output_tile; -} - -bool check_support_fast_math(const Size2D &output_tile, const Size2D &kernel_size, DataType data_type) -{ - // Check if we want to configure a Winograd configuration which requires fast math - using WinogradConfiguration = std::pair<std::pair<int, int>, std::pair<int, int>>; - - const std::vector<WinogradConfiguration> fast_math_winograd_f16 = - { - WinogradConfiguration(std::pair<int, int>(4, 4), std::pair<int, int>(3, 3)) - }; - - const std::vector<WinogradConfiguration> fast_math_winograd_f32 = - { - WinogradConfiguration(std::pair<int, int>(2, 2), std::pair<int, int>(5, 5)), - WinogradConfiguration(std::pair<int, int>(4, 4), std::pair<int, int>(5, 5)) - }; - - auto p = std::make_pair(std::pair<int, int>(output_tile.width, output_tile.height), - std::pair<int, int>(kernel_size.width, kernel_size.height)); - - switch(data_type) - { - case DataType::F16: - return std::find(fast_math_winograd_f16.begin(), fast_math_winograd_f16.end(), p) != fast_math_winograd_f16.end(); - case DataType::F32: - return std::find(fast_math_winograd_f32.begin(), fast_math_winograd_f32.end(), p) != fast_math_winograd_f32.end(); - default: - return false; - } -} - -inline bool fuse_function_supported(const ActivationLayerInfo &act_info) -{ - return act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU || act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU; -} - -arm_gemm::Activation arm_gemm_activation_from_acl_activation(const ActivationLayerInfo &act_info) -{ - switch(act_info.activation()) - { - case ActivationLayerInfo::ActivationFunction::RELU: - { - return arm_gemm::Activation(arm_gemm::Activation::Type::ReLU, act_info.a(), act_info.b()); - } - case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU: - { - return arm_gemm::Activation(arm_gemm::Activation::Type::BoundedReLU, act_info.a(), act_info.b()); - } - default: - { - return arm_gemm::Activation(arm_gemm::Activation::Type::None); - } - } -} -} //namespace + MemoryGroup memory_group{}; + std::unique_ptr<cpu::CpuWinogradConv2d> op{nullptr}; + ITensorPack run_pack{}; + ITensorPack prep_pack{}; + WorkspaceData<Tensor> workspace{}; + experimental::MemoryRequirements aux_mem_req{}; + const ITensor *original_weights{nullptr}; + bool is_prepared{false}; + bool 
is_activationlayer_enabled{false}; + DataLayout data_layout{}; +}; NEWinogradConvolutionLayer::NEWinogradConvolutionLayer(const std::shared_ptr<IMemoryManager> &memory_manager) - : _memory_group(memory_manager), _gemm_function(memory_manager), _transform_input_kernel(nullptr), _transform_output_kernel(nullptr), _transform_weights_kernel(nullptr), _activationlayer_function(), - _permute_input(), _permute_weights(), _permute_output(), _input_transformed(), _output_transformed(), _input_workspace(), _output_workspace(), _kernel_storage(), _input_nhwc(), _output_nhwc(), - _weights_hwio(), _input(), _weights(), _output(), _is_prepared(false), _is_activationlayer_enabled(false) + : _impl(std::make_unique<Impl>()) { + _impl->memory_group = MemoryGroup(memory_manager); } -void NEWinogradConvolutionLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, - bool enable_fast_math) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), conv_info)); - - // Get indices for the width and height - const DataLayout data_layout = input->info()->data_layout(); - const unsigned int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - const unsigned int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); - const unsigned int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); - - const Size2D input_dims = Size2D(input->info()->dimension(width_idx), input->info()->dimension(height_idx)); - const Size2D kernel_size = Size2D(weights->info()->dimension(width_idx), weights->info()->dimension(height_idx)); - const DataType data_type = input->info()->data_type(); - const Size2D output_tile = winograd_output_tile(input_dims, kernel_size, data_type); - - // Check if the Winograd configuration requires fast math - if(!enable_fast_math) - { - ARM_COMPUTE_ERROR_ON_MSG(check_support_fast_math(output_tile, kernel_size, data_type), - "This Winograd configuration requires enable_fast_math=true"); - } - - _weights = weights; - _input = input; - _output = output; - _is_prepared = false; - - int n_gemms = 0; - int N_BLOCK = 0; // Size of block used by GEMM. 
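The output tile chosen by winograd_output_tile() above pins down the rest of the configuration: for an output tile T and kernel K, the transforms operate on inner tiles of (T + K - 1) points per dimension, and the batched matrix multiply runs one GEMM per inner-tile point. A minimal sketch of that bookkeeping, assuming the F32 3x3 large-input case (values derived by hand here, not read from the library):

    // Sketch only: inner-tile and GEMM-count arithmetic implied by the
    // configuration above, for the F(4x4, 3x3) case selected on large inputs.
    constexpr int output_tile = 4;                            // 4x4 output tile
    constexpr int kernel_dim  = 3;                            // 3x3 kernel
    constexpr int inner_tile  = output_tile + kernel_dim - 1; // 6x6 transform domain
    constexpr int n_gemms     = inner_tile * inner_tile;      // 36 batched GEMMs
    static_assert(n_gemms == 36, "F(4x4, 3x3) runs 36 batched GEMMs");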
- - std::unique_ptr<INEWinogradLayerTransformInputKernel> transform_input_kernel; - std::unique_ptr<INEWinogradLayerTransformWeightsKernel> transform_weights_kernel; - std::unique_ptr<INEWinogradLayerTransformOutputKernel> transform_output_kernel; - - if(data_type == DataType::F32) - { - if(kernel_size == Size2D(3, 3)) - { - if(input->info()->dimension(width_idx) > 4 && input->info()->dimension(height_idx) > 4) - { - using config = NEWinogradLayerConfiguration<float, float, 4, 4, 3, 3>; - transform_input_kernel = support::cpp14::make_unique<config::TransformInputKernel>(); - transform_weights_kernel = support::cpp14::make_unique<config::TransformWeightsKernel>(); - transform_output_kernel = support::cpp14::make_unique<config::TransformOutputKernel>(); - n_gemms = config::WinogradBase::N_GEMMS; - N_BLOCK = config::WinogradConv::N_BLOCK; - } - else - { - using config = NEWinogradLayerConfiguration<float, float, 2, 2, 3, 3>; - transform_input_kernel = support::cpp14::make_unique<config::TransformInputKernel>(); - transform_weights_kernel = support::cpp14::make_unique<config::TransformWeightsKernel>(); - transform_output_kernel = support::cpp14::make_unique<config::TransformOutputKernel>(); - n_gemms = config::WinogradBase::N_GEMMS; - N_BLOCK = config::WinogradConv::N_BLOCK; - } - } - else if(kernel_size == Size2D(5, 5)) - { - using config = NEWinogradLayerConfiguration<float, float, 2, 2, 5, 5>; - transform_input_kernel = support::cpp14::make_unique<config::TransformInputKernel>(); - transform_weights_kernel = support::cpp14::make_unique<config::TransformWeightsKernel>(); - transform_output_kernel = support::cpp14::make_unique<config::TransformOutputKernel>(); - n_gemms = config::WinogradBase::N_GEMMS; - N_BLOCK = config::WinogradConv::N_BLOCK; - } - else if(kernel_size == Size2D(1, 3)) - { - using config = NEWinogradLayerConfiguration<float, float, 6, 1, 3, 1>; - transform_input_kernel = support::cpp14::make_unique<config::TransformInputKernel>(); - transform_weights_kernel = support::cpp14::make_unique<config::TransformWeightsKernel>(); - transform_output_kernel = support::cpp14::make_unique<config::TransformOutputKernel>(); - n_gemms = config::WinogradBase::N_GEMMS; - N_BLOCK = config::WinogradConv::N_BLOCK; - } - else if(kernel_size == Size2D(3, 1)) - { - using config = NEWinogradLayerConfiguration<float, float, 1, 6, 1, 3>; - transform_input_kernel = support::cpp14::make_unique<config::TransformInputKernel>(); - transform_weights_kernel = support::cpp14::make_unique<config::TransformWeightsKernel>(); - transform_output_kernel = support::cpp14::make_unique<config::TransformOutputKernel>(); - n_gemms = config::WinogradBase::N_GEMMS; - N_BLOCK = config::WinogradConv::N_BLOCK; - } - else if(kernel_size == Size2D(1, 5)) - { - using config = NEWinogradLayerConfiguration<float, float, 4, 1, 5, 1>; - transform_input_kernel = support::cpp14::make_unique<config::TransformInputKernel>(); - transform_weights_kernel = support::cpp14::make_unique<config::TransformWeightsKernel>(); - transform_output_kernel = support::cpp14::make_unique<config::TransformOutputKernel>(); - n_gemms = config::WinogradBase::N_GEMMS; - N_BLOCK = config::WinogradConv::N_BLOCK; - } - else if(kernel_size == Size2D(5, 1)) - { - using config = NEWinogradLayerConfiguration<float, float, 1, 4, 1, 5>; - transform_input_kernel = support::cpp14::make_unique<config::TransformInputKernel>(); - transform_weights_kernel = support::cpp14::make_unique<config::TransformWeightsKernel>(); - transform_output_kernel = 
support::cpp14::make_unique<config::TransformOutputKernel>(); - n_gemms = config::WinogradBase::N_GEMMS; - N_BLOCK = config::WinogradConv::N_BLOCK; - } - else if(kernel_size == Size2D(1, 7)) - { - using config = NEWinogradLayerConfiguration<float, float, 2, 1, 7, 1>; - transform_input_kernel = support::cpp14::make_unique<config::TransformInputKernel>(); - transform_weights_kernel = support::cpp14::make_unique<config::TransformWeightsKernel>(); - transform_output_kernel = support::cpp14::make_unique<config::TransformOutputKernel>(); - n_gemms = config::WinogradBase::N_GEMMS; - N_BLOCK = config::WinogradConv::N_BLOCK; - } - else if(kernel_size == Size2D(7, 1)) - { - using config = NEWinogradLayerConfiguration<float, float, 1, 2, 1, 7>; - transform_input_kernel = support::cpp14::make_unique<config::TransformInputKernel>(); - transform_weights_kernel = support::cpp14::make_unique<config::TransformWeightsKernel>(); - transform_output_kernel = support::cpp14::make_unique<config::TransformOutputKernel>(); - n_gemms = config::WinogradBase::N_GEMMS; - N_BLOCK = config::WinogradConv::N_BLOCK; - } - else - { - ARM_COMPUTE_ERROR("Not supported."); - } - } -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - else if(data_type == DataType::F16) - { - if(kernel_size == Size2D(3, 3)) - { - using config = NEWinogradLayerConfiguration<__fp16, __fp16, 4, 4, 3, 3>; - transform_input_kernel = support::cpp14::make_unique<config::TransformInputKernel>(); - transform_weights_kernel = support::cpp14::make_unique<config::TransformWeightsKernel>(); - transform_output_kernel = support::cpp14::make_unique<config::TransformOutputKernel>(); - n_gemms = config::WinogradBase::N_GEMMS; - N_BLOCK = config::WinogradConv::N_BLOCK; - } - else - { - ARM_COMPUTE_ERROR("Not supported."); - } - } -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - - const PaddingType use_padding_type = (conv_info.pad_top() != 0u || conv_info.pad_left() != 0) ? PADDING_SAME : PADDING_VALID; - const bool use_same_padding = use_padding_type == PADDING_SAME; - - // Get convolved dimensions - const int in_channels = input->info()->dimension(channel_idx); - const int out_channels = output->info()->dimension(channel_idx); - - const Tensor4DShape in_shape(internal_get_input_shape(input)); - const size_t data_type_size = input->info()->element_size(); - // Get the memory required to instantiate a new Winograd operator. 
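The sizing logic that follows reduces the convolution to a batched GEMM: every output tile becomes one GEMM row, so M counts tiles across the whole batch, while K and N are the input and output channel counts. A self-contained sketch of the same arithmetic under assumed dimensions (helper name and shape values are illustrative, and same padding is assumed so output spatial dims match the input):

    #include <cstddef>

    // Sketch of the M/K/N derivation used below; iceildiv_sketch mirrors iceildiv.
    static size_t iceildiv_sketch(size_t a, size_t b) { return (a + b - 1) / b; }

    static void winograd_gemm_dims_example()
    {
        const size_t n_batches = 1, out_rows = 224, out_cols = 224; // assumed shape
        const size_t tile_h = 4, tile_w = 4;                        // 4x4 output tile
        const size_t m = n_batches * iceildiv_sketch(out_rows, tile_h) *
                         iceildiv_sketch(out_cols, tile_w);         // 56 * 56 = 3136 tiles
        const size_t k = 64;                                        // input channels
        const size_t n = 128;                                       // output channels
        (void)m; (void)k; (void)n;
    }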
- constexpr size_t storage_alignment = 64; - - // Kernel Storage - const size_t kernel_storage_size = transform_weights_kernel->get_weight_storage_size(out_channels, - in_channels) - * data_type_size; - - // Input storage - const size_t input_storage_size = transform_input_kernel->get_input_storage_size(in_shape.n_batches, in_shape.n_channels, in_shape.n_rows, in_shape.n_cols, - use_same_padding) - * data_type_size; - - // Output storage - const size_t output_storage_size = transform_output_kernel->get_output_storage_size(in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, out_channels) * data_type_size; - const int kernel_matrix_stride = transform_weights_kernel->get_matrix_stride(out_channels, in_channels); - const int output_matrix_stride = transform_output_kernel->get_matrix_stride(in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, out_channels); - const auto output_shape = transform_output_kernel->get_output_shape(in_shape.n_rows, in_shape.n_cols, use_padding_type == PADDING_SAME); - const int input_matrix_stride = transform_input_kernel->get_matrix_stride(in_shape.n_batches, in_channels, in_shape.n_rows, in_shape.n_cols, use_padding_type == PADDING_SAME); - - // Configure GEMM - const int tile_rows = iceildiv(output_shape.first, output_tile.height); - const int tile_cols = iceildiv(output_shape.second, output_tile.width); - const int m = in_shape.n_batches * tile_rows * tile_cols; - const int k = in_shape.n_channels; - const int n = out_channels; - const int kernel_matrix_row_stride = roundup(out_channels, N_BLOCK); - const int output_matrix_row_stride = kernel_matrix_row_stride; - - TensorShape a_shape(k, m, 1, n_gemms); - Strides a_strides(data_type_size); - a_strides.set(1, a_strides[0] * k); - //a_strides.set(2, data_type_size * input_matrix_stride / n_gemms); FIXME: This is the real batch size, but RSH's code crashes if it's not 0. - a_strides.set(2, 0); - a_strides.set(3, data_type_size * input_matrix_stride); - - TensorShape b_shape(n, k, n_gemms); - Strides b_strides(data_type_size); - b_strides.set(1, data_type_size * kernel_matrix_row_stride); - b_strides.set(2, data_type_size * kernel_matrix_stride); - - TensorShape d_shape(n, m, 1, n_gemms); - Strides d_strides(data_type_size); - d_strides.set(1, data_type_size * output_matrix_row_stride); - //d_strides.set(2, data_type_size * output_matrix_stride / n_gemms); FIXME: This is the real batch size, but RSH's code crashes if it's not 0. 
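Reading the shapes above: per GEMM, A holds M rows of K input-channel values, B holds K rows of N output-channel values, and D holds M rows of N values, with the n_gemms slices stacked along the outermost dimension (the commented-out per-batch strides are forced to 0, as the FIXME notes). A hedged sketch of the B-tensor stride construction, with illustrative names mirroring the quantities computed in configure():

    #include <cstddef>

    // Sketch only: byte strides for the transformed-weights tensor, matching the
    // b_strides setup above.
    static void b_strides_sketch(size_t element_size,
                                 size_t kernel_matrix_row_stride, // roundup(out_channels, N_BLOCK)
                                 size_t kernel_matrix_stride,     // per-GEMM slice stride
                                 size_t strides[3])
    {
        strides[0] = element_size;                            // along N
        strides[1] = element_size * kernel_matrix_row_stride; // along K
        strides[2] = element_size * kernel_matrix_stride;     // along the GEMM batch
    }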
- d_strides.set(2, 0); - d_strides.set(3, data_type_size * output_matrix_stride); - - TensorInfo a_info{}; - TensorInfo b_info{}; - TensorInfo d_info{}; - a_info.init(a_shape, 1, data_type, a_strides, 0, input_storage_size); - b_info.init(b_shape, 1, data_type, b_strides, 0, kernel_storage_size); - d_info.init(d_shape, 1, data_type, d_strides, 0, output_storage_size); - - _input_transformed.allocator()->init(a_info, storage_alignment); - _kernel_storage.allocator()->init(b_info, storage_alignment); - _output_transformed.allocator()->init(d_info, storage_alignment); - - // configure and allocate dst tensor to be used to convert from winograd domain to spatial domain when calling to reshape_output() - TensorInfo info(TensorShape(_output->info()->dimension(2), _output->info()->dimension(0), - _output->info()->dimension(1), _output->info()->dimension(3)), - 1, _output->info()->data_type()); - _output_nhwc.allocator()->init(info); - - const ITensor *input_to_use = _input; - ITensor *output_to_use = _output; - PermutationVector weights_permutation_vector(3U, 0U, 1U, 2U); - const unsigned int max_num_threads = NEScheduler::get().num_threads(); - - // Configure the kernel to transform the input tensor from NCHW -> NHWC - if(data_layout == DataLayout::NCHW) - { - _memory_group.manage(&_input_nhwc); - _permute_input.configure(input, &_input_nhwc, PermutationVector(2U, 0U, 1U)); - input_to_use = &_input_nhwc; - weights_permutation_vector = PermutationVector(3U, 2U, 0U, 1U); - } - - // Configure input transform kernel - _memory_group.manage(&_input_transformed); - _memory_group.manage(&_input_workspace); - transform_input_kernel->configure(input_to_use, in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, in_shape.n_channels, use_padding_type, - &_input_transformed, input_matrix_stride, &_input_workspace); - const size_t input_workspace_size = transform_input_kernel->get_working_space_size(max_num_threads); - TensorInfo input_workspace_info(TensorShape(input_workspace_size), 1, _input->info()->data_type()); - _input_workspace.allocator()->init(input_workspace_info); - _input_workspace.allocator()->allocate(); - if(data_layout == DataLayout::NCHW) - { - _input_nhwc.allocator()->allocate(); - } - - // Re-order a weight tensor from [Output feature map x Input feature map x Height x Width] to [Height x Width x Input feature map x Output feature map] - _permute_weights.configure(weights, &_weights_hwio, weights_permutation_vector); - transform_weights_kernel->configure(&_weights_hwio, &_kernel_storage, kernel_matrix_stride, out_channels, in_channels); - - // Configure GEMM function - _memory_group.manage(&_output_transformed); - _gemm_function.configure(&_input_transformed, &_kernel_storage, nullptr, &_output_transformed, 1.0f, 0.f); - _input_transformed.allocator()->allocate(); - - // Configure output transform function - // The biases tensor has not been allocated at this point in time, the output transform will add the biases to the final result in the run() method - if(data_layout == DataLayout::NCHW) - { - _memory_group.manage(&_output_nhwc); - output_to_use = &_output_nhwc; - } - const arm_gemm::Activation activation = arm_gemm_activation_from_acl_activation(act_info); - - transform_output_kernel->configure(biases, - &_output_transformed, - output_matrix_stride, - output_to_use, - in_shape.n_batches, - output_shape.first, - output_shape.second, - out_channels, - &_output_workspace, - activation); - - const size_t output_workspace_size = 
transform_output_kernel->get_working_space_size(max_num_threads); - TensorInfo output_workspace_info(TensorShape(output_workspace_size), 1, _output->info()->data_type()); - _output_workspace.allocator()->init(output_workspace_info); - _output_workspace.allocator()->allocate(); - _output_transformed.allocator()->allocate(); +NEWinogradConvolutionLayer::~NEWinogradConvolutionLayer() = default; - // Reorder the convoluted output to ACL's ordering NCHW - if(data_layout == DataLayout::NCHW) - { - _permute_output.configure(&_output_nhwc, _output, PermutationVector(1U, 2U, 0U)); - _output_nhwc.allocator()->allocate(); - } - - _transform_input_kernel = std::move(transform_input_kernel); - _transform_weights_kernel = std::move(transform_weights_kernel); - _transform_output_kernel = std::move(transform_output_kernel); +void NEWinogradConvolutionLayer::configure(const ITensor *input, + const ITensor *weights, + const ITensor *biases, + ITensor *output, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info, + bool enable_fast_math) +{ + _impl->original_weights = weights; + _impl->op = std::make_unique<cpu::CpuWinogradConv2d>(); + _impl->op->configure(input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), + conv_info, act_info, enable_fast_math); - //Configure Activation Layer - _is_activationlayer_enabled = act_info.enabled() && !fuse_function_supported(act_info); - if(_is_activationlayer_enabled) - { - _activationlayer_function.configure(_output, nullptr, act_info); - } + _impl->aux_mem_req = _impl->op->workspace(); + _impl->run_pack = {{ACL_SRC_0, input}, {ACL_SRC_1, weights}, {ACL_SRC_2, biases}, {ACL_DST, output}}; + _impl->prep_pack = {{ACL_SRC_1, weights}, {ACL_SRC_2, biases}}; + _impl->workspace = + manage_workspace<Tensor>(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack); } void NEWinogradConvolutionLayer::run() { - const DataLayout data_layout = _input->info()->data_layout(); - prepare(); - MemoryGroupResourceScope scope_mg(_memory_group); - - if(data_layout == DataLayout::NCHW) - { - //Bring channels to the front as Winograd code expects the tensor to be in the format NHWC - _permute_input.run(); - } - - // Transform input tensor to the winograd domain - NEScheduler::get().schedule(_transform_input_kernel.get(), Window::DimX); - - //Run 16 GEMMs in multiple threads, each kernel runs one or more GEMMs - _gemm_function.run(); - - // Transform output tensor to the spatial domain - NEScheduler::get().schedule(_transform_output_kernel.get(), Window::DimX); - - if(data_layout == DataLayout::NCHW) - { - // Reorder the convoluted output to ACL's ordering NCHW - _permute_output.run(); - } - - if(_is_activationlayer_enabled) - { - _activationlayer_function.run(); - } + MemoryGroupResourceScope scope_mg(_impl->memory_group); + _impl->op->run(_impl->run_pack); } -Status NEWinogradConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - const ActivationLayerInfo &act_info, bool enable_fast_math) +Status NEWinogradConvolutionLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info, + bool enable_fast_math) { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, weights, biases, output, conv_info)); - - // 
Get indices for the width and height - const size_t idx_width = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH); - const size_t idx_height = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT); - - // Input shape, kernel size and output tile - const Size2D input_dims = Size2D(input->dimension(idx_width), input->dimension(idx_height)); - const Size2D kernel_size = Size2D(weights->dimension(idx_width), weights->dimension(idx_height)); - const DataType data_type = input->data_type(); - const Size2D output_tile = winograd_output_tile(input_dims, kernel_size, data_type); - - // Check if the Winograd configuration requires fast math - if(!enable_fast_math) - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(check_support_fast_math(output_tile, kernel_size, data_type), - "This Winograd configuration requires enable_fast_math=true"); - } - - const WinogradInfo winograd_info = WinogradInfo(output_tile, - kernel_size, - input_dims, - conv_info, - input->data_layout()); - - // Validate input transform - const TensorShape input0_shape = misc::shape_calculator::compute_winograd_input_transform_shape(*input, winograd_info); - const TensorInfo input0 = input->clone()->set_tensor_shape(input0_shape); - // Validate filter transform - const TensorShape input1_shape = misc::shape_calculator::compute_winograd_filter_transform_shape(*weights, winograd_info); - const TensorInfo input1 = weights->clone()->set_tensor_shape(input1_shape); - // Validate batched matrix multiply - TensorShape batched_mm_output_shape = input0.tensor_shape(); - batched_mm_output_shape[0] = input1.tensor_shape()[0]; - const TensorInfo batched_mm_output = input0.clone()->set_tensor_shape(batched_mm_output_shape); - - if(kernel_size == Size2D(3, 3)) - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != 0u && conv_info.pad_top() != 1, "Only SAME or VALID padding supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_bottom() != 0u && conv_info.pad_bottom() != 1, "Only SAME or VALID padding supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_left() != 0u && conv_info.pad_left() != 1, "Only SAME or VALID padding supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_right() != 0u && conv_info.pad_right() != 1, "Only SAME or VALID padding supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_right() != conv_info.pad_left(), "Only SAME or VALID padding supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != conv_info.pad_bottom(), "Only SAME or VALID padding supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != conv_info.pad_left(), "Only SAME or VALID padding supported"); - return validate_kernel_3x3(input_dims, input, &input0, &input1, &batched_mm_output, weights, biases, output, winograd_info, act_info); - } - else if(kernel_size == Size2D(5, 5)) - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != 0u && conv_info.pad_top() != 2, "Only SAME or VALID padding supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_left() != 0u && conv_info.pad_left() != 2, "Only SAME or VALID padding supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_bottom() != 0u && conv_info.pad_bottom() != 2, "Only SAME or VALID padding supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_right() != 0u && conv_info.pad_right() != 2, "Only SAME or VALID padding supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_right() != conv_info.pad_left(), "Only SAME or VALID padding supported"); - 
ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != conv_info.pad_bottom(), "Only SAME or VALID padding supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != conv_info.pad_left(), "Only SAME or VALID padding supported"); - return validate_kernel_5x5(input, &input0, &input1, &batched_mm_output, weights, biases, output, winograd_info, act_info); - } - if(kernel_size == Size2D(3, 1)) - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_left() != 0u && conv_info.pad_left() != 1, "Only SAME or VALID padding supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_right() != 0u && conv_info.pad_right() != 1, "Only SAME or VALID padding supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != 0u && conv_info.pad_bottom() != 0, "Only SAME or VALID padding supported"); - return validate_kernel_3x1(input, &input0, &input1, &batched_mm_output, weights, biases, output, winograd_info, act_info); - } - else if(kernel_size == Size2D(1, 3)) - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != 0u && conv_info.pad_top() != 1, "Only SAME or VALID padding supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_bottom() != 0u && conv_info.pad_bottom() != 1, "Only SAME or VALID padding supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_left() != 0u && conv_info.pad_right() != 0, "Only SAME or VALID padding supported"); - return validate_kernel_1x3(input, &input0, &input1, &batched_mm_output, weights, biases, output, winograd_info, act_info); - } - else if(kernel_size == Size2D(5, 1)) - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_left() != 0u && conv_info.pad_left() != 2, "Only SAME or VALID padding supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_right() != 0u && conv_info.pad_right() != 2, "Only SAME or VALID padding supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != 0u && conv_info.pad_bottom() != 0, "Only SAME or VALID padding supported"); - return validate_kernel_5x1(input, &input0, &input1, &batched_mm_output, weights, biases, output, winograd_info, act_info); - } - else if(kernel_size == Size2D(1, 5)) - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != 0u && conv_info.pad_top() != 2, "Only SAME or VALID padding supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_bottom() != 0u && conv_info.pad_bottom() != 2, "Only SAME or VALID padding supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_left() != 0u && conv_info.pad_right() != 0, "Only SAME or VALID padding supported"); - return validate_kernel_1x5(input, &input0, &input1, &batched_mm_output, weights, biases, output, winograd_info, act_info); - } - else if(kernel_size == Size2D(7, 1)) - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_left() != 0u && conv_info.pad_left() != 3, "Only SAME or VALID padding supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_right() != 0u && conv_info.pad_right() != 3, "Only SAME or VALID padding supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != 0u && conv_info.pad_bottom() != 0, "Only SAME or VALID padding supported"); - return validate_kernel_7x1(input, &input0, &input1, &batched_mm_output, weights, biases, output, winograd_info, act_info); - } - else if(kernel_size == Size2D(1, 7)) - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != 0u && conv_info.pad_top() != 3, "Only SAME or VALID padding supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_bottom() != 0u && conv_info.pad_bottom() != 3, "Only SAME or VALID padding supported"); - 
ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_left() != 0u && conv_info.pad_right() != 0, "Only SAME or VALID padding supported"); - return validate_kernel_1x7(input, &input0, &input1, &batched_mm_output, weights, biases, output, winograd_info, act_info); - } - else - { - ARM_COMPUTE_RETURN_ERROR_MSG("Kernel shape not supported"); - } + return cpu::CpuWinogradConv2d::validate(input, weights, biases, output, conv_info, act_info, enable_fast_math); } void NEWinogradConvolutionLayer::prepare() { - if(!_is_prepared) + if (!_impl->is_prepared) { - // Permute weights - _weights_hwio.allocator()->allocate(); - _permute_weights.run(); - _weights->mark_as_unused(); + _impl->op->prepare(_impl->prep_pack); + _impl->original_weights->mark_as_unused(); - // Transform weights - _kernel_storage.allocator()->allocate(); - NEScheduler::get().schedule(_transform_weights_kernel.get(), Window::DimX); + // Release temporary tensors that are only used in prepare stage + release_temporaries<Tensor>(_impl->aux_mem_req, _impl->workspace); - _weights_hwio.allocator()->free(); - _is_prepared = true; + _impl->is_prepared = true; } } } // namespace arm_compute diff --git a/src/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.cpp b/src/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.cpp deleted file mode 100644 index e0094f4eec..0000000000 --- a/src/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.cpp +++ /dev/null @@ -1,568 +0,0 @@ -/* - * Copyright (c) 2019-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#include "arm_compute/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.h" - -#include "arm_compute/core/CPP/Validate.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/kernels/assembly/NEDepthwiseConvolutionAssemblyKernelWrapper.h" -#include "arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_dilated.hpp" -#include "arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_quantized_dilated.hpp" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/utils/misc/InfoHelpers.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "arm_compute/core/utils/quantization/AsymmHelpers.h" - -#include "arm_compute/runtime/NEON/NEScheduler.h" - -#include <set> - -namespace arm_compute -{ -namespace -{ -std::unique_ptr<depthwise::IDepthwiseConvolution> get_qasymm8_convolver(int kernel_size, int stride_x, - int n_batches, int in_rows, int in_cols, int n_channels, - int dilation_factor, neon_convolution_kernels::ActivationFunction activation, - const qasymm8::QAsymm8Params &wqinfo, const qasymm8::QAsymm8Params &iqinfo, const qasymm8::QAsymm8Params &oqinfo, - const qasymm8::QAsymm8RescaleParams &rescale_params, - int padding_top, int padding_left, int padding_bottom, int padding_right) -{ - switch(kernel_size) - { - case 3: - { - switch(stride_x) - { - case 1: - return arm_compute::support::cpp14::make_unique<depthwise::QAsymm8DilatedDepthwiseConvolution<2, 2, 3, 3, 1, 1>>( - n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right); - case 2: - return arm_compute::support::cpp14::make_unique<depthwise::QAsymm8DilatedDepthwiseConvolution<2, 2, 3, 3, 2, 2>>( - n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right); - default: - return nullptr; - } - } - case 5: - { - switch(stride_x) - { - case 1: - return arm_compute::support::cpp14::make_unique<depthwise::QAsymm8DilatedDepthwiseConvolution<2, 2, 5, 5, 1, 1>>( - n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right); - case 2: - return arm_compute::support::cpp14::make_unique<depthwise::QAsymm8DilatedDepthwiseConvolution<2, 2, 5, 5, 2, 2>>( - n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right); - default: - return nullptr; - } - } - default: - return nullptr; - } -} - -std::unique_ptr<depthwise::IDepthwiseConvolution> get_qsymm8_perchannel_convolver(int kernel_size, int stride_x, - int n_batches, int in_rows, int in_cols, int n_channels, - neon_convolution_kernels::ActivationFunction activation, - const qsymm8::QSymm8PerChannelParams &wqinfo, const qasymm8::QAsymm8Params &iqinfo, const qasymm8::QAsymm8Params &oqinfo, - const qsymm8::QSymm8PerChannelRescaleParams &rescale_params, - int padding_top, int padding_left, int padding_bottom, int padding_right) -{ - switch(kernel_size) - { - case 3: - { - switch(stride_x) - { - case 1: - return arm_compute::support::cpp14::make_unique<depthwise::QSymm8HybridPerChannelDepthwiseConvolution<2, 2, 3, 3, 1, 1>>( - n_batches, in_rows, in_cols, n_channels, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right); - case 2: - return 
arm_compute::support::cpp14::make_unique<depthwise::QSymm8HybridPerChannelDepthwiseConvolution<2, 2, 3, 3, 2, 2>>( - n_batches, in_rows, in_cols, n_channels, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right); - default: - return nullptr; - } - } - case 5: - { - switch(stride_x) - { - case 1: - return arm_compute::support::cpp14::make_unique<depthwise::QSymm8HybridPerChannelDepthwiseConvolution<2, 2, 5, 5, 1, 1>>( - n_batches, in_rows, in_cols, n_channels, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right); - case 2: - return arm_compute::support::cpp14::make_unique<depthwise::QSymm8HybridPerChannelDepthwiseConvolution<2, 2, 5, 5, 2, 2>>( - n_batches, in_rows, in_cols, n_channels, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right); - default: - return nullptr; - } - } - default: - return nullptr; - } -} - -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -std::unique_ptr<depthwise::IDepthwiseConvolution> get_fp16_convolver(int kernel_size, int stride_x, - int n_batches, int in_rows, int in_cols, int n_channels, - int dilation_factor, neon_convolution_kernels::ActivationFunction activation, - int padding_top, int padding_left, int padding_bottom, int padding_right) -{ - switch(kernel_size) - { - case 3: - { - switch(stride_x) - { - case 1: - return arm_compute::support::cpp14::make_unique<depthwise::DilatedDepthwiseConvolution<3, 3, 3, 3, 1, 1, float16_t, float16_t, float16_t>>( - n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right); - case 2: - return arm_compute::support::cpp14::make_unique<depthwise::DilatedDepthwiseConvolution<3, 3, 3, 3, 2, 2, float16_t, float16_t, float16_t>>( - n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right); - default: - return nullptr; - } - } - case 5: - { - switch(stride_x) - { - case 1: - return arm_compute::support::cpp14::make_unique<depthwise::DilatedDepthwiseConvolution<3, 3, 5, 5, 1, 1, float16_t, float16_t, float16_t>>( - n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right); - case 2: - return arm_compute::support::cpp14::make_unique<depthwise::DilatedDepthwiseConvolution<3, 3, 5, 5, 2, 2, float16_t, float16_t, float16_t>>( - n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right); - default: - return nullptr; - } - } - default: - return nullptr; - } -} -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - -std::unique_ptr<depthwise::IDepthwiseConvolution> get_fp32_convolver(int kernel_size, int stride_x, - int n_batches, int in_rows, int in_cols, int n_channels, - int dilation_factor, neon_convolution_kernels::ActivationFunction activation, - int padding_top, int padding_left, int padding_bottom, int padding_right) -{ - switch(kernel_size) - { - case 3: - { - switch(stride_x) - { - case 1: - return arm_compute::support::cpp14::make_unique<depthwise::DilatedDepthwiseConvolution<4, 4, 3, 3, 1, 1, float, float, float>>( - n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right); - case 2: - return arm_compute::support::cpp14::make_unique<depthwise::DilatedDepthwiseConvolution<3, 3, 3, 3, 2, 2, float, float, float>>( - 
n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right); - default: - return nullptr; - } - } - case 5: - { - switch(stride_x) - { - case 1: - return arm_compute::support::cpp14::make_unique<depthwise::DilatedDepthwiseConvolution<4, 4, 5, 5, 1, 1, float, float, float>>( - n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right); - case 2: - return arm_compute::support::cpp14::make_unique<depthwise::DilatedDepthwiseConvolution<3, 3, 5, 5, 2, 2, float, float, float>>( - n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right); - default: - return nullptr; - } - } - default: - return nullptr; - } -} - -std::unique_ptr<depthwise::IDepthwiseConvolution> create_convolver(const ITensor *input, - const ITensor *weights, - ITensor *output, - PadStrideInfo conv_info, - ActivationLayerInfo act_info, - const Size2D &dilation) -{ - ARM_COMPUTE_UNUSED(dilation); - const DataType data_type = input->info()->data_type(); - const TensorShape shape = input->info()->tensor_shape(); - - const int n_batches = shape[3]; - const int in_rows = shape.z(); - const int in_cols = shape.y(); - const int n_channels = shape.x(); - const int dilation_factor = dilation.x(); - const int padding_top = conv_info.pad_top(); - const int padding_left = conv_info.pad_left(); - const int padding_bottom = conv_info.pad_bottom(); - const int padding_right = conv_info.pad_right(); - - const bool is_uniform_quantized = (data_type == DataType::QASYMM8) && (weights->info()->data_type() == DataType::QASYMM8); - const bool is_perchannel_quantized = (data_type == DataType::QASYMM8) && (weights->info()->data_type() == DataType::QSYMM8_PER_CHANNEL); - - const unsigned int stride_x = conv_info.stride().first; - const unsigned int kernel_size = weights->info()->tensor_shape().y(); - - // Map activation function - neon_convolution_kernels::ActivationFunction activation = neon_convolution_kernels::ActivationFunction::None; - if(arm_compute::utils::info_helpers::is_relu(act_info)) - { - activation = neon_convolution_kernels::ActivationFunction::ReLU; - } - else if(arm_compute::utils::info_helpers::is_relu6(act_info)) - { - activation = neon_convolution_kernels::ActivationFunction::ReLU6; - } - - // Create quantized convolver - if(is_uniform_quantized) - { - const UniformQuantizationInfo input_qinfo = input->info()->quantization_info().uniform(); - const UniformQuantizationInfo weights_qinfo = weights->info()->quantization_info().uniform(); - const UniformQuantizationInfo output_qinfo = output->info()->quantization_info().uniform(); - - // Check that quantization info are in the range [0, 255] - ARM_COMPUTE_ERROR_ON(input_qinfo.offset < 0 || input_qinfo.offset > 255); - ARM_COMPUTE_ERROR_ON(weights_qinfo.offset < 0 || weights_qinfo.offset > 255); - ARM_COMPUTE_ERROR_ON(output_qinfo.offset < 0 || output_qinfo.offset > 255); - const qasymm8::QAsymm8Params iqinfo{ static_cast<uint8_t>(input_qinfo.offset), input_qinfo.scale }; - const qasymm8::QAsymm8Params wqinfo{ static_cast<uint8_t>(weights_qinfo.offset), weights_qinfo.scale }; - const qasymm8::QAsymm8Params oqinfo{ static_cast<uint8_t>(output_qinfo.offset), output_qinfo.scale }; - - // Calculate rescale parameters - const float fmultipler = iqinfo.scale * wqinfo.scale / oqinfo.scale; - int32_t qmultiplier = 0; - int32_t qshift = 0; - 
quantization::calculate_quantized_multiplier_less_than_one(fmultipler, &qmultiplier, &qshift); - qasymm8::QAsymm8RescaleParams rescale_params(qshift, qmultiplier, fmultipler); - - return get_qasymm8_convolver(kernel_size, stride_x, n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, - wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right); - } - else if(is_perchannel_quantized) - { - const UniformQuantizationInfo input_qinfo = input->info()->quantization_info().uniform(); - const QuantizationInfo weights_qinfo = weights->info()->quantization_info(); - const UniformQuantizationInfo output_qinfo = output->info()->quantization_info().uniform(); - - // Check that quantization info are in the range [0, 255] - ARM_COMPUTE_ERROR_ON(input_qinfo.offset < 0 || input_qinfo.offset > 255); - ARM_COMPUTE_ERROR_ON(output_qinfo.offset < 0 || output_qinfo.offset > 255); - const qasymm8::QAsymm8Params iqinfo{ static_cast<uint8_t>(input_qinfo.offset), input_qinfo.scale }; - const qsymm8::QSymm8PerChannelParams wqinfo{ weights_qinfo.scale() }; - const qasymm8::QAsymm8Params oqinfo{ static_cast<uint8_t>(output_qinfo.offset), output_qinfo.scale }; - - // Calculate rescale parameters - std::vector<float> fmultipliers; - std::vector<int32_t> qmultipliers; - std::vector<int32_t> qshifts; - - for(auto const s : wqinfo.scales) - { - const float fmultipler = iqinfo.scale * s / oqinfo.scale; - int32_t qmultiplier = 0; - int32_t qshift = 0; - quantization::calculate_quantized_multiplier_less_than_one(fmultipler, &qmultiplier, &qshift); - fmultipliers.push_back(fmultipler); - qmultipliers.push_back(qmultiplier); - qshifts.push_back(qshift); - } - - qsymm8::QSymm8PerChannelRescaleParams rescale_params(qshifts, qmultipliers, fmultipliers); - - return get_qsymm8_perchannel_convolver(kernel_size, stride_x, n_batches, in_rows, in_cols, n_channels, activation, - wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right); - } - else - { - // Create float convolver - switch(data_type) - { -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F16: - { - return get_fp16_convolver(kernel_size, stride_x, n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right); - } -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F32: - { - return get_fp32_convolver(kernel_size, stride_x, n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right); - } - default: - return nullptr; - } - } -} -} // namespace - -struct NEDepthwiseConvolutionAssemblyDispatch::LocalImpl -{ - std::unique_ptr<depthwise::IDepthwiseConvolution> _dwc_assembly_kernel{ nullptr }; - NEDepthwiseConvolutionAssemblyKernelWrapper _dwc_acl_kernel{}; -}; - -#ifndef DOXYGEN_SKIP_THIS -NEDepthwiseConvolutionAssemblyDispatch::NEDepthwiseConvolutionAssemblyDispatch(std::shared_ptr<arm_compute::IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _input(nullptr), _weights(nullptr), _bias(nullptr), _output(nullptr), _packed_weights(), _workspace(), _is_prepared(false), - _pImpl(support::cpp14::make_unique<LocalImpl>()) -{ -} -#endif /* DOXYGEN_SKIP_THIS */ - -NEDepthwiseConvolutionAssemblyDispatch::~NEDepthwiseConvolutionAssemblyDispatch() = default; - -void NEDepthwiseConvolutionAssemblyDispatch::configure(const ITensor *input, - const ITensor *weights, - const ITensor *bias, - ITensor *output, - const 
PadStrideInfo &conv_info, - unsigned int depth_multiplier, - const ActivationLayerInfo &act_info, - const Size2D &dilation) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); - ARM_COMPUTE_UNUSED(depth_multiplier); - ARM_COMPUTE_ERROR_THROW_ON(NEDepthwiseConvolutionAssemblyDispatch::validate(input->info(), - weights->info(), - bias != nullptr ? bias->info() : nullptr, - output->info(), - conv_info, - depth_multiplier, - act_info, - dilation)); - - // Output auto initialization if not yet initialized - const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*input->info(), *weights->info(), conv_info, depth_multiplier, dilation); - auto_init_if_empty(*output->info(), input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape).set_quantization_info(output->info()->quantization_info())); - - _input = input; - _weights = weights; - _bias = bias; - _output = output; - _is_prepared = false; - - // Create convolver - _pImpl->_dwc_assembly_kernel = create_convolver(input, weights, output, conv_info, act_info, dilation); - ARM_COMPUTE_ERROR_ON(_pImpl->_dwc_assembly_kernel == nullptr); - - // Create assembly kernel wrapper - _pImpl->_dwc_acl_kernel.configure(_pImpl->_dwc_assembly_kernel.get()); - - constexpr size_t alignment = 128; - - // Create workspace - const unsigned int num_threads = NEScheduler::get().num_threads(); - const size_t workspace_size = _pImpl->_dwc_assembly_kernel->get_working_space_size(num_threads); - ARM_COMPUTE_ERROR_ON_MSG(workspace_size == 0, "Workspace size cannot be 0 !"); - _workspace.allocator()->init(TensorInfo(TensorShape{ workspace_size }, 1, DataType::S8), alignment); - _memory_group.manage(&_workspace); - _workspace.allocator()->allocate(); - - // Create packing tensor - const size_t pack_tensor_size = _pImpl->_dwc_assembly_kernel->get_packed_params_size(); - ARM_COMPUTE_ERROR_ON_MSG(pack_tensor_size == 0, "Pack tensor size cannot be 0 !"); - _packed_weights.allocator()->init(TensorInfo(TensorShape{ pack_tensor_size }, 1, DataType::S8), alignment); -} - -Status NEDepthwiseConvolutionAssemblyDispatch::validate(const ITensorInfo *input, - const ITensorInfo *weights, - const ITensorInfo *bias, - const ITensorInfo *output, - const PadStrideInfo &conv_info, - unsigned int depth_multiplier, - const ActivationLayerInfo &act_info, - const Size2D &dilation) -{ - ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32); - if(weights->data_type() != DataType::QSYMM8_PER_CHANNEL) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights); - } - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights); - - // Validate convolver - ARM_COMPUTE_RETURN_ERROR_ON(!is_optimized_supported(input, weights, conv_info, depth_multiplier, dilation)); - - // Validate activation - const bool is_relu = arm_compute::utils::info_helpers::is_relu(act_info); - const bool is_relu6 = arm_compute::utils::info_helpers::is_relu6(act_info); - ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled() && !(is_relu || is_relu6)); - - // Check bias - if(bias != nullptr) - { - unsigned int channel_idx = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL); - ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); - ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(0) != weights->dimension(channel_idx)); - } - - // Check output - if(output->total_size() != 0) - { - const TensorShape 
output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier, dilation); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - } - - // The uniform quantization case will only have 1 scale value in the weights quantization info - const UniformQuantizationInfo input_qinfo = input->quantization_info().uniform(); - const QuantizationInfo weights_qinfo = weights->quantization_info(); - const UniformQuantizationInfo output_qinfo = output->quantization_info().uniform(); - for(auto const s : weights_qinfo.scale()) - { - const float fmultipler = input_qinfo.scale * s / output_qinfo.scale; - ARM_COMPUTE_RETURN_ERROR_ON(fmultipler > 1.f); - } - - return Status{}; -} - -bool NEDepthwiseConvolutionAssemblyDispatch::is_optimized_supported(const ITensorInfo *input, - const ITensorInfo *weights, - PadStrideInfo conv_info, - unsigned int depth_multiplier, - const Size2D &dilation) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights); - - // Reshape input shape if in NHWC format - const DataLayout data_layout = input->data_layout(); - TensorShape in_shape{ input->tensor_shape() }; - if(data_layout == DataLayout::NHWC) - { - in_shape.set(Window::DimX, input->tensor_shape().y()); - in_shape.set(Window::DimY, input->tensor_shape().z()); - in_shape.set(Window::DimZ, input->tensor_shape().x()); - } - - // Check data type - // TODO (COMPMID-3004): Add assembly optimized routine for QASYMM8_SIGNED NEDepthwiseConvolutionLayer - const DataType input_type = input->data_type(); - const bool is_input_type_valid = is_data_type_float(input_type) || input_type == DataType::QASYMM8; - const DataType weights_type = weights->data_type(); - const bool is_weights_type_valid = is_data_type_float(weights_type) || weights_type == DataType::QASYMM8 || weights_type == DataType::QASYMM8_SIGNED - || weights_type == DataType::QSYMM8_PER_CHANNEL; - - // Check weights size - std::set<unsigned int> supported_kernel_sizes = { 3, 5 }; - const unsigned int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - const unsigned int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); - const unsigned int kernel_w = weights->dimension(width_idx); - const unsigned int kernel_h = weights->dimension(height_idx); - bool weights_supported = (kernel_w == kernel_h) && (supported_kernel_sizes.count(kernel_w) != 0); - - // Check for supported strides - const auto &strides = conv_info.stride(); - bool supported_strides = (strides.first == strides.second) && ((strides.first == 1) || (strides.first == 2)); - - // Check for supported padding - const auto pad_top = conv_info.pad_top(); - const auto pad_right = conv_info.pad_right(); - const auto pad_bottom = conv_info.pad_bottom(); - const auto pad_left = conv_info.pad_left(); - PadStrideInfo same_pad = calculate_same_pad(in_shape, TensorShape(kernel_w, kernel_h), conv_info, DataLayout::NCHW, dilation); - bool is_same_padding = (pad_top == same_pad.pad_top()) && (pad_right == same_pad.pad_right()) && (pad_bottom == same_pad.pad_bottom()) && (pad_left == same_pad.pad_left()); - bool is_valid_padding = (pad_top == 0) && (pad_right == 0) && (pad_bottom == 0) && (pad_left == 0); - bool supported_padding = is_same_padding || is_valid_padding; - // TODO(COMPMID-2464): Enable once dilated conv with stride 2 is supported - bool is_dilation_supported = ((dilation == Size2D(1U, 1U)) || 
((dilation.x() == dilation.y()) && strides.first == 1)); - - if(weights_type == DataType::QSYMM8_PER_CHANNEL) - { - is_dilation_supported = is_dilation_supported && (dilation == Size2D(1U, 1U)); - } - - return is_input_type_valid && is_weights_type_valid && weights_supported && supported_strides && supported_padding && (depth_multiplier == 1) && is_dilation_supported; -} - -void NEDepthwiseConvolutionAssemblyDispatch::run() -{ - // Prepare assembly kernel - prepare(); - - MemoryGroupResourceScope scope_mg(_memory_group); - - // Setup inputs/outputs - ARM_COMPUTE_ERROR_ON(_workspace.buffer() == nullptr); - _pImpl->_dwc_assembly_kernel->set_working_space(static_cast<void *>(_workspace.buffer())); - - ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr); - const int input_element_size = _input->info()->element_size(); - const int input_batch_stride = _input->info()->strides_in_bytes()[3] / input_element_size; - const int input_row_stride = _input->info()->strides_in_bytes().z() / input_element_size; - const int input_col_stride = _input->info()->strides_in_bytes().y() / input_element_size; - const void *input_ptr = _input->buffer() + _input->info()->offset_first_element_in_bytes(); - _pImpl->_dwc_assembly_kernel->set_input(input_ptr, input_batch_stride, input_row_stride, input_col_stride); - - ARM_COMPUTE_ERROR_ON(_output->buffer() == nullptr); - const int output_element_size = _output->info()->element_size(); - const int output_batch_stride = _output->info()->strides_in_bytes()[3] / output_element_size; - const int output_row_stride = _output->info()->strides_in_bytes().z() / output_element_size; - const int output_col_stride = _output->info()->strides_in_bytes().y() / output_element_size; - void *output_ptr = _output->buffer() + _output->info()->offset_first_element_in_bytes(); - _pImpl->_dwc_assembly_kernel->set_output(output_ptr, output_batch_stride, output_row_stride, output_col_stride); - - // Schedule assembly kernel - NEScheduler::get().schedule(&_pImpl->_dwc_acl_kernel, Window::DimX); -} - -void NEDepthwiseConvolutionAssemblyDispatch::prepare() -{ - if(!_is_prepared) - { - _packed_weights.allocator()->allocate(); - ARM_COMPUTE_ERROR_ON(_packed_weights.buffer() == nullptr); - - // Pack weights and bias - const int weights_element_size = _weights->info()->element_size(); - const int weights_row_stride = _weights->info()->strides_in_bytes().z() / weights_element_size; - const int weights_col_stride = _weights->info()->strides_in_bytes().y() / weights_element_size; - _pImpl->_dwc_assembly_kernel->pack_params(_packed_weights.buffer(), - _weights->buffer() + _weights->info()->offset_first_element_in_bytes(), - weights_row_stride, - weights_col_stride, - (_bias != nullptr) ? _bias->buffer() : nullptr); - _pImpl->_dwc_assembly_kernel->set_packed_params_buffer(_packed_weights.buffer()); - - _weights->mark_as_unused(); - if(_bias != nullptr) - { - _bias->mark_as_unused(); - } - _is_prepared = true; - } -} -} // namespace arm_compute diff --git a/src/runtime/OMP/OMPScheduler.cpp b/src/runtime/OMP/OMPScheduler.cpp index f67f06fc94..2a5abb5f7a 100644 --- a/src/runtime/OMP/OMPScheduler.cpp +++ b/src/runtime/OMP/OMPScheduler.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2017-2024 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -27,16 +27,29 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/runtime/CPUUtils.h" #include <omp.h> namespace arm_compute { +#if !defined(_WIN64) && !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && \ + (defined(__arm__) || defined(__aarch64__)) && defined(__ANDROID__) OMPScheduler::OMPScheduler() // NOLINT - : _num_threads(omp_get_max_threads()) + : _num_threads(cpu_info().get_cpu_num_excluding_little()), + _has_lmb(cpu_info().cpu_has_little_mid_big()), + _nonlittle_num_cpus(cpu_info().get_cpu_num_excluding_little()) { } +#else /* !defined(_WIN64) && !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && \ + (defined(__arm__) || defined(__aarch64__)) && defined(__ANDROID__)*/ +OMPScheduler::OMPScheduler() // NOLINT + : _num_threads(omp_get_max_threads()), + _has_lmb(cpu_info().cpu_has_little_mid_big()), + _nonlittle_num_cpus(cpu_info().get_cpu_num_excluding_little()) +{ +} +#endif /* !defined(_WIN64) && !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && \ + (defined(__arm__) || defined(__aarch64__)) && defined(__ANDROID__)*/ unsigned int OMPScheduler::num_threads() const { @@ -46,60 +59,78 @@ unsigned int OMPScheduler::num_threads() const void OMPScheduler::set_num_threads(unsigned int num_threads) { const unsigned int num_cores = omp_get_max_threads(); - _num_threads = (num_threads == 0) ? num_cores : num_threads; +#if !defined(_WIN64) && !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && \ + (defined(__arm__) || defined(__aarch64__)) && defined(__ANDROID__) + const unsigned int adjusted_num_threads = (_has_lmb) ? _nonlittle_num_cpus : num_threads; + _num_threads = (num_threads == 0) ? num_cores : adjusted_num_threads; +#else /* !defined(_WIN64) && !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && \ + (defined(__arm__) || defined(__aarch64__)) && defined(__ANDROID__)*/ + _num_threads = (num_threads == 0) ? 
num_cores : num_threads; +#endif /* !defined(_WIN64) && !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && \ + (defined(__arm__) || defined(__aarch64__)) && defined(__ANDROID__)*/ } void OMPScheduler::schedule(ICPPKernel *kernel, const Hints &hints) { + ITensorPack tensors; + schedule_common(kernel, hints, kernel->window(), tensors); +} + +void OMPScheduler::schedule_op(ICPPKernel *kernel, const Hints &hints, const Window &window, ITensorPack &tensors) +{ ARM_COMPUTE_ERROR_ON_MSG(!kernel, "The child class didn't set the kernel"); ARM_COMPUTE_ERROR_ON_MSG(hints.strategy() == StrategyHint::DYNAMIC, "Dynamic scheduling is not supported in OMPScheduler"); - const Window &max_window = kernel->window(); + const Window &max_window = window; const unsigned int num_iterations = max_window.num_iterations(hints.split_dimension()); const unsigned int num_threads = std::min(num_iterations, _num_threads); - if(!kernel->is_parallelisable() || num_threads == 1) + if (!kernel->is_parallelisable() || num_threads == 1) { ThreadInfo info; - info.cpu_info = &_cpu_info; - kernel->run(max_window, info); + info.cpu_info = &cpu_info(); + kernel->run_op(tensors, max_window, info); } else { const unsigned int num_windows = num_threads; std::vector<IScheduler::Workload> workloads(num_windows); - for(unsigned int t = 0; t < num_windows; t++) + for (unsigned int t = 0; t < num_windows; t++) { //Capture 't' by copy, all the other variables by reference: - workloads[t] = [t, &hints, &max_window, &num_windows, &kernel](const ThreadInfo & info) + workloads[t] = [t, &hints, &max_window, &num_windows, &kernel, &tensors](const ThreadInfo &info) { Window win = max_window.split_window(hints.split_dimension(), t, num_windows); win.validate(); - kernel->run(win, info); + kernel->run_op(tensors, win, info); }; } run_workloads(workloads); } } - #ifndef DOXYGEN_SKIP_THIS void OMPScheduler::run_workloads(std::vector<arm_compute::IScheduler::Workload> &workloads) { - const unsigned int num_threads = std::min(_num_threads, static_cast<unsigned int>(workloads.size())); - if(num_threads < 1) + const unsigned int amount_of_work = static_cast<unsigned int>(workloads.size()); + const unsigned int num_threads_to_use = std::min(_num_threads, amount_of_work); + + if (num_threads_to_use < 1) { return; } ThreadInfo info; - info.cpu_info = &_cpu_info; - info.num_threads = num_threads; - #pragma omp parallel firstprivate(info) num_threads(num_threads) + info.cpu_info = &cpu_info(); + info.num_threads = num_threads_to_use; +#pragma omp parallel for firstprivate(info) num_threads(num_threads_to_use) default(shared) proc_bind(close) \ + schedule(static, 1) + for (unsigned int wid = 0; wid < amount_of_work; ++wid) { - const int tid = omp_get_thread_num(); + const int tid = omp_get_thread_num(); + info.thread_id = tid; - workloads[tid](info); + workloads[wid](info); } } #endif /* DOXYGEN_SKIP_THIS */ diff --git a/src/runtime/OffsetLifetimeManager.cpp b/src/runtime/OffsetLifetimeManager.cpp index 3133202bf3..d746f618b5 100644 --- a/src/runtime/OffsetLifetimeManager.cpp +++ b/src/runtime/OffsetLifetimeManager.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 ARM Limited. + * Copyright (c) 2017-2020 Arm Limited. 
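Editor's note: the reworked run_workloads above stops hand-assigning one workload per OpenMP thread and instead drives a parallel for over all workloads, fixing the granularity with schedule(static, 1) and keeping threads near each other with proc_bind(close). A minimal sketch of the same dispatch shape, where Workload is a simplified stand-in for IScheduler::Workload:

#include <omp.h>

#include <algorithm>
#include <functional>
#include <vector>

using Workload = std::function<void(int thread_id)>;

void run_workloads(std::vector<Workload> &workloads, unsigned int num_threads)
{
    const auto amount_of_work     = static_cast<unsigned int>(workloads.size());
    const auto num_threads_to_use = std::min(num_threads, amount_of_work);
    if (num_threads_to_use < 1)
    {
        return;
    }
    // schedule(static, 1) hands out iterations round-robin, so workload i runs
    // on thread i % num_threads_to_use, matching the one-workload-per-thread
    // granularity of the old loop while also covering amount_of_work > threads.
#pragma omp parallel for num_threads(num_threads_to_use) default(shared) proc_bind(close) schedule(static, 1)
    for (unsigned int wid = 0; wid < amount_of_work; ++wid)
    {
        workloads[wid](omp_get_thread_num());
    }
}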
* * SPDX-License-Identifier: MIT * @@ -27,7 +27,6 @@ #include "arm_compute/runtime/IAllocator.h" #include "arm_compute/runtime/IMemoryGroup.h" #include "arm_compute/runtime/OffsetMemoryPool.h" -#include "support/MemorySupport.h" #include <algorithm> #include <cmath> @@ -44,8 +43,7 @@ size_t align_offset(size_t offset, size_t alignment) return (remainder != 0U) ? offset + (alignment - remainder) : offset; } } // namespace -OffsetLifetimeManager::OffsetLifetimeManager() - : _blob(0) +OffsetLifetimeManager::OffsetLifetimeManager() : _blob(0) { } @@ -57,7 +55,7 @@ const OffsetLifetimeManager::info_type &OffsetLifetimeManager::info() const std::unique_ptr<IMemoryPool> OffsetLifetimeManager::create_pool(IAllocator *allocator) { ARM_COMPUTE_ERROR_ON(allocator == nullptr); - return support::cpp14::make_unique<OffsetMemoryPool>(allocator, _blob); + return std::make_unique<OffsetMemoryPool>(allocator, _blob); } MappingType OffsetLifetimeManager::mapping_type() const @@ -72,21 +70,22 @@ void OffsetLifetimeManager::update_blobs_and_mappings() // Update blob size size_t max_aggregated_size = 0; - std::for_each(std::begin(_free_blobs), std::end(_free_blobs), [&](const Blob & b) - { - max_aggregated_size += b.max_size; - _blob.alignment = std::max(_blob.alignment, b.max_alignment); - }); + std::for_each(std::begin(_free_blobs), std::end(_free_blobs), + [&](const Blob &b) + { + max_aggregated_size += b.max_size; + _blob.alignment = std::max(_blob.alignment, b.max_alignment); + }); max_aggregated_size += _free_blobs.size() * _blob.alignment; _blob.owners = std::max(_blob.owners, _free_blobs.size()); _blob.size = std::max(_blob.size, max_aggregated_size); // Calculate group mappings - auto &group_mappings = _active_group->mappings(); + auto &group_mappings = _active_group->mappings(); size_t offset = 0; - for(auto &free_blob : _free_blobs) + for (auto &free_blob : _free_blobs) { - for(auto &bound_element_id : free_blob.bound_elements) + for (auto &bound_element_id : free_blob.bound_elements) { ARM_COMPUTE_ERROR_ON(_active_elements.find(bound_element_id) == std::end(_active_elements)); Element &bound_element = _active_elements[bound_element_id]; diff --git a/src/runtime/OffsetMemoryPool.cpp b/src/runtime/OffsetMemoryPool.cpp index c8381a11d2..8f3c1a84ba 100644 --- a/src/runtime/OffsetMemoryPool.cpp +++ b/src/runtime/OffsetMemoryPool.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 ARM Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,6 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
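Editor's note: the align_offset helper shown in context above rounds an offset up to the next multiple of an alignment. A few worked cases; the zero-alignment guard is added here only to keep the sketch self-contained:

#include <cassert>
#include <cstddef>

std::size_t align_offset(std::size_t offset, std::size_t alignment)
{
    const std::size_t remainder = (alignment != 0U) ? offset % alignment : 0U;
    return (remainder != 0U) ? offset + (alignment - remainder) : offset;
}

int main()
{
    assert(align_offset(13, 8) == 16); // rounds up to the next multiple of 8
    assert(align_offset(16, 8) == 16); // already aligned: unchanged
    assert(align_offset(5, 0) == 5);   // zero alignment: no-op (guard above)
    return 0;
}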
*/ -#include <algorithm> - #include "arm_compute/runtime/OffsetMemoryPool.h" #include "arm_compute/core/Error.h" @@ -30,7 +28,8 @@ #include "arm_compute/runtime/IMemoryPool.h" #include "arm_compute/runtime/MemoryRegion.h" #include "arm_compute/runtime/Types.h" -#include "support/MemorySupport.h" + +#include <algorithm> namespace arm_compute { @@ -51,7 +50,7 @@ void OffsetMemoryPool::acquire(MemoryMappings &handles) ARM_COMPUTE_ERROR_ON(_blob == nullptr); // Set memory to handlers - for(auto &handle : handles) + for (auto &handle : handles) { ARM_COMPUTE_ERROR_ON(handle.first == nullptr); handle.first->set_owned_region(_blob->extract_subregion(handle.second, _blob_info.size - handle.second)); @@ -60,7 +59,7 @@ void OffsetMemoryPool::acquire(MemoryMappings &handles) void OffsetMemoryPool::release(MemoryMappings &handles) { - for(auto &handle : handles) + for (auto &handle : handles) { ARM_COMPUTE_ERROR_ON(handle.first == nullptr); handle.first->set_region(nullptr); @@ -75,6 +74,6 @@ MappingType OffsetMemoryPool::mapping_type() const std::unique_ptr<IMemoryPool> OffsetMemoryPool::duplicate() { ARM_COMPUTE_ERROR_ON(!_allocator); - return support::cpp14::make_unique<OffsetMemoryPool>(_allocator, _blob_info); + return std::make_unique<OffsetMemoryPool>(_allocator, _blob_info); } } // namespace arm_compute diff --git a/src/runtime/GLES_COMPUTE/functions/GCActivationLayer.cpp b/src/runtime/OperatorTensor.cpp index 4f5ee28a76..19415b35cf 100644 --- a/src/runtime/GLES_COMPUTE/functions/GCActivationLayer.cpp +++ b/src/runtime/OperatorTensor.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 ARM Limited. + * Copyright (c) 2020-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,25 +21,40 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/runtime/GLES_COMPUTE/functions/GCActivationLayer.h" +#include "arm_compute/runtime/OperatorTensor.h" -#include "arm_compute/core/GLES_COMPUTE/kernels/GCActivationLayerKernel.h" -#include "arm_compute/core/Helpers.h" -#include "support/MemorySupport.h" +#include "arm_compute/runtime/MemoryRegion.h" + +#include "support/Cast.h" namespace arm_compute { -GCActivationLayer::GCActivationLayer(GCRuntimeContext *ctx) - : IGCSimpleFunction(ctx) +namespace experimental +{ +OperatorTensor::OperatorTensor(ITensorInfo *info, IMemory *memory) + : _info(info), _memory(memory), _mem_type(MemoryType::CPU) { } -void GCActivationLayer::configure(IGCTensor *input, IGCTensor *output, ActivationLayerInfo act_info) +ITensorInfo *OperatorTensor::info() const { - auto core_ctx = _ctx ? _ctx->core_runtime_context() : /* Legacy */ nullptr; + return _info; +} - auto k = arm_compute::support::cpp14::make_unique<GCActivationLayerKernel>(core_ctx); - k->configure(input, output, act_info); - _kernel = std::move(k); +ITensorInfo *OperatorTensor::info() +{ + return _info; +} + +uint8_t *OperatorTensor::buffer() const +{ + switch (_mem_type) + { + case MemoryType::CPU: + return (uint8_t *)utils::cast::polymorphic_downcast<MemoryRegion *>(_memory->region())->buffer(); + default: + ARM_COMPUTE_ERROR("Memory type not supported."); + } } +} // namespace experimental } // namespace arm_compute diff --git a/src/runtime/PoolManager.cpp b/src/runtime/PoolManager.cpp index 455f969bd3..7fb9bd8000 100644 --- a/src/runtime/PoolManager.cpp +++ b/src/runtime/PoolManager.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 ARM Limited. + * Copyright (c) 2017-2020 Arm Limited. 
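Editor's note: OffsetMemoryPool::acquire above hands every handle a sub-region of one shared blob: each mapping stores a byte offset, and the handle receives the range [offset, blob_size). The shape of that bookkeeping, with toy stand-ins for IMemoryRegion and MemoryMappings:

#include <cstddef>
#include <cstdint>
#include <map>

struct Region
{
    std::uint8_t *base = nullptr;
    std::size_t   size = 0;
};

// Each handle's mapped value is its byte offset into the pool's single blob.
void acquire(const Region &blob, std::map<Region *, std::size_t> &mappings)
{
    for (auto &handle : mappings)
    {
        handle.first->base = blob.base + handle.second;
        handle.first->size = blob.size - handle.second; // mirrors extract_subregion(offset, size - offset)
    }
}

void release(std::map<Region *, std::size_t> &mappings)
{
    for (auto &handle : mappings)
    {
        *handle.first = Region{}; // drop the view; the blob itself stays alive
    }
}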
* * SPDX-License-Identifier: MIT * @@ -25,15 +25,13 @@ #include "arm_compute/core/Error.h" #include "arm_compute/runtime/IMemoryPool.h" -#include "support/MemorySupport.h" #include <algorithm> #include <list> using namespace arm_compute; -PoolManager::PoolManager() - : _free_pools(), _occupied_pools(), _sem(), _mtx() +PoolManager::PoolManager() : _free_pools(), _occupied_pools(), _sem(), _mtx() { } @@ -53,10 +51,8 @@ void PoolManager::unlock_pool(IMemoryPool *pool) ARM_COMPUTE_ERROR_ON_MSG(_free_pools.empty() && _occupied_pools.empty(), "Haven't setup any pools!"); arm_compute::lock_guard<arm_compute::Mutex> lock(_mtx); - auto it = std::find_if(std::begin(_occupied_pools), std::end(_occupied_pools), [pool](const std::unique_ptr<IMemoryPool> &pool_it) - { - return pool_it.get() == pool; - }); + auto it = std::find_if(std::begin(_occupied_pools), std::end(_occupied_pools), + [pool](const std::unique_ptr<IMemoryPool> &pool_it) { return pool_it.get() == pool; }); ARM_COMPUTE_ERROR_ON_MSG(it == std::end(_occupied_pools), "Pool to be unlocked couldn't be found!"); _free_pools.splice(std::begin(_free_pools), _occupied_pools, it); _sem->signal(); @@ -71,7 +67,7 @@ void PoolManager::register_pool(std::unique_ptr<IMemoryPool> pool) _free_pools.push_front(std::move(pool)); // Update semaphore - _sem = arm_compute::support::cpp14::make_unique<arm_compute::Semaphore>(_free_pools.size()); + _sem = std::make_unique<arm_compute::Semaphore>(_free_pools.size()); } std::unique_ptr<IMemoryPool> PoolManager::release_pool() @@ -79,14 +75,14 @@ std::unique_ptr<IMemoryPool> PoolManager::release_pool() arm_compute::lock_guard<arm_compute::Mutex> lock(_mtx); ARM_COMPUTE_ERROR_ON_MSG(!_occupied_pools.empty(), "All pools should be free in order to release one!"); - if(!_free_pools.empty()) + if (!_free_pools.empty()) { std::unique_ptr<IMemoryPool> pool = std::move(_free_pools.front()); ARM_COMPUTE_ERROR_ON(_free_pools.front() != nullptr); _free_pools.pop_front(); // Update semaphore - _sem = arm_compute::support::cpp14::make_unique<arm_compute::Semaphore>(_free_pools.size()); + _sem = std::make_unique<arm_compute::Semaphore>(_free_pools.size()); return pool; } diff --git a/src/runtime/Pyramid.cpp b/src/runtime/Pyramid.cpp deleted file mode 100644 index 16a91a8704..0000000000 --- a/src/runtime/Pyramid.cpp +++ /dev/null @@ -1,117 +0,0 @@ -/* - * Copyright (c) 2016-2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
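Editor's note: PoolManager above keeps two lists and moves pools between them with std::list::splice, which relinks nodes without copying the unique_ptrs, while a semaphore throttles callers when every pool is occupied. The list-juggling half of that design in isolation (semaphore and error checks omitted):

#include <algorithm>
#include <list>
#include <memory>
#include <mutex>

struct IMemoryPool
{
};

class PoolLists
{
public:
    IMemoryPool *lock_pool()
    {
        std::lock_guard<std::mutex> lock(_mtx);
        if (_free.empty())
        {
            return nullptr; // the real manager blocks on a semaphore here
        }
        _occupied.splice(std::begin(_occupied), _free, std::begin(_free));
        return _occupied.front().get();
    }

    void unlock_pool(IMemoryPool *pool)
    {
        std::lock_guard<std::mutex> lock(_mtx);
        auto it = std::find_if(std::begin(_occupied), std::end(_occupied),
                               [pool](const std::unique_ptr<IMemoryPool> &p) { return p.get() == pool; });
        if (it != std::end(_occupied))
        {
            _free.splice(std::begin(_free), _occupied, it); // relink, no reallocation
        }
    }

private:
    std::list<std::unique_ptr<IMemoryPool>> _free, _occupied;
    std::mutex                              _mtx;
};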
- */ -#include "arm_compute/runtime/Pyramid.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/PyramidInfo.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/TensorShape.h" - -#include <cmath> - -using namespace arm_compute; - -void Pyramid::init(const PyramidInfo &info) -{ - internal_init(info, false); -} - -void Pyramid::init_auto_padding(const PyramidInfo &info) -{ - internal_init(info, true); -} - -void Pyramid::internal_init(const PyramidInfo &info, bool auto_padding) -{ - _info = info; - _pyramid.resize(_info.num_levels()); - - size_t w = _info.width(); - size_t h = _info.height(); - size_t ref_w = w; - size_t ref_h = h; - bool is_orb_scale = (SCALE_PYRAMID_ORB == _info.scale()); - TensorShape tensor_shape = _info.tensor_shape(); - - // Note: Look-up table used by the OpenVX sample implementation - const std::array<float, 4> c_orbscale = { 0.5f, - SCALE_PYRAMID_ORB, - SCALE_PYRAMID_ORB * SCALE_PYRAMID_ORB, - SCALE_PYRAMID_ORB *SCALE_PYRAMID_ORB * SCALE_PYRAMID_ORB - }; - - for(size_t i = 0; i < _info.num_levels(); ++i) - { - TensorInfo tensor_info(tensor_shape, _info.format()); - - if(auto_padding) - { - tensor_info.auto_padding(); - } - - _pyramid[i].allocator()->init(tensor_info); - - if(is_orb_scale) - { - float orb_scale = c_orbscale[(i + 1) % 4]; - w = static_cast<int>(std::ceil(static_cast<float>(ref_w) * orb_scale)); - h = static_cast<int>(std::ceil(static_cast<float>(ref_h) * orb_scale)); - - if(0 == ((i + 1) % 4)) - { - ref_w = w; - ref_h = h; - } - } - else - { - w = (w + 1) * _info.scale(); - h = (h + 1) * _info.scale(); - } - - // Update tensor_shape - tensor_shape.set(0, w); - tensor_shape.set(1, h); - } -} - -void Pyramid::allocate() -{ - for(size_t i = 0; i < _info.num_levels(); ++i) - { - _pyramid[i].allocator()->allocate(); - } -} - -const PyramidInfo *Pyramid::info() const -{ - return &_info; -} - -Tensor *Pyramid::get_pyramid_level(size_t index) const -{ - ARM_COMPUTE_ERROR_ON(index >= _info.num_levels()); - - return &_pyramid[index]; -} diff --git a/src/runtime/RuntimeContext.cpp b/src/runtime/RuntimeContext.cpp index 308e2788a9..1de8d2abdb 100644 --- a/src/runtime/RuntimeContext.cpp +++ b/src/runtime/RuntimeContext.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 ARM Limited. + * Copyright (c) 2019, 2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -28,8 +28,7 @@ namespace arm_compute { -RuntimeContext::RuntimeContext() - : _owned_scheduler(SchedulerFactory::create()), _scheduler(_owned_scheduler.get()), _device_props() +RuntimeContext::RuntimeContext() : _owned_scheduler(SchedulerFactory::create()), _scheduler(_owned_scheduler.get()) { } @@ -48,9 +47,4 @@ IAssetManager *RuntimeContext::asset_manager() { return nullptr; } - -const DeviceProperties &RuntimeContext::properties() -{ - return _device_props; -} } // namespace arm_compute diff --git a/src/runtime/Scheduler.cpp b/src/runtime/Scheduler.cpp index 380ad90a27..3f1e96968a 100644 --- a/src/runtime/Scheduler.cpp +++ b/src/runtime/Scheduler.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 ARM Limited. + * Copyright (c) 2017-2020, 2023 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -24,7 +24,6 @@ #include "arm_compute/runtime/Scheduler.h" #include "arm_compute/core/Error.h" -#include "support/MemorySupport.h" #if ARM_COMPUTE_CPP_SCHEDULER #include "arm_compute/runtime/CPP/CPPScheduler.h" @@ -55,19 +54,19 @@ namespace std::map<Scheduler::Type, std::unique_ptr<IScheduler>> init() { std::map<Scheduler::Type, std::unique_ptr<IScheduler>> m; - m[Scheduler::Type::ST] = support::cpp14::make_unique<SingleThreadScheduler>(); + m[Scheduler::Type::ST] = std::make_unique<SingleThreadScheduler>(); #if defined(ARM_COMPUTE_CPP_SCHEDULER) - m[Scheduler::Type::CPP] = support::cpp14::make_unique<CPPScheduler>(); + m[Scheduler::Type::CPP] = std::make_unique<CPPScheduler>(); #endif // defined(ARM_COMPUTE_CPP_SCHEDULER) #if defined(ARM_COMPUTE_OPENMP_SCHEDULER) - m[Scheduler::Type::OMP] = support::cpp14::make_unique<OMPScheduler>(); + m[Scheduler::Type::OMP] = std::make_unique<OMPScheduler>(); #endif // defined(ARM_COMPUTE_OPENMP_SCHEDULER) return m; } } // namespace -std::map<Scheduler::Type, std::unique_ptr<IScheduler>> Scheduler::_schedulers = init(); +std::map<Scheduler::Type, std::unique_ptr<IScheduler>> Scheduler::_schedulers{}; void Scheduler::set(Type t) { @@ -77,7 +76,7 @@ void Scheduler::set(Type t) bool Scheduler::is_available(Type t) { - if(t == Type::CUSTOM) + if (t == Type::CUSTOM) { return _custom_scheduler != nullptr; } @@ -94,11 +93,12 @@ Scheduler::Type Scheduler::get_type() IScheduler &Scheduler::get() { - if(_scheduler_type == Type::CUSTOM) + if (_scheduler_type == Type::CUSTOM) { - if(_custom_scheduler == nullptr) + if (_custom_scheduler == nullptr) { - ARM_COMPUTE_ERROR("No custom scheduler has been setup. Call set(std::shared_ptr<IScheduler> &scheduler) before Scheduler::get()"); + ARM_COMPUTE_ERROR("No custom scheduler has been setup. Call set(std::shared_ptr<IScheduler> &scheduler) " + "before Scheduler::get()"); } else { @@ -107,8 +107,13 @@ IScheduler &Scheduler::get() } else { + if (_schedulers.empty()) + { + _schedulers = init(); + } + auto it = _schedulers.find(_scheduler_type); - if(it != _schedulers.end()) + if (it != _schedulers.end()) { return *it->second; } diff --git a/src/runtime/SchedulerFactory.cpp b/src/runtime/SchedulerFactory.cpp index c6c90348b4..4fb08d79f5 100644 --- a/src/runtime/SchedulerFactory.cpp +++ b/src/runtime/SchedulerFactory.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 ARM Limited. + * Copyright (c) 2019-2020 Arm Limited. 
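Editor's note: Scheduler::_schedulers above changes from being populated by init() at static-initialisation time to an empty map that get() fills on first use, removing any dependence on translation-unit initialisation order. The pattern in miniature:

#include <map>
#include <memory>

struct IScheduler
{
    virtual ~IScheduler() = default;
};
struct SingleThreadScheduler : IScheduler
{
};

enum class Type
{
    ST
};

// Default-constructed: nothing runs before the first caller needs it.
static std::map<Type, std::unique_ptr<IScheduler>> schedulers{};

IScheduler &get(Type t)
{
    if (schedulers.empty())
    {
        schedulers[Type::ST] = std::make_unique<SingleThreadScheduler>(); // deferred init()
    }
    return *schedulers.at(t);
}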
* * SPDX-License-Identifier: MIT * @@ -23,8 +23,6 @@ */ #include "arm_compute/runtime/SchedulerFactory.h" -#include "support/MemorySupport.h" - #include "arm_compute/core/Error.h" #if ARM_COMPUTE_CPP_SCHEDULER #include "arm_compute/runtime/CPP/CPPScheduler.h" @@ -50,16 +48,16 @@ const SchedulerFactory::Type SchedulerFactory::_default_type = SchedulerFactory: std::unique_ptr<IScheduler> SchedulerFactory::create(Type type) { - switch(type) + switch (type) { case Type::ST: { - return support::cpp14::make_unique<SingleThreadScheduler>(); + return std::make_unique<SingleThreadScheduler>(); } case Type::CPP: { #if ARM_COMPUTE_CPP_SCHEDULER - return support::cpp14::make_unique<CPPScheduler>(); + return std::make_unique<CPPScheduler>(); #else /* ARM_COMPUTE_CPP_SCHEDULER */ ARM_COMPUTE_ERROR("Recompile with cppthreads=1 to use C++11 scheduler."); #endif /* ARM_COMPUTE_CPP_SCHEDULER */ @@ -67,7 +65,7 @@ std::unique_ptr<IScheduler> SchedulerFactory::create(Type type) case Type::OMP: { #if ARM_COMPUTE_OPENMP_SCHEDULER - return support::cpp14::make_unique<OMPScheduler>(); + return std::make_unique<OMPScheduler>(); #else /* ARM_COMPUTE_OPENMP_SCHEDULER */ ARM_COMPUTE_ERROR("Recompile with openmp=1 to use openmp scheduler."); #endif /* ARM_COMPUTE_OPENMP_SCHEDULER */ diff --git a/src/runtime/SchedulerUtils.cpp b/src/runtime/SchedulerUtils.cpp new file mode 100644 index 0000000000..74ee539fec --- /dev/null +++ b/src/runtime/SchedulerUtils.cpp @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/runtime/SchedulerUtils.h" + +#include "arm_compute/core/Error.h" + +#include <cmath> + +namespace arm_compute +{ +namespace scheduler_utils +{ +#ifndef BARE_METAL +std::pair<unsigned, unsigned> split_2d(unsigned max_threads, std::size_t m, std::size_t n) +{ + /* + * We want the same ratio of threads in M & N to the ratio of m and n problem size + * + * Therefore: mt/nt == m/n where mt*nt == max_threads + * + * max_threads/nt = mt & (max_threads/nt) * (m/n) = nt + * nt^2 = max_threads * (m/n) + * nt = sqrt( max_threads * (m/n) ) + */ + //ratio of m to n in problem dimensions + double ratio = m / static_cast<double>(n); + + // nt = sqrt(max_threads * (m / n) ) + const unsigned adjusted = std::round(std::sqrt(max_threads * ratio)); + + //find the nearest factor of max_threads + for (unsigned i = 0; i != adjusted; ++i) + { + //try down + const unsigned adj_down = adjusted - i; + if (max_threads % adj_down == 0) + { + return {adj_down, max_threads / adj_down}; + } + + //try up + const unsigned adj_up = adjusted + i; + if (max_threads % adj_up == 0) + { + return {adj_up, max_threads / adj_up}; + } + } + + //we didn't find anything so lets bail out with maxes biased to the largest dimension + if (m > n) + { + return {std::min<unsigned>(m, max_threads), 1}; + } + else + { + return {1, std::min<unsigned>(n, max_threads)}; + } +} +#endif /* #ifndef BARE_METAL */ +} // namespace scheduler_utils +} // namespace arm_compute diff --git a/src/runtime/NEON/functions/NECol2Im.cpp b/src/runtime/SchedulerUtils.h index 262ba8f2af..46644a369e 100644 --- a/src/runtime/NEON/functions/NECol2Im.cpp +++ b/src/runtime/SchedulerUtils.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 ARM Limited. + * Copyright (c) 2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,22 +21,25 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/runtime/NEON/functions/NECol2Im.h" +#ifndef SRC_COMPUTE_SCHEDULER_UTILS_H +#define SRC_COMPUTE_SCHEDULER_UTILS_H -#include "arm_compute/core/NEON/kernels/NECol2ImKernel.h" -#include "support/MemorySupport.h" +#include <cstddef> +#include <utility> namespace arm_compute { -void NECol2Im::configure(const ITensor *input, ITensor *output, const Size2D &convolved_dims) +namespace scheduler_utils { - auto k = arm_compute::support::cpp14::make_unique<NECol2ImKernel>(); - k->configure(input, output, convolved_dims); - _kernel = std::move(k); -} - -Status NECol2Im::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &convolved_dims) -{ - return NECol2ImKernel::validate(input, output, convolved_dims); -} +/** Given two dimensions and a maximum number of threads to utilise, calculate the best + * combination of threads that fit in (multiplied together) max_threads. + * + * This algorithm assumes that work in either of the dimensions is equally difficult + * to compute + * + * @returns [m_nthreads, n_nthreads] A pair of the threads that should be used in each dimension + */ +std::pair<unsigned, unsigned> split_2d(unsigned max_threads, std::size_t m, std::size_t n); +} // namespace scheduler_utils } // namespace arm_compute +#endif /* SRC_COMPUTE_SCHEDULER_UTILS_H */ diff --git a/src/runtime/SubTensor.cpp b/src/runtime/SubTensor.cpp index b010a32eca..f87256abb1 100644 --- a/src/runtime/SubTensor.cpp +++ b/src/runtime/SubTensor.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018 ARM Limited. + * Copyright (c) 2017-2018 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -27,8 +27,7 @@ using namespace arm_compute; -SubTensor::SubTensor() - : _parent(nullptr), _info() +SubTensor::SubTensor() : _parent(nullptr), _info() { } diff --git a/src/runtime/Tensor.cpp b/src/runtime/Tensor.cpp index 8f7ecd6ffa..f17e323694 100644 --- a/src/runtime/Tensor.cpp +++ b/src/runtime/Tensor.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 ARM Limited. + * Copyright (c) 2016-2019 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -25,8 +25,7 @@ namespace arm_compute { -Tensor::Tensor(IRuntimeContext *) - : _allocator(this) +Tensor::Tensor(IRuntimeContext *) : _allocator(this) { } diff --git a/src/runtime/TensorAllocator.cpp b/src/runtime/TensorAllocator.cpp index ffd5cc7236..372852bfea 100644 --- a/src/runtime/TensorAllocator.cpp +++ b/src/runtime/TensorAllocator.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020 ARM Limited. + * Copyright (c) 2016-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -28,7 +28,6 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/runtime/MemoryGroup.h" #include "arm_compute/runtime/MemoryRegion.h" -#include "support/MemorySupport.h" #include <cstddef> @@ -44,13 +43,13 @@ bool validate_subtensor_shape(const TensorInfo &parent_info, const TensorInfo &c const size_t parent_dims = parent_info.num_dimensions(); const size_t child_dims = child_info.num_dimensions(); - if(child_dims <= parent_dims) + if (child_dims <= parent_dims) { - for(size_t num_dimensions = child_dims; num_dimensions > 0; --num_dimensions) + for (size_t num_dimensions = child_dims; num_dimensions > 0; --num_dimensions) { const size_t child_dim_size = coords[num_dimensions - 1] + child_shape[num_dimensions - 1]; - if((coords[num_dimensions - 1] < 0) || (child_dim_size > parent_shape[num_dimensions - 1])) + if ((coords[num_dimensions - 1] < 0) || (child_dim_size > parent_shape[num_dimensions - 1])) { is_valid = false; break; @@ -66,8 +65,7 @@ bool validate_subtensor_shape(const TensorInfo &parent_info, const TensorInfo &c } } // namespace -TensorAllocator::TensorAllocator(IMemoryManageable *owner) - : _owner(owner), _associated_memory_group(nullptr), _memory() +TensorAllocator::TensorAllocator(IMemoryManageable *owner) : _owner(owner), _associated_memory_group(nullptr), _memory() { } @@ -89,7 +87,7 @@ TensorAllocator::TensorAllocator(TensorAllocator &&o) noexcept TensorAllocator &TensorAllocator::operator=(TensorAllocator &&o) noexcept { - if(&o != this) + if (&o != this) { _owner = o._owner; o._owner = nullptr; @@ -118,8 +116,10 @@ void TensorAllocator::init(const TensorAllocator &allocator, const Coordinates & _memory = Memory(allocator._memory.region()); // Init tensor info with new dimensions - size_t total_size = parent_info.offset_element_in_bytes(coords) + sub_info.total_size() - sub_info.offset_first_element_in_bytes(); - sub_info.init(sub_info.tensor_shape(), sub_info.format(), parent_info.strides_in_bytes(), parent_info.offset_element_in_bytes(coords), total_size); + size_t total_size = + parent_info.offset_element_in_bytes(coords) + sub_info.total_size() - sub_info.offset_first_element_in_bytes(); + sub_info.init(sub_info.tensor_shape(), sub_info.format(), parent_info.strides_in_bytes(), + parent_info.offset_element_in_bytes(coords), total_size); // Set TensorInfo init(sub_info); @@ -134,9 +134,9 @@ void TensorAllocator::allocate() { // Align to 64-byte boundaries by default if alignment is not specified const size_t alignment_to_use = (alignment() != 0) ? 
alignment() : 64; - if(_associated_memory_group == nullptr) + if (_associated_memory_group == nullptr) { - _memory.set_owned_region(support::cpp14::make_unique<MemoryRegion>(info().total_size(), alignment_to_use)); + _memory.set_owned_region(std::make_unique<MemoryRegion>(info().total_size(), alignment_to_use)); } else { @@ -157,7 +157,7 @@ Status TensorAllocator::import_memory(void *memory) ARM_COMPUTE_RETURN_ERROR_ON(_associated_memory_group != nullptr); ARM_COMPUTE_RETURN_ERROR_ON(alignment() != 0 && !arm_compute::utility::check_aligned(memory, alignment())); - _memory.set_owned_region(support::cpp14::make_unique<MemoryRegion>(memory, info().total_size())); + _memory.set_owned_region(std::make_unique<MemoryRegion>(memory, info().total_size())); info().set_is_resizable(false); return Status{}; diff --git a/src/runtime/TracePoint.cpp b/src/runtime/TracePoint.cpp deleted file mode 100644 index 817d63bdf3..0000000000 --- a/src/runtime/TracePoint.cpp +++ /dev/null @@ -1,49 +0,0 @@ -/* - * Copyright (c) 2020 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/core/TracePoint.h" -#include <stdio.h> -#include <vector> - -#include "arm_compute/core/NEON/kernels/assembly/arm_gemm.hpp" -#include "arm_compute/runtime/Array.h" -#include "arm_compute/runtime/Pyramid.h" -#include "arm_compute/runtime/common/LSTMParams.h" -#include "utils/TypePrinter.h" - -namespace arm_compute -{ -TRACE_TO_STRING(KeyPointArray) -TRACE_TO_STRING(Pyramid) -TRACE_TO_STRING(LSTMParams<ITensor>) -TRACE_TO_STRING(FullyConnectedLayerInfo) -TRACE_TO_STRING(arm_gemm::Requantize32) - -CONST_PTR_CLASS(KeyPointArray) -CONST_PTR_CLASS(Pyramid) -CONST_PTR_CLASS(LSTMParams<ITensor>) -CONST_PTR_CLASS(DetectionPostProcessLayerInfo) -CONST_PTR_CLASS(FullyConnectedLayerInfo) -CONST_PTR_CLASS(GenerateProposalsInfo) -CONST_PTR_CLASS(arm_gemm::Requantize32) -} // namespace arm_compute diff --git a/src/runtime/Utils.cpp b/src/runtime/Utils.cpp index 2204ec11d7..a7f7b5f3cb 100644 --- a/src/runtime/Utils.cpp +++ b/src/runtime/Utils.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,7 +21,7 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
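Editor's note: TensorAllocator::allocate above defaults to 64-byte alignment, a common cache-line size, whenever the tensor did not request one. A standalone restatement of that rule using C++17 std::aligned_alloc (the real code routes through MemoryRegion instead):

#include <cstddef>
#include <cstdlib>

void *allocate_region(std::size_t size, std::size_t requested_alignment)
{
    const std::size_t alignment_to_use = (requested_alignment != 0) ? requested_alignment : 64;
    // std::aligned_alloc requires the size to be a multiple of the alignment.
    const std::size_t padded_size = ((size + alignment_to_use - 1) / alignment_to_use) * alignment_to_use;
    return std::aligned_alloc(alignment_to_use, padded_size); // release with std::free
}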
*/ -#include "arm_compute/runtime/Utils.h" +#include "src/runtime/Utils.h" #include "arm_compute/runtime/NEON/NEScheduler.h" @@ -31,6 +31,8 @@ namespace arm_compute { +namespace utils +{ #ifndef DOXYGEN_SKIP_THIS static const std::string information = #include "arm_compute_version.embed" @@ -39,20 +41,17 @@ static const std::string information = const std::string &string_from_scheduler_type(Scheduler::Type t) { - static std::map<Scheduler::Type, const std::string> scheduler_type_map = - { - { Scheduler::Type::ST, "Single Thread" }, - { Scheduler::Type::CPP, "C++11 Threads" }, - { Scheduler::Type::OMP, "OpenMP Threads" }, - { Scheduler::Type::CUSTOM, "Custom" } - }; + static std::map<Scheduler::Type, const std::string> scheduler_type_map = {{Scheduler::Type::ST, "Single Thread"}, + {Scheduler::Type::CPP, "C++11 Threads"}, + {Scheduler::Type::OMP, "OpenMP Threads"}, + {Scheduler::Type::CUSTOM, "Custom"}}; return scheduler_type_map[t]; } void schedule_kernel_on_ctx(IRuntimeContext *ctx, ICPPKernel *kernel, const IScheduler::Hints &hints) { - if(ctx) + if (ctx) { ARM_COMPUTE_ERROR_ON(ctx->scheduler() == nullptr); ctx->scheduler()->schedule(kernel, hints); @@ -66,7 +65,7 @@ void schedule_kernel_on_ctx(IRuntimeContext *ctx, ICPPKernel *kernel, const ISch unsigned int calculate_number_of_stages_only_x_axis(size_t input_x_dimension, unsigned int axis) { // We need only 1 stage for all axis except x-axis - if(axis != 0) + if (axis != 0) { return 1; } @@ -78,4 +77,5 @@ unsigned int calculate_number_of_stages_only_x_axis(size_t input_x_dimension, un const unsigned int num_of_stages = num_of_wg / 128 + 2; return num_of_stages; } +} // namespace utils } // namespace arm_compute diff --git a/src/runtime/GLES_COMPUTE/GCTensor.cpp b/src/runtime/Utils.h index e05eb4c4ae..f8775c9612 100644 --- a/src/runtime/GLES_COMPUTE/GCTensor.cpp +++ b/src/runtime/Utils.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,63 +21,40 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ +#ifndef SRC_RUNTIME_UTILS_H +#define SRC_RUNTIME_UTILS_H -#include "arm_compute/runtime/GLES_COMPUTE/GCTensor.h" +#include "arm_compute/runtime/IRuntimeContext.h" +#include "arm_compute/runtime/Scheduler.h" -namespace arm_compute -{ -GCTensor::GCTensor(IRuntimeContext *) - : _allocator(this) -{ -} - -ITensorAllocator *GCTensor::allocator() -{ - return &_allocator; -} - -TensorInfo *GCTensor::info() const -{ - return &_allocator.info(); -} - -TensorInfo *GCTensor::info() -{ - return &_allocator.info(); -} - -uint8_t *GCTensor::buffer() const -{ - return _allocator.data(); -} - -GLuint GCTensor::gc_buffer() const -{ - return _allocator.get_gl_ssbo_name(); -} +#include <string> -void GCTensor::associate_memory_group(arm_compute::IMemoryGroup *memory_group) -{ - _allocator.set_associated_memory_group(memory_group); -} - -void GCTensor::map(bool blocking) +namespace arm_compute { - IGCTensor::map(blocking); -} - -void GCTensor::unmap() +namespace utils { - IGCTensor::unmap(); -} +/** Convert a Scheduler::Type into a string. + * + * @param[in] t @ref Scheduler::Type to be translated to string. + * + * @return The string describing the scheduler type. + */ +const std::string &string_from_scheduler_type(Scheduler::Type t); -uint8_t *GCTensor::do_map(bool blocking) -{ - return _allocator.map(blocking); -} +/** Schedules a kernel using the context if not nullptr else uses the legacy scheduling flow. 
+ * + * @param[in] ctx Context to use. + * @param[in] kernel Kernel to schedule. + * @param[in] hints Hints to use. + */ +void schedule_kernel_on_ctx(IRuntimeContext *ctx, ICPPKernel *kernel, const IScheduler::Hints &hints); -void GCTensor::do_unmap() -{ - _allocator.unmap(); -} +/** Calculate number of stages for parallel implementations + * + * @param[in] input_x_dimension input tensor x dimension + * @param[in] axis axis to be used + */ +unsigned int calculate_number_of_stages_only_x_axis(size_t input_x_dimension, unsigned int axis); +} // namespace utils } // namespace arm_compute +#endif /* SRC_RUNTIME_UTILS_H */ diff --git a/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigBifrost.cpp b/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigBifrost.cpp new file mode 100644 index 0000000000..aba32871d0 --- /dev/null +++ b/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigBifrost.cpp @@ -0,0 +1,205 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
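Editor's note: schedule_kernel_on_ctx, now documented in the utils namespace above, prefers a context-owned scheduler and falls back to the global singleton only when no context is given. The rule restated with simplified stand-in types:

struct ICPPKernel
{
};
struct IScheduler
{
    virtual void schedule(ICPPKernel *kernel) = 0;
    virtual ~IScheduler()                     = default;
};
struct IRuntimeContext
{
    virtual IScheduler *scheduler() = 0;
    virtual ~IRuntimeContext()      = default;
};

IScheduler &global_scheduler(); // stand-in for Scheduler::get()

void schedule_kernel_on_ctx(IRuntimeContext *ctx, ICPPKernel *kernel)
{
    if (ctx != nullptr)
    {
        ctx->scheduler()->schedule(kernel); // context-owned scheduler wins
    }
    else
    {
        global_scheduler().schedule(kernel); // legacy global flow
    }
}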
+ */ +#include "src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigBifrost.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/GPUTarget.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/TensorShape.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" + +#include <utility> + +namespace arm_compute +{ +namespace cl_direct_conv +{ +using namespace arm_compute::misc::shape_calculator; + +ClDirectConvDefaultConfigBifrost::ClDirectConvDefaultConfigBifrost(GPUTarget gpu) : IClDirectConvKernelConfig(gpu) +{ +} + +DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info) +{ + using ConfigurationFunctionExecutorPtr = DirectConvComputeKernelInfo (ClDirectConvDefaultConfigBifrost::*)( + const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); + + ClDirectConvConfigArray<ConfigurationFunctionExecutorPtr> configs_G71( + &ClDirectConvDefaultConfigBifrost::configure_G71_f32, &ClDirectConvDefaultConfigBifrost::configure_G71_f16, + &ClDirectConvDefaultConfigBifrost::configure_G71_u8); + + ClDirectConvConfigArray<ConfigurationFunctionExecutorPtr> configs_default( + &ClDirectConvDefaultConfigBifrost::configure_default_f32, + &ClDirectConvDefaultConfigBifrost::configure_default_f16, &ClDirectConvDefaultConfigBifrost::configure_G71_u8); + + ConfigurationFunctionExecutorPtr func = nullptr; + switch (_target) + { + case GPUTarget::G71: + func = configs_G71.get_function(src->data_type()); + break; + default: + func = configs_default.get_function(src->data_type()); + break; + } + + ARM_COMPUTE_ERROR_ON_MSG(func == nullptr, "Data type not supported for direct convolution"); + return (this->*func)(src, wei, conv_info); +} + +DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure_G71_f32(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info) +{ + DirectConvComputeKernelInfo desc; + + if (src->data_layout() == DataLayout::NHWC) + { + // Get the output shape + TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info); + + desc.n0 = 4; + + if (output_shape[0] > 16) + { + desc.m0 = 2; + } + + desc.k0 = 8; + + desc.export_weights_to_cl_image = false; + } + + return desc; +} + +DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure_G71_f16(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info) +{ + DirectConvComputeKernelInfo desc; + + if (src->data_layout() == DataLayout::NHWC) + { + // Get the output shape + TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info); + + desc.n0 = 4; + + if (output_shape[0] > 16) + { + desc.m0 = 4; + } + + desc.k0 = 8; + + desc.export_weights_to_cl_image = false; + } + + return desc; +} + +DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure_G71_u8(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info) +{ + DirectConvComputeKernelInfo desc; + + if (src->data_layout() == DataLayout::NHWC) + { + // Get the output shape + TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info); + + desc.n0 = 4; + + if (output_shape[0] > 16) + { + desc.m0 = 4; + } + + desc.k0 = 16; + + desc.export_weights_to_cl_image = false; + } + + return desc; +} + +DirectConvComputeKernelInfo 
ClDirectConvDefaultConfigBifrost::configure_default_f32(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info) +{ + DirectConvComputeKernelInfo desc; + + if (src->data_layout() == DataLayout::NHWC) + { + // Get the output shape + TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info); + + desc.n0 = 4; + + if (output_shape[0] > 16) + { + desc.m0 = 2; + } + + desc.k0 = 8; + + desc.export_weights_to_cl_image = export_to_cl_image(wei); + } + + return desc; +} + +DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure_default_f16(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info) +{ + DirectConvComputeKernelInfo desc; + + if (src->data_layout() == DataLayout::NHWC) + { + // Get the output shape + TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info); + + desc.n0 = 4; + + if (output_shape[0] > 16) + { + desc.m0 = 4; + } + + desc.k0 = 8; + + desc.export_weights_to_cl_image = export_to_cl_image(wei); + } + + return desc; +} +} // namespace cl_direct_conv +} // namespace arm_compute diff --git a/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigBifrost.h b/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigBifrost.h new file mode 100644 index 0000000000..ed6a4c3c68 --- /dev/null +++ b/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigBifrost.h @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
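Editor's note: configure() earlier in this file dispatches through a table of pointer-to-member functions keyed by data type, so each GPU family only supplies its per-type entries. The idiom in a self-contained form:

#include <array>
#include <cstddef>
#include <cstdio>

class Config
{
public:
    int configure(std::size_t dt_index)
    {
        using Fn = int (Config::*)() const;
        static constexpr std::array<Fn, 2> table{&Config::for_f32, &Config::for_f16};
        const Fn func = table.at(dt_index);
        return (this->*func)(); // invoke through the member pointer
    }

private:
    int for_f32() const
    {
        return 32;
    }
    int for_f16() const
    {
        return 16;
    }
};

int main()
{
    Config cfg;
    std::printf("%d\n", cfg.configure(1)); // prints 16
    return 0;
}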
+ */ +#ifndef SRC_RUNTIME_HEURISTICS_DIRECT_CONV_CLDIRECTCONVDEFAULTCONFIGBIFROST +#define SRC_RUNTIME_HEURISTICS_DIRECT_CONV_CLDIRECTCONVDEFAULTCONFIGBIFROST + +#include "src/runtime/heuristics/direct_conv/IClDirectConvKernelConfig.h" + +namespace arm_compute +{ +namespace cl_direct_conv +{ +/** Bifrost based OpenCL direct convolution configuration */ +class ClDirectConvDefaultConfigBifrost final : public IClDirectConvKernelConfig +{ +public: + /** Constructor + * + * @param[in] gpu GPU target + */ + ClDirectConvDefaultConfigBifrost(GPUTarget gpu); + + // Inherited overridden method + DirectConvComputeKernelInfo + configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) override; + +private: + DirectConvComputeKernelInfo + configure_G71_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); + DirectConvComputeKernelInfo + configure_G71_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); + DirectConvComputeKernelInfo + configure_G71_u8(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); + DirectConvComputeKernelInfo + configure_default_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); + DirectConvComputeKernelInfo + configure_default_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); +}; +} // namespace cl_direct_conv +} // namespace arm_compute +#endif /* SRC_RUNTIME_HEURISTICS_DIRECT_CONV_CLDIRECTCONVDEFAULTCONFIGBIFROST */ diff --git a/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigValhall.cpp b/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigValhall.cpp new file mode 100644 index 0000000000..4b7666d5aa --- /dev/null +++ b/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigValhall.cpp @@ -0,0 +1,413 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigValhall.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/GPUTarget.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/TensorShape.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" + +#include <utility> + +namespace arm_compute +{ +namespace cl_direct_conv +{ +using namespace arm_compute::misc::shape_calculator; + +ClDirectConvDefaultConfigValhall::ClDirectConvDefaultConfigValhall(GPUTarget gpu) : IClDirectConvKernelConfig(gpu) +{ +} + +DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info) +{ + using ConfigurationFunctionExecutorPtr = DirectConvComputeKernelInfo (ClDirectConvDefaultConfigValhall::*)( + const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); + + ClDirectConvConfigArray<ConfigurationFunctionExecutorPtr> configs_G78( + &ClDirectConvDefaultConfigValhall::configure_G78_f32, &ClDirectConvDefaultConfigValhall::configure_G78_f16, + &ClDirectConvDefaultConfigValhall::configure_G78_u8); + + ClDirectConvConfigArray<ConfigurationFunctionExecutorPtr> configs_G57( + &ClDirectConvDefaultConfigValhall::configure_G57_f32, &ClDirectConvDefaultConfigValhall::configure_G57_f16, + &ClDirectConvDefaultConfigValhall::configure_G78_u8); + + ConfigurationFunctionExecutorPtr func = nullptr; + switch (_target) + { + case GPUTarget::G57: + func = configs_G57.get_function(src->data_type()); + break; + case GPUTarget::G78: + default: + func = configs_G78.get_function(src->data_type()); + break; + } + + ARM_COMPUTE_ERROR_ON_MSG(func == nullptr, "Data type not supported for direct convolution"); + return (this->*func)(src, wei, conv_info); +} + +DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G78_f32(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info) +{ + DirectConvComputeKernelInfo desc; + + if (src->data_layout() == DataLayout::NHWC) + { + // Get the output shape + const TensorShape wei_shape = wei->tensor_shape(); + const TensorShape dst_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info); + const bool export_weights_to_cl_image = export_to_cl_image(wei); + + const int32_t ofm = dst_shape[0]; + const int32_t m = dst_shape[1] * dst_shape[2]; + const bool is_pointwise = (wei_shape[1] == wei_shape[2]) && wei_shape[1] == 1; + + desc.export_weights_to_cl_image = export_weights_to_cl_image; + + if (dst_shape[0] <= 4) + { + if (is_pointwise) + { + if (ofm == 4) + { + desc.m0 = 1; + desc.n0 = 4; + desc.k0 = 16; + } + else + { + desc.m0 = 1; + desc.n0 = 1; + desc.k0 = 16; + } + } + else + { + desc.m0 = 1; + desc.n0 = 2; + desc.k0 = 16; + } + } + else + { + if (m < 64) + { + desc.m0 = 1; + desc.n0 = 1; + desc.k0 = 16; + } + else + { + desc.m0 = 4; + desc.n0 = 4; + desc.k0 = 4; + } + } + } + + return desc; +} + +DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G78_f16(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info) +{ + DirectConvComputeKernelInfo desc; + + if (src->data_layout() == DataLayout::NHWC) + { + // Get the output shape + const TensorShape wei_shape = wei->tensor_shape(); + const TensorShape dst_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info); + const bool export_weights_to_cl_image = export_to_cl_image(wei); + 
+ const int32_t ofm = dst_shape[0]; + const int32_t m = dst_shape[1] * dst_shape[2]; + const int32_t k = wei_shape[0]; + const bool is_pointwise = (wei_shape[1] == wei_shape[2]) && wei_shape[1] == 1; + + desc.export_weights_to_cl_image = export_weights_to_cl_image; + + if (dst_shape[0] <= 4) + { + // k0 should be as larger as possible. However, we should avoid + // having left-over for loops that make the implementation slower. + if ((k % 16) == 0) + { + desc.k0 = 16; + } + else if ((k % 8) == 0) + { + desc.k0 = 8; + } + else + { + desc.k0 = 4; + } + + if (is_pointwise) + { + if (ofm == 4) + { + desc.m0 = 1; + desc.n0 = 4; + } + else + { + desc.m0 = 1; + desc.n0 = 1; + } + } + else + { + desc.m0 = 1; + desc.n0 = dst_shape[0]; + } + } + else + { + if (m < 64) + { + desc.m0 = 1; + desc.n0 = 1; + if ((k % 16) == 0) + { + desc.k0 = 16; + } + else if ((k % 8) == 0) + { + desc.k0 = 8; + } + else + { + desc.k0 = 4; + } + } + else + { + if (ofm >= 16) + { + if (m / 6 > 24000) + { + desc.m0 = 6; + } + else + { + desc.m0 = 5; + } + desc.n0 = 8; + desc.k0 = 4; + } + else + { + desc.m0 = 2; + desc.n0 = 8; + if ((k % 16) == 0) + { + desc.k0 = 16; + } + else if ((k % 8) == 0) + { + desc.k0 = 8; + } + else + { + desc.k0 = 4; + } + } + } + } + } + + return desc; +} + +DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G78_u8(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info) +{ + DirectConvComputeKernelInfo desc; + + if (src->data_layout() == DataLayout::NHWC) + { + // Get the output shape + TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info); + + desc.n0 = 4; + + if (output_shape[0] > 16) + { + desc.m0 = 4; + } + + desc.k0 = 16; + + desc.export_weights_to_cl_image = false; + } + + return desc; +} + +DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G57_f32(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info) +{ + DirectConvComputeKernelInfo desc; + + if (src->data_layout() == DataLayout::NHWC) + { + // Get the output shape + const TensorShape wei_shape = wei->tensor_shape(); + const TensorShape dst_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info); + const bool export_weights_to_cl_image = export_to_cl_image(wei); + + const int32_t m = dst_shape[1] * dst_shape[2]; + const bool is_pointwise = (wei_shape[1] == wei_shape[2]) && wei_shape[1] == 1; + + desc.export_weights_to_cl_image = export_weights_to_cl_image; + + if (dst_shape[0] <= 4) + { + if (is_pointwise) + { + desc.m0 = 1; + desc.n0 = 1; + desc.k0 = 16; + } + else + { + desc.m0 = 1; + desc.n0 = dst_shape[0]; + desc.k0 = 16; + } + } + else + { + if (m < 64) + { + if (m == 1) + { + desc.m0 = 1; + desc.n0 = 1; + desc.k0 = 16; + } + else + { + desc.m0 = 4; + desc.n0 = 2; + desc.k0 = 8; + } + } + else + { + desc.m0 = 4; + desc.n0 = 4; + desc.k0 = 4; + } + } + } + + return desc; +} + +DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G57_f16(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info) +{ + DirectConvComputeKernelInfo desc; + + if (src->data_layout() == DataLayout::NHWC) + { + // Get the output shape + const TensorShape wei_shape = wei->tensor_shape(); + const TensorShape dst_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info); + const bool export_weights_to_cl_image = export_to_cl_image(wei); + + const int32_t ofm = dst_shape[0]; + const int32_t m = dst_shape[1] * 
dst_shape[2]; + const bool is_pointwise = (wei_shape[1] == wei_shape[2]) && wei_shape[1] == 1; + + desc.export_weights_to_cl_image = export_weights_to_cl_image; + + if (dst_shape[0] <= 4) + { + if (is_pointwise) + { + desc.m0 = 2; + desc.n0 = 1; + desc.k0 = 16; + } + else + { + desc.m0 = 1; + desc.n0 = dst_shape[0]; + desc.k0 = 16; + } + } + else + { + if (m < 64) + { + if (m == 1) + { + desc.m0 = 1; + desc.n0 = 1; + desc.k0 = 16; + } + else + { + desc.m0 = 4; + desc.n0 = 2; + desc.k0 = 8; + } + } + else + { + if (ofm > 16) + { + desc.m0 = 4; + desc.n0 = 8; + desc.k0 = 8; + } + else + { + desc.m0 = 8; + desc.n0 = 4; + desc.k0 = 4; + } + } + } + } + + return desc; +} +} // namespace cl_direct_conv +} // namespace arm_compute diff --git a/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigValhall.h b/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigValhall.h new file mode 100644 index 0000000000..efd879a567 --- /dev/null +++ b/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigValhall.h @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
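Editor's note: the Valhall f16 heuristics above pick k0 as large as possible, taking the largest of {16, 8, 4} that divides the weights' channel count k so the inner loop has no left-over iterations. The selection rule in isolation:

#include <cassert>
#include <cstdint>

std::int32_t choose_k0(std::int32_t k)
{
    if ((k % 16) == 0)
    {
        return 16;
    }
    if ((k % 8) == 0)
    {
        return 8;
    }
    return 4; // may leave a remainder, but keeps a usable vector width
}

int main()
{
    assert(choose_k0(64) == 16);
    assert(choose_k0(24) == 8); // 24 % 16 != 0, 24 % 8 == 0
    assert(choose_k0(10) == 4);
    return 0;
}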
+ */ +#ifndef SRC_RUNTIME_HEURISTICS_DIRECT_CONV_CLDIRECTCONVDEFAULTCONFIGVALHALL +#define SRC_RUNTIME_HEURISTICS_DIRECT_CONV_CLDIRECTCONVDEFAULTCONFIGVALHALL + +#include "src/runtime/heuristics/direct_conv/IClDirectConvKernelConfig.h" + +namespace arm_compute +{ +namespace cl_direct_conv +{ +/** Valhall based OpenCL direct convolution configuration */ +class ClDirectConvDefaultConfigValhall final : public IClDirectConvKernelConfig +{ +public: + /** Constructor + * + * @param[in] gpu GPU target + */ + ClDirectConvDefaultConfigValhall(GPUTarget gpu); + + // Inherited overridden method + DirectConvComputeKernelInfo + configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) override; + +private: + DirectConvComputeKernelInfo + configure_G78_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); + DirectConvComputeKernelInfo + configure_G78_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); + DirectConvComputeKernelInfo + configure_G78_u8(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); + DirectConvComputeKernelInfo + configure_G57_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); + DirectConvComputeKernelInfo + configure_G57_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); +}; +} // namespace cl_direct_conv +} // namespace arm_compute +#endif /* SRC_RUNTIME_HEURISTICS_DIRECT_CONV_CLDIRECTCONVDEFAULTCONFIGVALHALL */ diff --git a/src/runtime/heuristics/direct_conv/ClDirectConvKernelConfig.h b/src/runtime/heuristics/direct_conv/ClDirectConvKernelConfig.h new file mode 100644 index 0000000000..215b17ef79 --- /dev/null +++ b/src/runtime/heuristics/direct_conv/ClDirectConvKernelConfig.h @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2022-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef ACL_SRC_RUNTIME_HEURISTICS_DIRECT_CONV_CLDIRECTCONVKERNELCONFIG_H +#define ACL_SRC_RUNTIME_HEURISTICS_DIRECT_CONV_CLDIRECTCONVKERNELCONFIG_H + +#include "src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigBifrost.h" +#include "src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigValhall.h" +#include "src/runtime/heuristics/direct_conv/IClDirectConvKernelConfig.h" + +#include <memory> + +namespace arm_compute +{ +namespace cl_direct_conv +{ +/** ClDirectConvolution factory class */ +class ClDirectConvKernelConfigurationFactory final +{ +public: + /** Static method to call the ClDirectConvolution kernel configuration class accordingly with the GPU target + * + * @param[in] gpu GPU target + * + * @return IClDirectConvKernelConfig + */ + static std::unique_ptr<IClDirectConvKernelConfig> create(GPUTarget gpu) + { + switch (get_arch_from_target(gpu)) + { + case GPUTarget::MIDGARD: + return std::make_unique<ClDirectConvDefaultConfigBifrost>(GPUTarget::G71); + case GPUTarget::BIFROST: + return std::make_unique<ClDirectConvDefaultConfigBifrost>(gpu); + case GPUTarget::VALHALL: + case GPUTarget::FIFTHGEN: + return std::make_unique<ClDirectConvDefaultConfigValhall>(gpu); + default: + ARM_COMPUTE_ERROR("Not supported GPU target"); + } + } +}; +} // namespace cl_direct_conv +} // namespace arm_compute +#endif // ACL_SRC_RUNTIME_HEURISTICS_DIRECT_CONV_CLDIRECTCONVKERNELCONFIG_H diff --git a/src/runtime/heuristics/direct_conv/IClDirectConvKernelConfig.h b/src/runtime/heuristics/direct_conv/IClDirectConvKernelConfig.h new file mode 100644 index 0000000000..e5b270c720 --- /dev/null +++ b/src/runtime/heuristics/direct_conv/IClDirectConvKernelConfig.h @@ -0,0 +1,115 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
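[Editor's note] From the caller's side, the factory above reduces target-specific tuning to two calls: create() to obtain the right IClDirectConvKernelConfig subclass (note that Midgard deliberately reuses the Bifrost G71 tuning), and configure() to obtain the descriptor. A minimal usage sketch, assuming src_info, wei_info and conv_info are valid objects prepared elsewhere; query_direct_conv is a hypothetical helper name, not part of this patch.

#include "arm_compute/core/Types.h"
#include "src/runtime/heuristics/direct_conv/ClDirectConvKernelConfig.h"

using namespace arm_compute;

// Hypothetical helper: ask the heuristic for a direct-conv kernel descriptor.
DirectConvComputeKernelInfo query_direct_conv(const ITensorInfo   *src_info,
                                              const ITensorInfo   *wei_info,
                                              const PadStrideInfo &conv_info,
                                              GPUTarget            gpu)
{
    auto config = cl_direct_conv::ClDirectConvKernelConfigurationFactory::create(gpu);
    return config->configure(src_info, wei_info, conv_info); // m0/n0/k0 + cl_image flag
}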
+ */ +#ifndef SRC_RUNTIME_HEURISTICS_DIRECT_CONV_ICLDIRECTCONVKERNELCONFIG +#define SRC_RUNTIME_HEURISTICS_DIRECT_CONV_ICLDIRECTCONVKERNELCONFIG + +#include "arm_compute/core/GPUTarget.h" +#include "arm_compute/core/KernelDescriptors.h" +#include "arm_compute/core/Types.h" + +#include "src/core/common/Macros.h" + +namespace arm_compute +{ +namespace cl_direct_conv +{ +/** Basic container for the OpenCL direct convolution configuration functions */ +template <class T> +class ClDirectConvConfigArray +{ +public: + /** Alias for F32 index */ + static constexpr size_t DT_F32 = 0; + /** Alias for F16 index */ + static constexpr size_t DT_F16 = 1; + /** Alias for Int8 index */ + static constexpr size_t DT_INT8 = 2; + + /** Constructor + * + * @param[in] func_f32 Function to call for direct convolution F32 + * @param[in] func_f16 Function to call for direct convolution F16 + * @param[in] func_int8 Function to call for direct convolution Int8 (QASYMM8, QASYMM8_SIGNED, QSYMM8_PER_CHANNEL) + * + */ + ClDirectConvConfigArray(T func_f32, T func_f16, T func_int8) : _configs{func_f32, func_f16, func_int8} + { + } + + /** Method to return the direct convolution configuration function based on data type + * + * @param[in] data_type Input data type + * + * @return the valid function otherwise it returns nullptr if the data type is not valid + */ + T get_function(DataType data_type) + { + switch (data_type) + { + case DataType::F32: + return _configs.at(DT_F32); + case DataType::F16: + return _configs.at(DT_F16); + case DataType::QASYMM8: + case DataType::QASYMM8_SIGNED: + case DataType::QSYMM8_PER_CHANNEL: + return _configs.at(DT_INT8); + default: + return nullptr; + } + } + +private: + std::array<T, 3> _configs; +}; + +/** Basic interface for the Direct convolution kernel configuration */ +class IClDirectConvKernelConfig +{ +public: + /** Constructor + * + * @param[in] arch GPU target + */ + IClDirectConvKernelConfig(GPUTarget arch) : _target(arch) + { + } + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(IClDirectConvKernelConfig); + /** Virtual destructor */ + virtual ~IClDirectConvKernelConfig() = default; + /** This method returns the @ref DirectConvComputeKernelInfo for the given inputs + * + * @param[in] src Source tensor (activation tensor) + * @param[in] wei Weights tensor + * @param[in] conv_info Convolution info + */ + virtual DirectConvComputeKernelInfo + configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) = 0; + +protected: + GPUTarget _target; +}; +} // namespace cl_direct_conv +} // namespace arm_compute +#endif /* SRC_RUNTIME_HEURISTICS_DIRECT_CONV_ICLDIRECTCONVKERNELCONFIG */ diff --git a/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.cpp b/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.cpp new file mode 100644 index 0000000000..98ebf3ebbe --- /dev/null +++ b/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.cpp @@ -0,0 +1,309 @@ +/* + * Copyright (c) 2022-2023 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/GPUTarget.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/TensorShape.h" +#include "arm_compute/core/utils/helpers/AdjustVecSize.h" + +#include "src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.h" + +namespace arm_compute +{ +namespace cl_dwc +{ +namespace +{ +DWCComputeKernelInfo configure_f32(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier, + bool is_g71) +{ + DWCComputeKernelInfo desc; + + if (src->data_layout() == DataLayout::NHWC) + { + const size_t idx_c = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::CHANNEL); + const size_t idx_w = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::WIDTH); + const TensorShape wei_shape = wei->tensor_shape(); + const size_t kernel_c = wei_shape[idx_c]; + const size_t kernel_w = wei_shape[idx_w]; + + desc.export_input_to_cl_image = false; + + if (is_g71) + { + desc.export_weights_to_cl_image = false; + } + else + { + desc.export_weights_to_cl_image = use_cl_image_for_weights(wei, depth_multiplier); + } + + if (depth_multiplier == 1) + { + desc.n0 = 4; + } + else + { + if ((depth_multiplier % 4) == 0) + { + desc.n0 = 4; + } + else if ((depth_multiplier % 2) == 0) + { + desc.n0 = 2; + } + else + { + desc.n0 = 1; + } + } + + // Note: If we reduce n0, export to cl_image must be false + ARM_COMPUTE_ERROR_ON((adjust_vec_size(desc.n0, kernel_c) != desc.n0) && + (desc.export_weights_to_cl_image == true)); + + desc.n0 = adjust_vec_size(desc.n0, kernel_c); + + // Set m0 only if stride_x == 1 and dilation_x == 1 + if (conv_info.stride().first == 1 && dilation.x() == 1) + { + if ((kernel_w >= 9) || (kernel_w == 1)) + { + desc.m0 = 1; + } + else + { + desc.m0 = 2; + } + } + else + { + desc.m0 = 1; + } + } + + return desc; +} + +DWCComputeKernelInfo configure_f16(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier, + bool is_g71) +{ + DWCComputeKernelInfo desc; + + if (src->data_layout() == DataLayout::NHWC) + { + // Src and weights have the same dimension indices + const size_t idx_c = 
get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::CHANNEL); + const size_t idx_w = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::WIDTH); + const TensorShape src_shape = src->tensor_shape(); + const TensorShape wei_shape = wei->tensor_shape(); + const size_t src_w = src_shape[idx_w]; + const size_t kernel_c = wei_shape[idx_c]; + const size_t kernel_w = wei_shape[idx_w]; + + desc.export_input_to_cl_image = false; + + if (is_g71) + { + desc.export_weights_to_cl_image = false; + } + else + { + desc.export_weights_to_cl_image = use_cl_image_for_weights(wei, depth_multiplier); + } + + if (depth_multiplier == 1) + { + if (desc.export_weights_to_cl_image == false) + { + desc.n0 = 8; + } + else + { + desc.n0 = 4; + } + } + else + { + if ((depth_multiplier % 4) == 0) + { + desc.n0 = 4; + } + else if ((depth_multiplier % 2) == 0) + { + desc.n0 = 2; + } + else + { + desc.n0 = 1; + } + } + + // Note: If we reduce n0, export to cl_image must be false + ARM_COMPUTE_ERROR_ON((adjust_vec_size(desc.n0, kernel_c) != desc.n0) && + (desc.export_weights_to_cl_image == true)); + + desc.n0 = adjust_vec_size(desc.n0, kernel_c); + + // Set m0 only if stride_x == 1 and dilation_x == 1 + if (conv_info.stride().first == 1 && dilation.x() == 1) + { + if ((kernel_w >= 9) || (kernel_w == 1)) + { + desc.m0 = 1; + } + else + { + if ((src_w % 5) == 0) + { + desc.m0 = 5; + } + else + { + desc.m0 = 4; + } + } + } + else + { + desc.m0 = 1; + } + } + + return desc; +} +} // namespace + +ClDWCNativeDefaultConfigBifrost::ClDWCNativeDefaultConfigBifrost(GPUTarget gpu) : IClDWCNativeKernelConfig(gpu) +{ +} + +DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier) +{ + using ConfigurationFunctionExecutorPtr = DWCComputeKernelInfo (ClDWCNativeDefaultConfigBifrost::*)( + const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, + unsigned int depth_multiplier); + + ClDWCNativeConfigArray<ConfigurationFunctionExecutorPtr> configs_G71( + &ClDWCNativeDefaultConfigBifrost::configure_G71_f32, &ClDWCNativeDefaultConfigBifrost::configure_G71_f16, + &ClDWCNativeDefaultConfigBifrost::configure_G7x_u8); + + ClDWCNativeConfigArray<ConfigurationFunctionExecutorPtr> configs_G7x( + &ClDWCNativeDefaultConfigBifrost::configure_G7x_f32, &ClDWCNativeDefaultConfigBifrost::configure_G7x_f16, + &ClDWCNativeDefaultConfigBifrost::configure_G7x_u8); + + ConfigurationFunctionExecutorPtr func = nullptr; + switch (_target) + { + case GPUTarget::G71: + func = configs_G71.get_function(src->data_type()); + break; + default: + func = configs_G7x.get_function(src->data_type()); + break; + } + + ARM_COMPUTE_ERROR_ON_MSG(func == nullptr, "Data type not supported for depthwise convolution"); + return (this->*func)(src, wei, conv_info, dilation, depth_multiplier); +} + +DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure_G71_f32(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier) +{ + return configure_f32(src, wei, conv_info, dilation, depth_multiplier, true); +} + +DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure_G71_f16(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier) +{ + return configure_f16(src, wei, conv_info, 
dilation, depth_multiplier, true); +} + +DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure_G7x_f32(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier) +{ + return configure_f32(src, wei, conv_info, dilation, depth_multiplier, false); +} + +DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure_G7x_f16(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier) +{ + return configure_f16(src, wei, conv_info, dilation, depth_multiplier, false); +} + +DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure_G7x_u8(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier) +{ + ARM_COMPUTE_UNUSED(wei); + + DWCComputeKernelInfo desc; + + if (src->data_layout() == DataLayout::NHWC) + { + desc.export_input_to_cl_image = false; + desc.export_weights_to_cl_image = false; + desc.n0 = (depth_multiplier == 1) ? 4 : 1; + if (conv_info.stride().first == 1 && dilation.x() == 1 && depth_multiplier == 1) + { + desc.m0 = 2; + } + else + { + desc.m0 = 1; + } + } + + return desc; +} +} // namespace cl_dwc +} // namespace arm_compute diff --git a/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.h b/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.h new file mode 100644 index 0000000000..41d86c9c14 --- /dev/null +++ b/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.h @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
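[Editor's note] A detail worth noting in the F32/F16 helpers above: n0 is first chosen from the depth-multiplier divisibility ladder and then passed through adjust_vec_size() so the vector width fits the actual channel count, and the preceding assert encodes the invariant that this clamp must never change n0 once the weights have been committed to cl_image storage. The sketch below restates the ladder with a simplified clamp; pick_n0 is hypothetical, and the real adjust_vec_size() (AdjustVecSize.h) handles more cases than shown.

#include <cstddef>

// Hypothetical restatement of the n0 selection above (assumes kernel_c >= 1).
std::size_t pick_n0(unsigned int depth_multiplier, std::size_t kernel_c)
{
    std::size_t n0 = 1;
    if (depth_multiplier == 1 || (depth_multiplier % 4) == 0)
    {
        n0 = 4; // prefer 4-wide vector accesses
    }
    else if ((depth_multiplier % 2) == 0)
    {
        n0 = 2;
    }

    // Simplified stand-in for adjust_vec_size(): never exceed the channel count.
    while (n0 > kernel_c)
    {
        n0 /= 2;
    }
    return n0;
}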
+ */ +#ifndef SRC_RUNTIME_HEURISTICS_DWC_NATIVE_CLDWCNATIVEDEFAULTCONFIGBIFROST +#define SRC_RUNTIME_HEURISTICS_DWC_NATIVE_CLDWCNATIVEDEFAULTCONFIGBIFROST + +#include "src/runtime/heuristics/dwc_native/IClDWCNativeKernelConfig.h" + +namespace arm_compute +{ +namespace cl_dwc +{ +/** Bifrost based OpenCL depthwise convolution configuration */ +class ClDWCNativeDefaultConfigBifrost final : public IClDWCNativeKernelConfig +{ +public: + /** Constructor + * + * @param[in] gpu GPU target + */ + ClDWCNativeDefaultConfigBifrost(GPUTarget gpu); + + // Inherited overridden method + DWCComputeKernelInfo configure(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier) override; + +private: + DWCComputeKernelInfo configure_G71_f32(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier); + DWCComputeKernelInfo configure_G71_f16(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier); + DWCComputeKernelInfo configure_G7x_f32(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier); + DWCComputeKernelInfo configure_G7x_f16(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier); + DWCComputeKernelInfo configure_G7x_u8(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier); +}; +} // namespace cl_dwc +} // namespace arm_compute +#endif /* SRC_RUNTIME_HEURISTICS_DWC_NATIVE_CLDWCNATIVEDEFAULTCONFIGBIFROST */ diff --git a/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.cpp b/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.cpp new file mode 100644 index 0000000000..ef1bb3858c --- /dev/null +++ b/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.cpp @@ -0,0 +1,326 @@ +/* + * Copyright (c) 2022-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/GPUTarget.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/TensorShape.h" +#include "arm_compute/core/utils/helpers/AdjustVecSize.h" + +#include "src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.h" + +namespace arm_compute +{ +namespace cl_dwc +{ +ClDWCNativeDefaultConfigValhall::ClDWCNativeDefaultConfigValhall(GPUTarget gpu) : IClDWCNativeKernelConfig(gpu) +{ +} + +DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier) +{ + using ConfigurationFunctionExecutorPtr = DWCComputeKernelInfo (ClDWCNativeDefaultConfigValhall::*)( + const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, + unsigned int depth_multiplier); + + ClDWCNativeConfigArray<ConfigurationFunctionExecutorPtr> configs_G78( + &ClDWCNativeDefaultConfigValhall::configure_G78_f32, &ClDWCNativeDefaultConfigValhall::configure_G78_f16, + &ClDWCNativeDefaultConfigValhall::configure_G78_u8); + + ClDWCNativeConfigArray<ConfigurationFunctionExecutorPtr> configs_G77( + &ClDWCNativeDefaultConfigValhall::configure_G78_f32, &ClDWCNativeDefaultConfigValhall::configure_G77_f16, + &ClDWCNativeDefaultConfigValhall::configure_G78_u8); + + ConfigurationFunctionExecutorPtr func = nullptr; + switch (_target) + { + case GPUTarget::G77: + func = configs_G77.get_function(src->data_type()); + break; + case GPUTarget::G78: + default: + func = configs_G78.get_function(src->data_type()); + break; + } + + ARM_COMPUTE_ERROR_ON_MSG(func == nullptr, "Data type not supported for depthwise convolution"); + return (this->*func)(src, wei, conv_info, dilation, depth_multiplier); +} + +DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G78_f32(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier) +{ + DWCComputeKernelInfo desc; + + if (src->data_layout() == DataLayout::NHWC) + { + const size_t idx_c = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::CHANNEL); + const size_t idx_w = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::WIDTH); + const TensorShape wei_shape = wei->tensor_shape(); + const size_t kernel_c = wei_shape[idx_c]; + const size_t kernel_w = wei_shape[idx_w]; + + desc.export_input_to_cl_image = false; + desc.export_weights_to_cl_image = use_cl_image_for_weights(wei, depth_multiplier); + + if (depth_multiplier == 1) + { + desc.n0 = 4; + } + else + { + if ((depth_multiplier % 4) == 0) + { + desc.n0 = 4; + } + else if ((depth_multiplier % 2) == 0) + { + desc.n0 = 2; + } + else + { + desc.n0 = 1; + } + } + + // Note: If we reduce n0, export to cl_image must be false + ARM_COMPUTE_ERROR_ON((adjust_vec_size(desc.n0, kernel_c) != desc.n0) && + (desc.export_weights_to_cl_image == true)); + + desc.n0 = adjust_vec_size(desc.n0, kernel_c); + + // Set m0 only if stride_x == 1 and dilation_x == 1 + if (conv_info.stride().first == 1 && dilation.x() == 1) + { + if ((kernel_w >= 9) || (kernel_w == 1)) + { + desc.m0 = 1; + } + else + { + desc.m0 = 2; + } + } + else + { + desc.m0 = 1; + } + } + + return desc; +} + +DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G78_f16(const ITensorInfo *src, + const ITensorInfo 
*wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier) +{ + DWCComputeKernelInfo desc; + + if (src->data_layout() == DataLayout::NHWC) + { + // Src and weights have the same dimension indices + const size_t idx_c = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::CHANNEL); + const size_t idx_w = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::WIDTH); + const TensorShape src_shape = src->tensor_shape(); + const TensorShape wei_shape = wei->tensor_shape(); + const size_t src_w = src_shape[idx_w]; + const size_t kernel_c = wei_shape[idx_c]; + const size_t kernel_w = wei_shape[idx_w]; + + desc.export_input_to_cl_image = false; + desc.export_weights_to_cl_image = use_cl_image_for_weights(wei, depth_multiplier); + + if (depth_multiplier == 1) + { + if (desc.export_weights_to_cl_image == false) + { + desc.n0 = 8; + } + else + { + desc.n0 = 4; + } + } + else + { + if ((depth_multiplier % 4) == 0) + { + desc.n0 = 4; + } + else if ((depth_multiplier % 2) == 0) + { + desc.n0 = 2; + } + else + { + desc.n0 = 1; + } + } + + // Note: If we reduce n0, export to cl_image must be false + ARM_COMPUTE_ERROR_ON((adjust_vec_size(desc.n0, kernel_c) != desc.n0) && + (desc.export_weights_to_cl_image == true)); + + desc.n0 = adjust_vec_size(desc.n0, kernel_c); + + // Set m0 only if stride_x == 1 and dilation_x == 1 + if (conv_info.stride().first == 1 && dilation.x() == 1) + { + if ((kernel_w >= 9) || (kernel_w == 1)) + { + desc.m0 = 1; + } + else + { + if ((src_w % 5) == 0) + { + desc.m0 = 5; + } + else + { + desc.m0 = 4; + } + } + } + else + { + desc.m0 = 1; + } + } + + return desc; +} + +DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G78_u8(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier) +{ + ARM_COMPUTE_UNUSED(wei); + + DWCComputeKernelInfo desc; + + if (src->data_layout() == DataLayout::NHWC) + { + desc.export_input_to_cl_image = false; + desc.export_weights_to_cl_image = false; + desc.n0 = (depth_multiplier == 1) ? 
4 : 1; + if (conv_info.stride().first == 1 && dilation.x() == 1 && depth_multiplier == 1) + { + desc.m0 = 2; + } + else + { + desc.m0 = 1; + } + } + + return desc; +} + +DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G77_f16(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier) +{ + DWCComputeKernelInfo desc; + + if (src->data_layout() == DataLayout::NHWC) + { + const size_t idx_c = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::CHANNEL); + const size_t idx_w = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::WIDTH); + const TensorShape wei_shape = wei->tensor_shape(); + const size_t kernel_c = wei_shape[idx_c]; + const size_t kernel_w = wei_shape[idx_w]; + + desc.export_input_to_cl_image = false; + desc.export_weights_to_cl_image = use_cl_image_for_weights(wei, depth_multiplier); + + if (depth_multiplier == 1) + { + if (desc.export_weights_to_cl_image == false) + { + desc.n0 = 8; + } + else + { + desc.n0 = 4; + } + } + else + { + if ((depth_multiplier % 4) == 0) + { + desc.n0 = 4; + } + else if ((depth_multiplier % 2) == 0) + { + desc.n0 = 2; + } + else + { + desc.n0 = 1; + } + } + + // Note: If we reduce n0, export to cl_image must be false + ARM_COMPUTE_ERROR_ON((adjust_vec_size(desc.n0, kernel_c) != desc.n0) && + (desc.export_weights_to_cl_image == true)); + + desc.n0 = adjust_vec_size(desc.n0, kernel_c); + + // Set m0 only if stride_x == 1 and dilation_x == 1 + if (conv_info.stride().first == 1 && dilation.x() == 1) + { + if ((kernel_w >= 9) || (kernel_w == 1)) + { + desc.m0 = 1; + } + else + { + desc.m0 = 2; + } + } + else + { + desc.m0 = 1; + } + } + + return desc; +} +} // namespace cl_dwc +} // namespace arm_compute diff --git a/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.h b/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.h new file mode 100644 index 0000000000..fabce77b54 --- /dev/null +++ b/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.h @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
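[Editor's note] For quantized types, the heuristic above is much simpler than the floating-point paths: cl_image export is disabled outright, n0 is 4 only when depth_multiplier == 1, and m0 is raised to 2 only when the kernel walks the input with unit stride and unit dilation. A compact restatement with hypothetical names:

// Compact restatement of the quantized (u8) depthwise heuristic above.
struct DwcDesc
{
    unsigned int n0;
    unsigned int m0;
    bool         export_weights_to_cl_image;
};

DwcDesc pick_u8_desc(unsigned int depth_multiplier, unsigned int stride_x, unsigned int dilation_x)
{
    DwcDesc d{};
    d.export_weights_to_cl_image = false;   // cl_image is reserved for FP weights here
    d.n0 = (depth_multiplier == 1) ? 4 : 1; // vectorise channels only without a multiplier

    const bool unit_step = (stride_x == 1) && (dilation_x == 1);
    d.m0 = (unit_step && depth_multiplier == 1) ? 2 : 1; // row reuse needs unit stride/dilation
    return d;
}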
+ */ +#ifndef SRC_RUNTIME_HEURISTICS_DWC_NATIVE_CLDWCNATIVEDEFAULTCONFIGVALHALL +#define SRC_RUNTIME_HEURISTICS_DWC_NATIVE_CLDWCNATIVEDEFAULTCONFIGVALHALL + +#include "src/runtime/heuristics/dwc_native/IClDWCNativeKernelConfig.h" + +namespace arm_compute +{ +namespace cl_dwc +{ +/** Valhall based OpenCL depthwise convolution configuration */ +class ClDWCNativeDefaultConfigValhall final : public IClDWCNativeKernelConfig +{ +public: + /** Constructor + * + * @param[in] gpu GPU target + */ + ClDWCNativeDefaultConfigValhall(GPUTarget gpu); + + // Inherited overridden method + DWCComputeKernelInfo configure(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier) override; + +private: + DWCComputeKernelInfo configure_G78_f32(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier); + DWCComputeKernelInfo configure_G78_f16(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier); + DWCComputeKernelInfo configure_G78_u8(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier); + DWCComputeKernelInfo configure_G77_f16(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier); +}; +} // namespace cl_dwc +} // namespace arm_compute +#endif /* SRC_RUNTIME_HEURISTICS_DWC_NATIVE_CLDWCNATIVEDEFAULTCONFIGVALHALL */ diff --git a/src/runtime/CL/tuners/MidgardTuner.cpp b/src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.cpp index cae3123d74..c8b006c546 100644 --- a/src/runtime/CL/tuners/MidgardTuner.cpp +++ b/src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018 ARM Limited. + * Copyright (c) 2022 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,57 +21,41 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/runtime/CL/tuners/MidgardTuner.h" - #include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernels.h" -#include "arm_compute/core/utils/misc/Cast.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/TensorShape.h" namespace arm_compute { -namespace tuners -{ -namespace +namespace cl_dwc { -void tune_gemm_kernel(CLGEMMMatrixMultiplyKernel &k) +bool use_cl_image_for_weights(const ITensorInfo *weights, unsigned int depth_multiplier) { - cl::NDRange lws_hint = k.lws_hint(); - const GPUTarget gpu_target = k.get_target(); - - switch(gpu_target) + // Check whether we can use the cl image with the weights. 
+ if (!export_to_cl_image(weights)) { - case GPUTarget::MIDGARD: - case GPUTarget::T600: - case GPUTarget::T700: - case GPUTarget::T800: - if(k._output->info()->dimension(1) == 196) - { - lws_hint = cl::NDRange(1, 7); - } - else - { - lws_hint = cl::NDRange(8, 8); - } - break; - default: - lws_hint = cl::NullRange; + return false; } - k.set_lws_hint(lws_hint); -} -} // namespace + const size_t idx_w = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::WIDTH); + const size_t idx_h = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::HEIGHT); + const size_t kernel_w = weights->tensor_shape()[idx_w]; + const size_t kernel_h = weights->tensor_shape()[idx_h]; -void MidgardTuner::tune_kernel_static(ICLKernel &kernel) -{ - if(dynamic_cast<CLGEMMMatrixMultiplyKernel *>(&kernel) != nullptr) + // If we can use the cl image storage with the weights, we prefer to use the cl buffer storage in the following cases for performance reasons: + // 1- When the kernel size is 1x1 + // 2- When the depth multiplier is greater than 1 and not multiple of 4. + if ((kernel_w == 1) && (kernel_h == 1)) { - tune_gemm_kernel(*utils::cast::polymorphic_downcast<CLGEMMMatrixMultiplyKernel *>(&kernel)); + return false; } -} -void MidgardTuner::tune_kernel_dynamic(ICLKernel &kernel) -{ - ARM_COMPUTE_UNUSED(kernel); + if ((depth_multiplier > 1) && (depth_multiplier % 4) != 0) + { + return false; + } + + return true; } -} // namespace tuners +} // namespace cl_dwc } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEYOLOLayer.cpp b/src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.h index cef6246f51..e3484c04ff 100644 --- a/src/runtime/NEON/functions/NEYOLOLayer.cpp +++ b/src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 ARM Limited. + * Copyright (c) 2022 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,22 +21,25 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
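[Editor's note] use_cl_image_for_weights() above is a pure predicate with three rejections: the device or shape cannot export to cl_image at all; the kernel is pointwise (1x1, too little data reuse to repay the image-setup cost); or the depth multiplier is greater than 1 but not a multiple of 4, which would force n0 below cl_image-friendly widths. The mirror below abstracts the export_to_cl_image() capability check into a boolean parameter purely for illustration.

#include <cstddef>

// Illustrative mirror of the decision above; device_can_export stands in for
// the export_to_cl_image(weights) capability check.
bool weights_want_cl_image(bool device_can_export,
                           std::size_t kernel_w, std::size_t kernel_h,
                           unsigned int depth_multiplier)
{
    if (!device_can_export)
    {
        return false;
    }
    if (kernel_w == 1 && kernel_h == 1) // pointwise: buffer storage wins
    {
        return false;
    }
    if (depth_multiplier > 1 && (depth_multiplier % 4) != 0)
    {
        return false; // would shrink n0 below cl_image-friendly widths
    }
    return true;
}
// e.g. weights_want_cl_image(true, 3, 3, 2) == false; (true, 3, 3, 4) == true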
*/ -#include "arm_compute/runtime/NEON/functions/NEYOLOLayer.h" - -#include "arm_compute/core/NEON/kernels/NEYOLOLayerKernel.h" -#include "support/MemorySupport.h" +#ifndef SRC_RUNTIME_HEURISTICS_DWC_NATIVE_CLDWCNATIVEHEURISTICSHELPERS +#define SRC_RUNTIME_HEURISTICS_DWC_NATIVE_CLDWCNATIVEHEURISTICSHELPERS namespace arm_compute { -void NEYOLOLayer::configure(ITensor *input, ITensor *output, const ActivationLayerInfo &act_info, int32_t num_classes) -{ - auto k = arm_compute::support::cpp14::make_unique<NEYOLOLayerKernel>(); - k->configure(input, output, act_info, num_classes); - _kernel = std::move(k); -} +// Forward declaration +class ITensorInfo; -Status NEYOLOLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info, int32_t num_classes) +namespace cl_dwc { - return NEYOLOLayerKernel::validate(input, output, act_info, num_classes); -} +/** Utility function to know whether we can use the cl image storage for the weights of depthwise convolution to get better performance + * + * @param[in] weights Weights TensorInfo of the depthwise convolution + * @param[in] depth_multiplier Depth multiplier + * + * @return true if the weights of depthwise convolution can be kept in the cl image storage to improve the performance + */ +bool use_cl_image_for_weights(const ITensorInfo *weights, unsigned int depth_multiplier); + +} // namespace cl_dwc } // namespace arm_compute +#endif /* SRC_RUNTIME_HEURISTICS_DWC_NATIVE_CLDWCNATIVEHEURISTICSHELPERS */ diff --git a/src/runtime/heuristics/dwc_native/ClDWCNativeKernelConfig.h b/src/runtime/heuristics/dwc_native/ClDWCNativeKernelConfig.h new file mode 100644 index 0000000000..031cf1859a --- /dev/null +++ b/src/runtime/heuristics/dwc_native/ClDWCNativeKernelConfig.h @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2022-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef ACL_SRC_RUNTIME_HEURISTICS_DWC_NATIVE_CLDWCNATIVEKERNELCONFIG_H +#define ACL_SRC_RUNTIME_HEURISTICS_DWC_NATIVE_CLDWCNATIVEKERNELCONFIG_H + +#include "src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.h" +#include "src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.h" +#include "src/runtime/heuristics/dwc_native/IClDWCNativeKernelConfig.h" + +#include <memory> + +namespace arm_compute +{ +namespace cl_dwc +{ +/** ClDWCNativeKernelConfigurationFactory factory class */ +class ClDWCNativeKernelConfigurationFactory final +{ +public: + /** Static method to call the ClDWCNative kernel configuration class accordingly with the GPU target + * + * @param[in] gpu GPU target + * + * @return IClDWCNativeKernelConfig + */ + static std::unique_ptr<IClDWCNativeKernelConfig> create(GPUTarget gpu) + { + switch (get_arch_from_target(gpu)) + { + case GPUTarget::MIDGARD: + // The heuristic for Midgard is the same as the one used for Arm Mali-G71 + return std::make_unique<ClDWCNativeDefaultConfigBifrost>(GPUTarget::G71); + case GPUTarget::BIFROST: + return std::make_unique<ClDWCNativeDefaultConfigBifrost>(gpu); + case GPUTarget::VALHALL: + case GPUTarget::FIFTHGEN: + return std::make_unique<ClDWCNativeDefaultConfigValhall>(gpu); + default: + ARM_COMPUTE_ERROR("Not supported GPU target"); + } + } +}; +} // namespace cl_dwc +} // namespace arm_compute +#endif // ACL_SRC_RUNTIME_HEURISTICS_DWC_NATIVE_CLDWCNATIVEKERNELCONFIG_H diff --git a/src/runtime/heuristics/dwc_native/IClDWCNativeKernelConfig.h b/src/runtime/heuristics/dwc_native/IClDWCNativeKernelConfig.h new file mode 100644 index 0000000000..614a6622df --- /dev/null +++ b/src/runtime/heuristics/dwc_native/IClDWCNativeKernelConfig.h @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
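[Editor's note] Usage mirrors the direct-conv factory, with the depthwise-specific dilation and depth_multiplier parameters added to configure(). A hedged sketch, assuming the tensor and convolution objects are prepared elsewhere; query_dwc is a hypothetical name.

#include "arm_compute/core/Types.h"
#include "src/runtime/heuristics/dwc_native/ClDWCNativeKernelConfig.h"

using namespace arm_compute;

// Hypothetical helper: query the depthwise-native descriptor for a device.
DWCComputeKernelInfo query_dwc(const ITensorInfo   *src_info,
                               const ITensorInfo   *wei_info,
                               const PadStrideInfo &conv_info,
                               const Size2D        &dilation,
                               unsigned int         depth_multiplier,
                               GPUTarget            gpu)
{
    auto cfg = cl_dwc::ClDWCNativeKernelConfigurationFactory::create(gpu);
    return cfg->configure(src_info, wei_info, conv_info, dilation, depth_multiplier);
}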
+ */ +#ifndef SRC_RUNTIME_HEURISTICS_DWC_NATIVE_ICLDWCNATIVEKERNELCONFIG +#define SRC_RUNTIME_HEURISTICS_DWC_NATIVE_ICLDWCNATIVEKERNELCONFIG + +#include "arm_compute/core/GPUTarget.h" +#include "arm_compute/core/KernelDescriptors.h" +#include "arm_compute/core/Types.h" + +#include "src/core/common/Macros.h" + +namespace arm_compute +{ +namespace cl_dwc +{ +/** Basic container for the OpenCL depthwise convolution configuration functions */ +template <class T> +class ClDWCNativeConfigArray +{ +public: + /** Alias for F32 index */ + static constexpr size_t DT_F32 = 0; + /** Alias for F16 index */ + static constexpr size_t DT_F16 = 1; + /** Alias for Int8 index */ + static constexpr size_t DT_INT8 = 2; + + /** Constructor + * + * @param[in] func_f32 Function to call for depthwise convolution F32 + * @param[in] func_f16 Function to call for depthwise convolution F16 + * @param[in] func_int8 Function to call for depthwise convolution Int8 (QASYMM8, QASYMM8_SIGNED, QSYMM8_PER_CHANNEL) + * + */ + ClDWCNativeConfigArray(T func_f32, T func_f16, T func_int8) : _configs{func_f32, func_f16, func_int8} + { + } + + /** Method to return the depthwise convolution configuration function based on data type + * + * @param[in] data_type Input data type + * + * @return the valid function otherwise it returns nullptr if the data type is not valid + */ + T get_function(DataType data_type) + { + switch (data_type) + { + case DataType::F32: + return _configs.at(DT_F32); + case DataType::F16: + return _configs.at(DT_F16); + case DataType::QASYMM8: + case DataType::QASYMM8_SIGNED: + case DataType::QSYMM8_PER_CHANNEL: + return _configs.at(DT_INT8); + default: + return nullptr; + } + } + +private: + std::array<T, 3> _configs; +}; + +/** Basic interface for the depthwise convolution kernel configuration */ +class IClDWCNativeKernelConfig +{ +public: + /** Constructor + * + * @param[in] arch GPU target + */ + IClDWCNativeKernelConfig(GPUTarget arch) : _target(arch) + { + } + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(IClDWCNativeKernelConfig); + /** Virtual destructor */ + virtual ~IClDWCNativeKernelConfig() = default; + /** This method returns the @ref DWCComputeKernelInfo for the given inputs + * + * @param[in] src Source tensor (activation tensor) + * @param[in] wei Weights tensor + * @param[in] conv_info Convolution info + * @param[in] dilation Kernel dilation + * @param[in] depth_multiplier Output feature maps multiplier + */ + virtual DWCComputeKernelInfo configure(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier) = 0; + +protected: + GPUTarget _target; +}; +} // namespace cl_dwc +} // namespace arm_compute +#endif /* SRC_RUNTIME_HEURISTICS_DWC_NATIVE_ICLDWCNATIVEKERNELCONFIG */ diff --git a/src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.cpp b/src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.cpp new file mode 100644 index 0000000000..3380d8f1b7 --- /dev/null +++ b/src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.cpp @@ -0,0 +1,167 @@ +/* + * Copyright (c) 2022 Arm Limited. 
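[Editor's note] One behaviour of ClDWCNativeConfigArray::get_function() above that is easy to miss: all three 8-bit quantized formats fold onto the single DT_INT8 slot, so one heuristic serves QASYMM8, QASYMM8_SIGNED and QSYMM8_PER_CHANNEL, while an unrecognized type yields nullptr for the caller's ARM_COMPUTE_ERROR_ON_MSG to catch. The standalone sketch below reproduces just that slotting with plain function pointers and hypothetical names.

#include <array>

// Stand-alone sketch of the 3-slot dispatch: one entry per family of types.
enum class DT { F32, F16, QASYMM8, QASYMM8_SIGNED, QSYMM8_PER_CHANNEL };

using HeuristicFn = int (*)();

struct ConfigArray3
{
    std::array<HeuristicFn, 3> slots; // [0]=F32, [1]=F16, [2]=all Int8 variants

    HeuristicFn get(DT dt) const
    {
        switch (dt)
        {
            case DT::F32:
                return slots[0];
            case DT::F16:
                return slots[1];
            case DT::QASYMM8:
            case DT::QASYMM8_SIGNED:
            case DT::QSYMM8_PER_CHANNEL:
                return slots[2]; // quantized types share one heuristic
            default:
                return nullptr;  // caller raises the "not supported" error
        }
    }
};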
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/GPUTarget.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/TensorShape.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" + +namespace arm_compute +{ +namespace cl_indirect_conv +{ +using namespace arm_compute::misc::shape_calculator; + +ClIndirectConvDefaultConfigValhall::ClIndirectConvDefaultConfigValhall(GPUTarget gpu) : IClIndirectConvKernelConfig(gpu) +{ +} + +DirectConvComputeKernelInfo ClIndirectConvDefaultConfigValhall::configure(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info) +{ + using ConfigurationFunctionExecutorPtr = DirectConvComputeKernelInfo (ClIndirectConvDefaultConfigValhall::*)( + const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); + + ClIndirectConvConfigArray<ConfigurationFunctionExecutorPtr> configs_G77( + &ClIndirectConvDefaultConfigValhall::configure_G77_f32, &ClIndirectConvDefaultConfigValhall::configure_G77_f16); + + // Important note: Indirect convolution should not be used when the kernel size is 1x1 (pointwise). The reason is because the indirect buffer makes + // indirect convolution less efficient than direct convolution or gemm. For this reason, the heuristic of indirect convolution has not been tuned + // for the pointwise convolution cases. 
+ + ConfigurationFunctionExecutorPtr func = configs_G77.get_function(src->data_type()); + + ARM_COMPUTE_ERROR_ON_MSG(func == nullptr, "Data type not supported for indirect convolution"); + return (this->*func)(src, wei, conv_info); +} + +DirectConvComputeKernelInfo ClIndirectConvDefaultConfigValhall::configure_G77_f32(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info) +{ + DirectConvComputeKernelInfo desc; + + if (src->data_layout() == DataLayout::NHWC) + { + const TensorShape dst_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info); + const bool export_weights_to_cl_image = export_to_cl_image(wei); + const int32_t stride_x = conv_info.stride().first; + const int32_t stride_y = conv_info.stride().second; + const int32_t ofm = dst_shape[0]; + const int32_t m = (dst_shape[1] / stride_x) * (dst_shape[2] / stride_y); + + desc.export_weights_to_cl_image = export_weights_to_cl_image; + + if (ofm <= 4) + { + desc.m0 = 1; + desc.n0 = 2; + desc.k0 = 16; + } + else + { + // The 16000 threshold value has been identified as the right + // one for using the biggest block size allowed on F32: 5x4x4 + if (m < 16000) + { + desc.m0 = 4; + desc.n0 = 4; + desc.k0 = 4; + } + else + { + desc.m0 = 5; + desc.n0 = 4; + desc.k0 = 4; + } + } + } + + return desc; +} + +DirectConvComputeKernelInfo ClIndirectConvDefaultConfigValhall::configure_G77_f16(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info) +{ + DirectConvComputeKernelInfo desc; + + if (src->data_layout() == DataLayout::NHWC) + { + const TensorShape wei_shape = wei->tensor_shape(); + const TensorShape dst_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info); + const bool export_weights_to_cl_image = export_to_cl_image(wei); + + const int32_t ofm = dst_shape[0]; + const int32_t m = dst_shape[1] * dst_shape[2]; + const int32_t k = wei_shape[0]; + + desc.export_weights_to_cl_image = export_weights_to_cl_image; + + if (ofm <= 4) + { + // k0 should be as larger as possible. However, we should avoid + // having left-over for loops that make the implementation slower. + if ((k % 16) == 0) + { + desc.k0 = 16; + } + else if ((k % 8) == 0) + { + desc.k0 = 8; + } + else + { + desc.k0 = 4; + } + + desc.m0 = 1; + desc.n0 = ofm; + } + else + { + // The 16000 threshold value has been identified as the right + // one for using the biggest block size allowed on F16: 8x4 + if (m >= 16000 && k < 4) + { + desc.m0 = 8; + desc.n0 = 4; + desc.k0 = 4; // k0 is clamped to k inside the kernel when k is less than 4 + } + else + { + desc.m0 = 5; + desc.n0 = 4; + desc.k0 = 8; + } + } + } + + return desc; +} +} // namespace cl_indirect_conv +} // namespace arm_compute diff --git a/src/runtime/GLES_COMPUTE/GCRuntimeContext.cpp b/src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.h index 1c30af1b71..bab808c66c 100644 --- a/src/runtime/GLES_COMPUTE/GCRuntimeContext.cpp +++ b/src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 ARM Limited. + * Copyright (c) 2022 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,47 +21,35 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
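[Editor's note] Two magic numbers in the G77 heuristics above merit a gloss: k0 is the largest of {16, 8, 4} that divides k, so the accumulation loop needs no remainder pass, and m >= 16000 marks the empirically determined point (per the in-code comments) where the largest block sizes pay off. The k0 ladder in isolation, as a hypothetical helper:

// Hypothetical restatement of the k0 choice in configure_G77_f16 above.
int pick_k0(int k)
{
    if ((k % 16) == 0)
    {
        return 16;
    }
    if ((k % 8) == 0)
    {
        return 8;
    }
    return 4; // per the comment above, the kernel clamps k0 to k when k < 4
}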
*/ -#include "arm_compute/runtime/GLES_COMPUTE/GCRuntimeContext.h" +#ifndef SRC_RUNTIME_HEURISTICS_INDIRECT_CONV_CLINDIRECTCONVDEFAULTCONFIGVALHALL +#define SRC_RUNTIME_HEURISTICS_INDIRECT_CONV_CLINDIRECTCONVDEFAULTCONFIGVALHALL -#include "arm_compute/core/Validate.h" -#include "arm_compute/runtime/GLES_COMPUTE/GCHelpers.h" -#include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h" +#include "src/runtime/heuristics/indirect_conv/IClIndirectConvKernelConfig.h" namespace arm_compute { -GCRuntimeContext::GCRuntimeContext() - : _gpu_owned_scheduler(support::cpp14::make_unique<GCScheduler>()), - _gpu_scheduler(_gpu_owned_scheduler.get()), - _core_context() +namespace cl_indirect_conv { - auto attrs = create_opengl_display_and_context(); - auto display = std::get<0>(attrs); - auto ctx = std::get<1>(attrs); - - _gpu_owned_scheduler->default_init_with_context(display, ctx); - _kernel_lib.init("./cs_shaders/", display, ctx); - - _core_context = GCCoreRuntimeContext(&_kernel_lib); -} - -GCKernelLibrary &GCRuntimeContext::kernel_library() -{ - return _kernel_lib; -} - -GCCoreRuntimeContext *GCRuntimeContext::core_runtime_context() -{ - return &_core_context; -} - -void GCRuntimeContext::set_gpu_scheduler(GCScheduler *scheduler) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(scheduler); - _gpu_scheduler = scheduler; -} - -GCScheduler *GCRuntimeContext::gpu_scheduler() +/** Valhall based OpenCL indirect convolution configuration */ +class ClIndirectConvDefaultConfigValhall final : public IClIndirectConvKernelConfig { - return _gpu_scheduler; -} +public: + /** Constructor + * + * @param[in] gpu GPU target + */ + ClIndirectConvDefaultConfigValhall(GPUTarget gpu); + + // Inherited overridden method + DirectConvComputeKernelInfo + configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) override; + +private: + DirectConvComputeKernelInfo + configure_G77_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); + DirectConvComputeKernelInfo + configure_G77_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); +}; +} // namespace cl_indirect_conv } // namespace arm_compute +#endif /* SRC_RUNTIME_HEURISTICS_INDIRECT_CONV_CLINDIRECTCONVDEFAULTCONFIGVALHALL */ diff --git a/src/runtime/heuristics/indirect_conv/ClIndirectConvKernelConfig.h b/src/runtime/heuristics/indirect_conv/ClIndirectConvKernelConfig.h new file mode 100644 index 0000000000..5e7ba6f8e9 --- /dev/null +++ b/src/runtime/heuristics/indirect_conv/ClIndirectConvKernelConfig.h @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2022-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ACL_SRC_RUNTIME_HEURISTICS_INDIRECT_CONV_CLINDIRECTCONVKERNELCONFIG_H +#define ACL_SRC_RUNTIME_HEURISTICS_INDIRECT_CONV_CLINDIRECTCONVKERNELCONFIG_H + +#include "src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.h" +#include "src/runtime/heuristics/indirect_conv/IClIndirectConvKernelConfig.h" + +#include <memory> + +namespace arm_compute +{ +namespace cl_indirect_conv +{ +/** ClIndirectConvolution factory class */ +class ClIndirectConvKernelConfigurationFactory final +{ +public: + /** Static method to call the ClIndirectConvolution kernel configuration class accordingly with the GPU target + * + * @param[in] gpu GPU target + * + * @return IClIndirectConvKernelConfig + */ + static std::unique_ptr<IClIndirectConvKernelConfig> create(GPUTarget gpu) + { + switch (get_arch_from_target(gpu)) + { + case GPUTarget::MIDGARD: + case GPUTarget::BIFROST: + case GPUTarget::VALHALL: + case GPUTarget::FIFTHGEN: + return std::make_unique<ClIndirectConvDefaultConfigValhall>(gpu); + default: + ARM_COMPUTE_ERROR("Not supported GPU target"); + } + } +}; +} // namespace cl_indirect_conv +} // namespace arm_compute +#endif // ACL_SRC_RUNTIME_HEURISTICS_INDIRECT_CONV_CLINDIRECTCONVKERNELCONFIG_H diff --git a/src/runtime/heuristics/indirect_conv/IClIndirectConvKernelConfig.h b/src/runtime/heuristics/indirect_conv/IClIndirectConvKernelConfig.h new file mode 100644 index 0000000000..d05da18b58 --- /dev/null +++ b/src/runtime/heuristics/indirect_conv/IClIndirectConvKernelConfig.h @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
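[Editor's note] Unlike the direct-conv and depthwise factories, this factory has no per-architecture branches: every supported target receives ClIndirectConvDefaultConfigValhall. Because the direct and indirect heuristics both return a DirectConvComputeKernelInfo, a caller can in principle query them side by side; the sketch below assumes such wiring, uses a hypothetical name (compare_descriptors), and is not code from this patch.

#include "arm_compute/core/Types.h"
#include "src/runtime/heuristics/direct_conv/ClDirectConvKernelConfig.h"
#include "src/runtime/heuristics/indirect_conv/ClIndirectConvKernelConfig.h"

using namespace arm_compute;

// Hypothetical side-by-side query of the two heuristics for one convolution.
void compare_descriptors(const ITensorInfo *src, const ITensorInfo *wei,
                         const PadStrideInfo &conv, GPUTarget gpu)
{
    auto direct_cfg   = cl_direct_conv::ClDirectConvKernelConfigurationFactory::create(gpu);
    auto indirect_cfg = cl_indirect_conv::ClIndirectConvKernelConfigurationFactory::create(gpu);

    const DirectConvComputeKernelInfo d = direct_cfg->configure(src, wei, conv);
    const DirectConvComputeKernelInfo i = indirect_cfg->configure(src, wei, conv);
    (void)d; // a scheduler could choose between the two kernels here, e.g.
    (void)i; // preferring direct convolution for 1x1 (pointwise) kernels
}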
+ */ +#ifndef SRC_RUNTIME_HEURISTICS_INDIRECT_CONV_ICLINDIRECTCONVKERNELCONFIG +#define SRC_RUNTIME_HEURISTICS_INDIRECT_CONV_ICLINDIRECTCONVKERNELCONFIG + +#include "arm_compute/core/GPUTarget.h" +#include "arm_compute/core/KernelDescriptors.h" +#include "arm_compute/core/Types.h" + +#include "src/core/common/Macros.h" + +namespace arm_compute +{ +namespace cl_indirect_conv +{ +/** Basic container for the OpenCL indirect convolution configuration functions */ +template <class T> +class ClIndirectConvConfigArray +{ +public: + /** Alias for F32 index */ + static constexpr size_t DT_F32 = 0; + /** Alias for F16 index */ + static constexpr size_t DT_F16 = 1; + + /** Constructor + * + * @param[in] func_f32 Function to call for indirect convolution F32 + * @param[in] func_f16 Function to call for indirect convolution F16 + * + */ + ClIndirectConvConfigArray(T func_f32, T func_f16) : _configs{func_f32, func_f16} + { + } + + /** Method to return the indirect convolution configuration function based on data type + * + * @param[in] data_type Input data type + * + * @return the valid function otherwise it returns nullptr if the data type is not valid + */ + T get_function(DataType data_type) + { + switch (data_type) + { + case DataType::F32: + return _configs.at(DT_F32); + case DataType::F16: + return _configs.at(DT_F16); + default: + return nullptr; + } + } + +private: + std::array<T, 2> _configs; +}; + +/** Basic interface for the indirect convolution kernel configuration */ +class IClIndirectConvKernelConfig +{ +public: + /** Constructor + * + * @param[in] arch GPU target + */ + IClIndirectConvKernelConfig(GPUTarget arch) : _target(arch) + { + } + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(IClIndirectConvKernelConfig); + /** Virtual destructor */ + virtual ~IClIndirectConvKernelConfig() = default; + /** This method returns the @ref DirectConvComputeKernelInfo for the given inputs + * + * @param[in] src Source tensor (activation tensor) + * @param[in] wei Weights tensor + * @param[in] conv_info Convolution info + */ + virtual DirectConvComputeKernelInfo + configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) = 0; + +protected: + GPUTarget _target; +}; +} // namespace cl_indirect_conv +} // namespace arm_compute +#endif /* SRC_RUNTIME_HEURISTICS_INDIRECT_CONV_ICLINDIRECTCONVKERNELCONFIG */ diff --git a/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.cpp b/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.cpp new file mode 100644 index 0000000000..3a02a60650 --- /dev/null +++ b/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.cpp @@ -0,0 +1,314 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
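[Editor's note] The configure() method that opens the matmul implementation below derives the GEMM dimensions from the tensor shapes and the adjoint flags: for a non-transposed LHS, m comes from the y() axis and k from the fastest x() axis, adj_lhs swaps them, n comes from the RHS analogously, and batch dimensions beyond the second are first collapsed into z. A sketch of just that derivation, with the shape type simplified to a hypothetical struct in place of arm_compute::TensorShape:

// Hypothetical simplification of the m/n/k/b extraction in configure() below.
struct Shape
{
    unsigned int x, y, z; // x is the fastest-moving axis; z holds collapsed batches
};

struct Dims
{
    unsigned int m, n, k, b;
};

Dims matmul_dims(Shape lhs, Shape rhs, bool adj_lhs, bool adj_rhs)
{
    Dims d{};
    d.m = adj_lhs ? lhs.x : lhs.y; // transposing the LHS swaps the roles of x and y
    d.n = adj_rhs ? rhs.y : rhs.x;
    d.k = adj_lhs ? lhs.y : lhs.x;
    d.b = lhs.z;                   // batches already collapsed into z
    return d;
}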
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/GPUTarget.h" +#include "arm_compute/core/KernelDescriptors.h" +#include "arm_compute/core/TensorInfo.h" + +#include "src/gpu/cl/kernels/ClMatMulNativeKernel.h" +#include "src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.h" + +#include <utility> + +namespace arm_compute +{ +namespace cl_matmul +{ +ClMatMulNativeDefaultConfigValhall::ClMatMulNativeDefaultConfigValhall(GPUTarget gpu) : IClMatMulNativeKernelConfig(gpu) +{ +} + +MatMulKernelInfo +ClMatMulNativeDefaultConfigValhall::configure(const ITensorInfo *lhs, const ITensorInfo *rhs, const MatMulInfo &info) +{ + using ConfigurationFunctionExecutorPtr = MatMulKernelInfo (ClMatMulNativeDefaultConfigValhall::*)( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info); + + ClMatMulNativeConfigArray<ConfigurationFunctionExecutorPtr> configs_G710( + &ClMatMulNativeDefaultConfigValhall::configure_G710_f32, + &ClMatMulNativeDefaultConfigValhall::configure_G710_f16, + &ClMatMulNativeDefaultConfigValhall::configure_G710_u8); + + ClMatMulNativeConfigArray<ConfigurationFunctionExecutorPtr> configs_G715( + &ClMatMulNativeDefaultConfigValhall::configure_G715_f32, + &ClMatMulNativeDefaultConfigValhall::configure_G715_f16, + &ClMatMulNativeDefaultConfigValhall::configure_G715_u8); + + ConfigurationFunctionExecutorPtr func = nullptr; + switch (_target) + { + case GPUTarget::G715: + case GPUTarget::G615: + func = configs_G715.get_function(lhs->data_type()); + break; + case GPUTarget::G710: + default: + func = configs_G710.get_function(lhs->data_type()); + break; + } + + const bool adj_lhs = info.adj_lhs(); + const bool adj_rhs = info.adj_rhs(); + + TensorShape lhs_shape = lhs->tensor_shape(); + TensorShape rhs_shape = rhs->tensor_shape(); + + const bool is_batched = lhs_shape.num_dimensions() > 2; + + if (is_batched == true) + { + lhs_shape.collapse_from(2); + } + + const unsigned int m = adj_lhs ? lhs_shape.x() : lhs_shape.y(); + const unsigned int n = adj_rhs ? rhs_shape.y() : rhs_shape.x(); + const unsigned int k = adj_lhs ? 
lhs_shape.y() : lhs_shape.x(); + const unsigned int b = lhs_shape.z(); + + ARM_COMPUTE_ERROR_ON_MSG(func == nullptr, "Data type not supported for matmul native"); + return (this->*func)(m, n, k, b, rhs->lock_paddings(), info); +} + +MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure_G715_f32( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info) +{ + ARM_COMPUTE_UNUSED(m, n, k, b, rhs_lock_padding); + return {info.adj_lhs(), info.adj_rhs(), /* m0 */ 1, /* n0 */ 4, /* k0 */ 1, /* export_to_cl_image */ false}; +} + +MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure_G715_f16( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info) +{ + return configure_G715_f32(m, n, k, b, rhs_lock_padding, info); +} + +MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure_G715_u8( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info) +{ + ARM_COMPUTE_UNUSED(m, n, k, b, rhs_lock_padding); + return {info.adj_lhs(), info.adj_rhs(), /* m0 */ 4, /* n0 */ 16, /* k0 */ 4, /* export_to_cl_image */ false}; +} + +MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure_G710_f32( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info) +{ + const MatMulNativeConfigsMatrix configs_mnkb_best_nt_nt = { + {3136, 64, 64, 36, 4, 4, 16, 1}, {4096, 48, 32, 36, 4, 4, 4, 1}, {688, 92, 68, 32, 2, 8, 4, 1}, + {24, 464, 412, 24, 2, 8, 4, 1}, {112, 184, 144, 28, 4, 4, 16, 1}, {5776, 64, 32, 36, 2, 4, 16, 1}, + {1568, 64, 40, 36, 2, 8, 8, 1}, {2920, 64, 64, 24, 4, 4, 16, 1}}; + + const MatMulNativeConfigsMatrix configs_mnkb_fallback_nt_nt = { + {3136, 64, 64, 36, 4, 4, 8, 0}, {4096, 48, 32, 36, 4, 4, 8, 0}, {688, 92, 68, 32, 5, 4, 4, 0}, + {24, 464, 412, 24, 6, 2, 8, 0}, {112, 184, 144, 28, 6, 4, 4, 0}, {5776, 64, 32, 36, 5, 4, 4, 0}, + {1568, 64, 40, 36, 4, 4, 8, 0}, {2920, 64, 64, 24, 4, 4, 8, 0}}; + + const MatMulNativeConfigsMatrix configs_mnkb_best_nt_t = { + {3136, 64, 64, 36, 4, 4, 4, 1}, {4096, 48, 32, 36, 2, 2, 16, 1}, {688, 92, 68, 32, 4, 4, 4, 1}, + {24, 464, 412, 24, 6, 2, 8, 1}, {112, 184, 144, 28, 4, 2, 16, 1}, {5776, 64, 32, 36, 4, 4, 4, 1}, + {1568, 64, 40, 36, 4, 4, 8, 1}, {2920, 64, 64, 24, 4, 4, 4, 1}}; + + const MatMulNativeConfigsMatrix configs_mnkb_fallback_nt_t = { + {3136, 64, 64, 36, 5, 4, 4, 0}, {4096, 48, 32, 36, 5, 4, 4, 0}, {688, 92, 68, 32, 5, 4, 4, 0}, + {24, 464, 412, 24, 6, 2, 4, 0}, {112, 184, 144, 28, 5, 4, 4, 0}, {5776, 64, 32, 36, 5, 4, 4, 0}, + {1568, 64, 40, 36, 5, 4, 4, 0}, {2920, 64, 64, 24, 6, 2, 4, 0}}; + + const MatMulNativeConfigsMatrix configs_mnkb_best_t_nt = { + {3136, 64, 64, 36, 4, 4, 16, 1}, {4096, 48, 32, 36, 4, 4, 4, 1}, {688, 92, 68, 32, 2, 8, 4, 1}, + {24, 464, 412, 24, 2, 8, 4, 1}, {112, 184, 144, 28, 4, 4, 16, 1}, {5776, 64, 32, 36, 2, 8, 8, 1}, + {1568, 64, 40, 36, 4, 4, 8, 1}, {2920, 64, 64, 24, 4, 4, 16, 1}}; + + const MatMulNativeConfigsMatrix configs_mnkb_fallback_t_nt = { + {3136, 64, 64, 36, 4, 4, 4, 0}, {4096, 48, 32, 36, 4, 4, 4, 0}, {688, 92, 68, 32, 4, 4, 4, 0}, + {24, 464, 412, 24, 4, 4, 4, 0}, {112, 184, 144, 28, 4, 4, 4, 0}, {5776, 64, 32, 36, 4, 4, 8, 0}, + {1568, 64, 40, 36, 4, 4, 4, 0}, {2920, 64, 64, 24, 4, 4, 4, 0}}; + + const MatMulNativeConfigsMatrix configs_mnkb_best_t_t = { + {3136, 64, 64, 36, 4, 4, 4, 1}, {4096, 48, 32, 36, 4, 4, 4, 1}, {688, 92, 68, 32, 4, 4, 4, 1}, + {24, 464, 412, 24, 
2, 2, 16, 1}, {112, 184, 144, 28, 4, 4, 4, 1}, {5776, 64, 32, 36, 4, 4, 4, 1}, + {1568, 64, 40, 36, 4, 4, 4, 1}, {2920, 64, 64, 24, 4, 4, 4, 1}}; + + const MatMulNativeConfigsMatrix configs_mnkb_fallback_t_t = { + {3136, 64, 64, 36, 4, 4, 4, 0}, {4096, 48, 32, 36, 4, 4, 4, 0}, {688, 92, 68, 32, 4, 4, 4, 0}, + {24, 464, 412, 24, 4, 2, 8, 0}, {112, 184, 144, 28, 4, 4, 4, 0}, {5776, 64, 32, 36, 4, 4, 4, 0}, + {1568, 64, 40, 36, 4, 4, 4, 0}, {2920, 64, 64, 24, 4, 4, 4, 0}}; + + const bool adj_lhs = info.adj_lhs(); + const bool adj_rhs = info.adj_rhs(); + + const MatMulNativeConfigsMatrix *configs_best_to_use = nullptr; + const MatMulNativeConfigsMatrix *configs_fallback_to_use = nullptr; + + if ((adj_lhs == false) && (adj_rhs == false)) + { + configs_best_to_use = &configs_mnkb_best_nt_nt; + configs_fallback_to_use = &configs_mnkb_fallback_nt_nt; + } + else if ((adj_lhs == false) && (adj_rhs == true)) + { + configs_best_to_use = &configs_mnkb_best_nt_t; + configs_fallback_to_use = &configs_mnkb_fallback_nt_t; + } + else if ((adj_lhs == true) && (adj_rhs == false)) + { + configs_best_to_use = &configs_mnkb_best_t_nt; + configs_fallback_to_use = &configs_mnkb_fallback_t_nt; + } + else + { + configs_best_to_use = &configs_mnkb_best_t_t; + configs_fallback_to_use = &configs_mnkb_fallback_t_t; + } + + MatMulKernelInfo desc0 = find_info(*configs_best_to_use, adj_lhs, adj_rhs, m, n, k, b); + MatMulKernelInfo desc1 = find_info(*configs_fallback_to_use, adj_lhs, adj_rhs, m, n, k, b); + + return select_info(desc0, desc1, m, n, k, b, DataType::F32, rhs_lock_padding); +} + +MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure_G710_f16( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info) +{ + const MatMulNativeConfigsMatrix configs_mnkb_best_nt_nt = { + {3136, 64, 64, 36, 4, 4, 16, 1}, {4096, 48, 32, 36, 4, 4, 8, 1}, {688, 92, 68, 32, 4, 4, 16, 1}, + {24, 464, 412, 24, 4, 4, 4, 1}, {112, 184, 144, 28, 4, 4, 16, 1}, {5776, 64, 32, 36, 4, 4, 8, 1}, + {1568, 64, 40, 36, 4, 4, 8, 1}, {2920, 64, 64, 24, 4, 4, 16, 1}}; + + const MatMulNativeConfigsMatrix configs_mnkb_fallback_nt_nt = { + {3136, 64, 64, 36, 6, 4, 8, 0}, {4096, 48, 32, 36, 6, 4, 8, 0}, {688, 92, 68, 32, 6, 4, 8, 0}, + {24, 464, 412, 24, 4, 4, 8, 0}, {112, 184, 144, 28, 6, 4, 8, 0}, {5776, 64, 32, 36, 6, 4, 8, 0}, + {1568, 64, 40, 36, 6, 4, 8, 0}, {2920, 64, 64, 24, 6, 4, 8, 0}}; + + const MatMulNativeConfigsMatrix configs_mnkb_best_nt_t = { + {3136, 64, 64, 36, 6, 4, 8, 1}, {4096, 48, 32, 36, 6, 4, 8, 1}, {688, 92, 68, 32, 4, 4, 4, 1}, + {24, 464, 412, 24, 6, 2, 4, 1}, {112, 184, 144, 28, 4, 2, 16, 1}, {5776, 64, 32, 36, 6, 4, 8, 1}, + {1568, 64, 40, 36, 6, 4, 8, 1}, {2920, 64, 64, 24, 6, 4, 8, 1}}; + + const MatMulNativeConfigsMatrix configs_mnkb_fallback_nt_t = { + {3136, 64, 64, 36, 6, 2, 16, 0}, {4096, 48, 32, 36, 5, 4, 8, 0}, {688, 92, 68, 32, 6, 2, 16, 0}, + {24, 464, 412, 24, 6, 2, 16, 0}, {112, 184, 144, 28, 6, 2, 16, 0}, {5776, 64, 32, 36, 5, 4, 8, 0}, + {1568, 64, 40, 36, 5, 4, 8, 0}, {2920, 64, 64, 24, 6, 2, 16, 0}}; + + const MatMulNativeConfigsMatrix configs_mnkb_best_t_nt = { + {3136, 64, 64, 36, 4, 4, 16, 1}, {4096, 48, 32, 36, 4, 4, 4, 1}, {688, 92, 68, 32, 4, 4, 4, 1}, + {24, 464, 412, 24, 4, 4, 4, 1}, {112, 184, 144, 28, 4, 4, 4, 1}, {5776, 64, 32, 36, 4, 4, 4, 1}, + {1568, 64, 40, 36, 4, 4, 4, 1}, {2920, 64, 64, 24, 4, 4, 4, 1}}; + + const MatMulNativeConfigsMatrix configs_mnkb_fallback_t_nt = { + {3136, 64, 64, 36, 4, 4, 4, 0}, {4096, 48, 32, 36, 4, 4, 4, 
0}, {688, 92, 68, 32, 4, 4, 4, 0}, + {24, 464, 412, 24, 4, 4, 4, 0}, {112, 184, 144, 28, 4, 4, 4, 0}, {5776, 64, 32, 36, 4, 4, 4, 0}, + {1568, 64, 40, 36, 4, 4, 4, 0}, {2920, 64, 64, 24, 4, 4, 4, 0}}; + + const MatMulNativeConfigsMatrix configs_mnkb_best_t_t = { + {3136, 64, 64, 36, 4, 4, 16, 1}, {4096, 48, 32, 36, 4, 4, 8, 1}, {688, 92, 68, 32, 4, 4, 4, 1}, + {24, 464, 412, 24, 4, 2, 8, 1}, {112, 184, 144, 28, 4, 2, 16, 1}, {5776, 64, 32, 36, 4, 4, 16, 1}, + {1568, 64, 40, 36, 4, 4, 8, 1}, {2920, 64, 64, 24, 4, 4, 16, 1}}; + + const MatMulNativeConfigsMatrix configs_mnkb_fallback_t_t = { + {3136, 64, 64, 36, 4, 4, 8, 0}, {4096, 48, 32, 36, 4, 4, 8, 0}, {688, 92, 68, 32, 4, 4, 8, 0}, + {24, 464, 412, 24, 4, 4, 8, 0}, {112, 184, 144, 28, 4, 4, 8, 0}, {5776, 64, 32, 36, 4, 4, 8, 0}, + {1568, 64, 40, 36, 4, 4, 8, 0}, {2920, 64, 64, 24, 4, 4, 8, 0}}; + + const bool adj_lhs = info.adj_lhs(); + const bool adj_rhs = info.adj_rhs(); + + const MatMulNativeConfigsMatrix *configs_best_to_use = nullptr; + const MatMulNativeConfigsMatrix *configs_fallback_to_use = nullptr; + + if ((adj_lhs == false) && (adj_rhs == false)) + { + configs_best_to_use = &configs_mnkb_best_nt_nt; + configs_fallback_to_use = &configs_mnkb_fallback_nt_nt; + } + else if ((adj_lhs == false) && (adj_rhs == true)) + { + configs_best_to_use = &configs_mnkb_best_nt_t; + configs_fallback_to_use = &configs_mnkb_fallback_nt_t; + } + else if ((adj_lhs == true) && (adj_rhs == false)) + { + configs_best_to_use = &configs_mnkb_best_t_nt; + configs_fallback_to_use = &configs_mnkb_fallback_t_nt; + } + else + { + configs_best_to_use = &configs_mnkb_best_t_t; + configs_fallback_to_use = &configs_mnkb_fallback_t_t; + } + + MatMulKernelInfo desc0 = find_info(*configs_best_to_use, adj_lhs, adj_rhs, m, n, k, b); + MatMulKernelInfo desc1 = find_info(*configs_fallback_to_use, adj_lhs, adj_rhs, m, n, k, b); + + return select_info(desc0, desc1, m, n, k, b, DataType::F16, rhs_lock_padding); +} + +MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure_G710_u8( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info) +{ + ARM_COMPUTE_UNUSED(rhs_lock_padding); + + const MatMulNativeConfigsMatrix configs_mnkb_best_nt_nt = { + {3136, 64, 64, 36, 6, 4, 4, 0}, {4096, 48, 32, 36, 6, 4, 4, 0}, {688, 92, 68, 32, 2, 8, 4, 0}, + {24, 464, 412, 24, 4, 4, 4, 0}, {112, 184, 144, 28, 6, 4, 4, 0}, {5776, 64, 32, 36, 6, 4, 4, 0}, + {1568, 64, 40, 36, 6, 4, 4, 0}, {2920, 64, 64, 24, 5, 4, 4, 0}}; + + const MatMulNativeConfigsMatrix configs_mnkb_best_nt_t = { + {3136, 64, 64, 36, 4, 4, 16, 0}, {4096, 48, 32, 36, 4, 4, 16, 0}, {688, 92, 68, 32, 4, 4, 16, 0}, + {24, 464, 412, 24, 6, 2, 16, 0}, {112, 184, 144, 28, 4, 4, 16, 0}, {5776, 64, 32, 36, 4, 4, 16, 0}, + {1568, 64, 40, 36, 6, 4, 4, 0}, {2920, 64, 64, 24, 4, 4, 16, 0}}; + + const MatMulNativeConfigsMatrix configs_mnkb_best_t_nt = { + {3136, 64, 64, 36, 4, 4, 8, 0}, {4096, 48, 32, 36, 4, 4, 8, 0}, {688, 92, 68, 32, 4, 4, 4, 0}, + {24, 464, 412, 24, 4, 4, 4, 0}, {112, 184, 144, 28, 4, 4, 8, 0}, {5776, 64, 32, 36, 4, 4, 8, 0}, + {1568, 64, 40, 36, 4, 4, 8, 0}, {2920, 64, 64, 24, 4, 4, 8, 0}}; + + const MatMulNativeConfigsMatrix configs_mnkb_best_t_t = { + {3136, 64, 64, 36, 4, 2, 16, 0}, {4096, 48, 32, 36, 4, 4, 4, 0}, {688, 92, 68, 32, 4, 4, 8, 0}, + {24, 464, 412, 24, 4, 2, 16, 0}, {112, 184, 144, 28, 4, 2, 16, 0}, {5776, 64, 32, 36, 4, 4, 4, 0}, + {1568, 64, 40, 36, 4, 4, 8, 0}, {2920, 64, 64, 24, 4, 2, 16, 0}}; + + const bool adj_lhs = info.adj_lhs(); + 
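+    // Each row of the tables above encodes a reference workload and its tuned
+    // configuration as {M, N, K, B, M0, N0, K0, IMG_RHS} (the layout checked in
+    // find_info()). In this u8 path every IMG_RHS entry is 0, so no cl_image2d
+    // fallback table is needed and find_info() is called directly below.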
const bool adj_rhs = info.adj_rhs(); + + if ((adj_lhs == false) && (adj_rhs == false)) + { + return find_info(configs_mnkb_best_nt_nt, adj_lhs, adj_rhs, m, n, k, b); + } + else if ((adj_lhs == false) && (adj_rhs == true)) + { + return find_info(configs_mnkb_best_nt_t, adj_lhs, adj_rhs, m, n, k, b); + } + else if ((adj_lhs == true) && (adj_rhs == false)) + { + return find_info(configs_mnkb_best_t_nt, adj_lhs, adj_rhs, m, n, k, b); + } + else + { + return find_info(configs_mnkb_best_t_t, adj_lhs, adj_rhs, m, n, k, b); + } +} +} // namespace cl_matmul +} // namespace arm_compute diff --git a/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.h b/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.h new file mode 100644 index 0000000000..5279871057 --- /dev/null +++ b/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.h @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEDEFAULTCONFIGVALHALL_H +#define ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEDEFAULTCONFIGVALHALL_H + +#include "src/runtime/heuristics/matmul_native/IClMatMulNativeKernelConfig.h" + +namespace arm_compute +{ +namespace cl_matmul +{ +/** Valhall based OpenCL matmul configuration */ +class ClMatMulNativeDefaultConfigValhall final : public IClMatMulNativeKernelConfig +{ +public: + /** Constructor + * + * @param[in] gpu GPU target + */ + ClMatMulNativeDefaultConfigValhall(GPUTarget gpu); + + // Inherited overridden method + MatMulKernelInfo configure(const ITensorInfo *lhs, const ITensorInfo *rhs, const MatMulInfo &info) override; + +private: + MatMulKernelInfo configure_G710_f32( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info); + MatMulKernelInfo configure_G710_f16( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info); + MatMulKernelInfo configure_G710_u8( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info); + MatMulKernelInfo configure_G715_f32( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info); + MatMulKernelInfo configure_G715_f16( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info); + MatMulKernelInfo configure_G715_u8( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info); +}; +} // namespace cl_matmul +} // namespace arm_compute +#endif // ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEDEFAULTCONFIGVALHALL_H diff --git a/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultVariantValhall.cpp b/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultVariantValhall.cpp new file mode 100644 index 0000000000..3878f698fd --- /dev/null +++ b/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultVariantValhall.cpp @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultVariantValhall.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/GPUTarget.h" +#include "arm_compute/core/TensorInfo.h" + +namespace arm_compute +{ +namespace cl_matmul +{ +ClMatMulNativeDefaultVariantValhall::ClMatMulNativeDefaultVariantValhall(GPUTarget gpu) + : IClMatMulNativeKernelVariant(gpu) +{ +} + +MatMulKernelType ClMatMulNativeDefaultVariantValhall::select_kernel(const ITensorInfo *lhs, + const ITensorInfo *rhs, + const MatMulInfo &info, + const ActivationLayerInfo &act_info) +{ + ARM_COMPUTE_UNUSED(rhs); + + using VariantFunctionExecutorPtr = + MatMulKernelType (ClMatMulNativeDefaultVariantValhall::*)(int k, bool act_enabled); + + ClMatMulNativeVariantArray<VariantFunctionExecutorPtr> configs_G715( + &ClMatMulNativeDefaultVariantValhall::configure_G715_float, + &ClMatMulNativeDefaultVariantValhall::configure_G715_quantized); + + ClMatMulNativeVariantArray<VariantFunctionExecutorPtr> configs_default( + &ClMatMulNativeDefaultVariantValhall::configure_default_float, + &ClMatMulNativeDefaultVariantValhall::configure_default_quantized); + + VariantFunctionExecutorPtr func = nullptr; + switch (_target) + { + case GPUTarget::G715: + case GPUTarget::G615: + func = configs_G715.get_function(lhs->data_type()); + break; + default: + func = configs_default.get_function(lhs->data_type()); + break; + } + + const int k = info.adj_lhs() ? lhs->tensor_shape().y() : lhs->tensor_shape().x(); + const bool act_enabled = act_info.enabled(); + + ARM_COMPUTE_ERROR_ON_MSG(func == nullptr, "Data type not supported for matmul native"); + return (this->*func)(k, act_enabled); +} + +MatMulKernelType ClMatMulNativeDefaultVariantValhall::configure_G715_float(int k, bool act_enabled) +{ + // MMUL kernel works only when K is a multiple of 4 + if (!act_enabled && k % 4 == 0) + { + return MatMulKernelType::NATIVE_MMUL_FP; + } + + return MatMulKernelType::NATIVE_FP; +} + +MatMulKernelType ClMatMulNativeDefaultVariantValhall::configure_G715_quantized(int k, bool act_enabled) +{ + // MMUL kernel works only when K is a multiple of 16 + if (!act_enabled && k % 16 == 0) + { + return MatMulKernelType::NATIVE_MMUL_QUANTIZED; + } + + return MatMulKernelType::NATIVE_QUANTIZED; +} + +MatMulKernelType ClMatMulNativeDefaultVariantValhall::configure_default_float(int k, bool act_enabled) +{ + ARM_COMPUTE_UNUSED(k, act_enabled); + + return MatMulKernelType::NATIVE_FP; +} + +MatMulKernelType ClMatMulNativeDefaultVariantValhall::configure_default_quantized(int k, bool act_enabled) +{ + ARM_COMPUTE_UNUSED(k, act_enabled); + + return MatMulKernelType::NATIVE_QUANTIZED; +} + +} // namespace cl_matmul +} // namespace arm_compute diff --git a/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultVariantValhall.h b/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultVariantValhall.h new file mode 100644 index 0000000000..a202676e98 --- /dev/null +++ b/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultVariantValhall.h @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2023 Arm Limited. 
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEDEFAULTVARIANTVALHALL_H
+#define ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEDEFAULTVARIANTVALHALL_H
+
+#include "src/runtime/heuristics/matmul_native/IClMatMulNativeKernelVariant.h"
+
+namespace arm_compute
+{
+namespace cl_matmul
+{
+/** Valhall based OpenCL matmul variant selection */
+class ClMatMulNativeDefaultVariantValhall final : public IClMatMulNativeKernelVariant
+{
+public:
+    /** Constructor
+     *
+     * @param[in] gpu GPU target
+     */
+    ClMatMulNativeDefaultVariantValhall(GPUTarget gpu);
+
+    // Inherited overridden method
+    MatMulKernelType select_kernel(const ITensorInfo *lhs,
+                                   const ITensorInfo *rhs,
+                                   const MatMulInfo &info,
+                                   const ActivationLayerInfo &act_info) override;
+
+private:
+    MatMulKernelType configure_G715_float(int k, bool act_enabled);
+    MatMulKernelType configure_G715_quantized(int k, bool act_enabled);
+    MatMulKernelType configure_default_float(int k, bool act_enabled);
+    MatMulKernelType configure_default_quantized(int k, bool act_enabled);
+};
+} // namespace cl_matmul
+} // namespace arm_compute
+#endif // ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEDEFAULTVARIANTVALHALL_H
diff --git a/src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.cpp b/src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.cpp
new file mode 100644
index 0000000000..89cad30214
--- /dev/null
+++ b/src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.cpp
@@ -0,0 +1,134 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.h"
+
+#include "arm_compute/core/KernelDescriptors.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/TensorShape.h"
+
+#include "src/gpu/cl/kernels/ClMatMulNativeKernel.h"
+
+#include <cmath>
+#include <limits>
+#include <utility>
+
+namespace arm_compute
+{
+namespace cl_matmul
+{
+MatMulKernelInfo select_info(const MatMulKernelInfo &info0,
+                             const MatMulKernelInfo &info1,
+                             unsigned int m,
+                             unsigned int n,
+                             unsigned int k,
+                             unsigned int b,
+                             DataType data_type,
+                             bool rhs_lock_padding)
+{
+    ARM_COMPUTE_ERROR_ON_MSG(info1.export_rhs_to_cl_image == true,
+                             "The fallback MatMul configuration cannot have export_to_cl_image = true");
+    ARM_COMPUTE_ERROR_ON_MSG(info0.adj_lhs != info1.adj_lhs,
+                             "The MatMul configurations must have the same adj_lhs value");
+    ARM_COMPUTE_ERROR_ON_MSG(info0.adj_rhs != info1.adj_rhs,
+                             "The MatMul configurations must have the same adj_rhs value");
+
+    const bool adj_lhs = info0.adj_lhs;
+    const bool adj_rhs = info0.adj_rhs;
+
+    TensorInfo lhs_info =
+        !adj_lhs ? TensorInfo(TensorShape(k, m, b), 1, data_type) : TensorInfo(TensorShape(m, k, b), 1, data_type);
+    TensorInfo rhs_info =
+        !adj_rhs ? TensorInfo(TensorShape(n, k, b), 1, data_type) : TensorInfo(TensorShape(k, n, b), 1, data_type);
+    TensorInfo dst_info;
+
+    if (rhs_lock_padding == false)
+    {
+        if (bool(opencl::kernels::ClMatMulNativeKernel::validate(&lhs_info, &rhs_info, nullptr, &dst_info, info0)))
+        {
+            return info0;
+        }
+        else
+        {
+            return info1;
+        }
+    }
+    else
+    {
+        return info1;
+    }
+}
+
+MatMulKernelInfo find_info(const MatMulNativeConfigsMatrix &configs,
+                           bool adj_lhs,
+                           bool adj_rhs,
+                           unsigned int m,
+                           unsigned int n,
+                           unsigned int k,
+                           unsigned int b)
+{
+    size_t min_acc = std::numeric_limits<size_t>::max();
+    size_t min_idx = 0;
+
+    ARM_COMPUTE_ERROR_ON(configs.size() == 0);
+    const size_t num_rows = configs.size();
+    const size_t num_cols = configs[0].size();
+
+    ARM_COMPUTE_ERROR_ON_MSG(num_cols != 8U,
+                             "The entry should have 8 integer values representing: M, N, K, B, M0, N0,
K0, IMG_RHS"); + ARM_COMPUTE_UNUSED(num_cols); + + // Find nearest GeMM workload + // Note: the workload does not depend on the K dimension + for (size_t y = 0; y < num_rows; ++y) + { + size_t mc0 = static_cast<size_t>(configs[y][0]); + size_t nc0 = static_cast<size_t>(configs[y][1]); + size_t kc0 = static_cast<size_t>(configs[y][2]); + size_t bc0 = static_cast<size_t>(configs[y][3]); + + size_t acc = 0; + acc += (m - mc0) * (m - mc0); + acc += (n - nc0) * (n - nc0); + acc += (k - kc0) * (k - kc0); + acc += (b - bc0) * (b - bc0); + acc = std::sqrt(acc); + if (acc < min_acc) + { + min_acc = acc; + min_idx = y; + } + } + + // Get the configuration from the nearest GeMM shape + MatMulKernelInfo desc; + desc.adj_lhs = adj_lhs; + desc.adj_rhs = adj_rhs; + desc.m0 = configs[min_idx][4]; + desc.n0 = configs[min_idx][5]; + desc.k0 = configs[min_idx][6]; + desc.export_rhs_to_cl_image = configs[min_idx][7]; + + return desc; +} +} // namespace cl_matmul +} // namespace arm_compute diff --git a/src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.h b/src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.h new file mode 100644 index 0000000000..699f5fe8c1 --- /dev/null +++ b/src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.h @@ -0,0 +1,83 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEHELPERS_H +#define ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEHELPERS_H + +#include "arm_compute/core/Types.h" + +namespace arm_compute +{ +// Forward declaration +struct MatMulKernelInfo; + +namespace cl_matmul +{ +using MatMulNativeConfigsMatrix = std::vector<std::vector<int32_t>>; + +/** This function accepts two MatMulKernelInfo objects where only the first can be with cl_image2d support enabled. + * The aim of this function is to check whether the first MatMulKernelInfo object is valid. If not, the function will + * return the second MatMulKernelInfo object. Otherwise, the first one. 
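+ * In other words, info0 is the preferred configuration and info1 the safe fallback:
+ * info0 is only returned when the RHS paddings are not locked and
+ * ClMatMulNativeKernel::validate() accepts it.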
+ * + * @param[in] info0 MatMulKernelInfo with cl_image2d support + * @param[in] info1 MatMulKernelInfo to fall-back if cl_image2d cannot be used + * @param[in] m Number of rows (M) of the LHS matrix + * @param[in] n Number of columns (N) in the RHS matrix not reshaped + * @param[in] k Number of rows (K) in the RHS matrix not reshaped + * @param[in] b Batch size + * @param[in] data_type Data type + * @param[in] rhs_lock_padding Flag used to know whether the RHS paddings are locked + * + * @return @ref MatMulKernelInfo + */ +MatMulKernelInfo select_info(const MatMulKernelInfo &info0, + const MatMulKernelInfo &info1, + unsigned int m, + unsigned int n, + unsigned int k, + unsigned int b, + DataType data_type, + bool rhs_lock_padding); + +/** Find the preferred configurations for the MatMul Native kernel using the MatMulNativeConfigsMatrix provided by the user + * + * @param[in] configs List of best configurations for a limited number of MatMul shapes + * @param[in] adj_lhs Adjoint LHS flag value + * @param[in] adj_rhs Adjoint RHS flag value + * @param[in] m Number of rows (M) of the LHS matrix + * @param[in] n Number of columns (N) in the RHS matrix not reshaped + * @param[in] k Number of rows (K) in the RHS matrix not reshaped + * @param[in] b Batch size + * + * @return @ref MatMulKernelInfo + */ +MatMulKernelInfo find_info(const MatMulNativeConfigsMatrix &configs, + bool adj_lhs, + bool adj_rhs, + unsigned int m, + unsigned int n, + unsigned int k, + unsigned int b); +} // namespace cl_matmul +} // namespace arm_compute +#endif // ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEHELPERS_H diff --git a/src/runtime/heuristics/matmul_native/ClMatMulNativeKernelConfig.h b/src/runtime/heuristics/matmul_native/ClMatMulNativeKernelConfig.h new file mode 100644 index 0000000000..e7485bca81 --- /dev/null +++ b/src/runtime/heuristics/matmul_native/ClMatMulNativeKernelConfig.h @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+#ifndef ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEKERNELCONFIG_H
+#define ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEKERNELCONFIG_H
+
+#include "src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.h"
+#include "src/runtime/heuristics/matmul_native/IClMatMulNativeKernelConfig.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace cl_matmul
+{
+/** ClMatMul configuration factory class */
+class ClMatMulNativeKernelConfigurationFactory final
+{
+public:
+    /** Static method to create the ClMatMul configuration class according to the GPU target
+     *
+     * @param[in] gpu GPU target
+     *
+     * @return IClMatMulNativeKernelConfig
+     */
+    static std::unique_ptr<IClMatMulNativeKernelConfig> create(GPUTarget gpu)
+    {
+        switch (get_arch_from_target(gpu))
+        {
+            case GPUTarget::MIDGARD:
+            case GPUTarget::BIFROST:
+            case GPUTarget::VALHALL:
+            case GPUTarget::FIFTHGEN:
+                return std::make_unique<ClMatMulNativeDefaultConfigValhall>(gpu);
+            default:
+                ARM_COMPUTE_ERROR("Unsupported GPU target");
+        }
+    }
+};
+} // namespace cl_matmul
+} // namespace arm_compute
+#endif // ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEKERNELCONFIG_H
diff --git a/src/runtime/heuristics/matmul_native/ClMatMulNativeKernelVariant.h b/src/runtime/heuristics/matmul_native/ClMatMulNativeKernelVariant.h
new file mode 100644
index 0000000000..c2895b8919
--- /dev/null
+++ b/src/runtime/heuristics/matmul_native/ClMatMulNativeKernelVariant.h
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEKERNELVARIANT_H
+#define ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEKERNELVARIANT_H
+
+#include "src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultVariantValhall.h"
+#include "src/runtime/heuristics/matmul_native/IClMatMulNativeKernelVariant.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace cl_matmul
+{
+
+/** ClMatMul variant factory class */
+class ClMatMulNativeKernelVariantFactory final
+{
+public:
+    /** Static method to create the ClMatMul kernel variant class according to the GPU target
+     *
+     * @param[in] gpu GPU target
+     *
+     * @return IClMatMulNativeKernelVariant
+     */
+    static std::unique_ptr<IClMatMulNativeKernelVariant> create(GPUTarget gpu)
+    {
+        switch (get_arch_from_target(gpu))
+        {
+            case GPUTarget::MIDGARD:
+            case GPUTarget::BIFROST:
+            case GPUTarget::VALHALL:
+            case GPUTarget::FIFTHGEN:
+                return std::make_unique<ClMatMulNativeDefaultVariantValhall>(gpu);
+            default:
+                ARM_COMPUTE_ERROR("Unsupported GPU target");
+        }
+    }
+};
+} // namespace cl_matmul
+} // namespace arm_compute
+#endif // ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEKERNELVARIANT_H
diff --git a/src/runtime/heuristics/matmul_native/IClMatMulNativeKernelConfig.h b/src/runtime/heuristics/matmul_native/IClMatMulNativeKernelConfig.h
new file mode 100644
index 0000000000..00ba3641d5
--- /dev/null
+++ b/src/runtime/heuristics/matmul_native/IClMatMulNativeKernelConfig.h
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */ +#ifndef ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_ICLMATMULNATIVEKERNELCONFIG_H +#define ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_ICLMATMULNATIVEKERNELCONFIG_H + +#include "arm_compute/core/GPUTarget.h" +#include "arm_compute/core/KernelDescriptors.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/function_info/MatMulInfo.h" + +#include "src/core/common/Macros.h" + +namespace arm_compute +{ +namespace cl_matmul +{ +/** Basic container for the OpenCL MatMul Native configuration functions */ +template <class T> +class ClMatMulNativeConfigArray +{ +public: + /** Alias for F32 index */ + static constexpr size_t DT_F32 = 0; + /** Alias for F16 index */ + static constexpr size_t DT_F16 = 1; + /** Alias for Int8 index */ + static constexpr size_t DT_INT8 = 2; + + /** Constructor + * + * @param[in] func_f32 Function to call for matmul native F32 + * @param[in] func_f16 Function to call for matmul native F16 + * @param[in] func_int8 Function to call for matmul native Int8 (QASYMM8, QASYMM8_SIGNED, QSYMM8_PER_CHANNEL) + * + */ + ClMatMulNativeConfigArray(T func_f32, T func_f16, T func_int8) : _configs{func_f32, func_f16, func_int8} + { + } + + /** Method to return the matmul native configuration function based on data type + * + * @param[in] data_type Input data type + * + * @return the valid function otherwise it returns nullptr if the data type is not valid + */ + T get_function(DataType data_type) + { + switch (data_type) + { + case DataType::F32: + return _configs.at(DT_F32); + case DataType::F16: + return _configs.at(DT_F16); + case DataType::QASYMM8: + case DataType::QASYMM8_SIGNED: + case DataType::QSYMM8_PER_CHANNEL: + return _configs.at(DT_INT8); + default: + return nullptr; + } + } + +private: + std::array<T, 3> _configs; +}; + +/** Basic interface for the matmul native kernel configuration + * This is the base class that chooses architecture specific kernel configurations. +*/ +class IClMatMulNativeKernelConfig +{ +public: + /** Constructor + * + * @param[in] arch GPU target + */ + IClMatMulNativeKernelConfig(GPUTarget arch) : _target(arch) + { + } + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(IClMatMulNativeKernelConfig); + /** Virtual destructor */ + virtual ~IClMatMulNativeKernelConfig() = default; + /** This method returns the @ref MatMulKernelInfo for the given inputs + * + * @param[in] lhs LHS tensor + * @param[in] rhs RHS tensor + * @param[in] info MatMul info + */ + virtual MatMulKernelInfo configure(const ITensorInfo *lhs, const ITensorInfo *rhs, const MatMulInfo &info) = 0; + +protected: + GPUTarget _target; +}; +} // namespace cl_matmul +} // namespace arm_compute +#endif // ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_ICLMATMULNATIVEKERNELCONFIG_H diff --git a/src/runtime/heuristics/matmul_native/IClMatMulNativeKernelVariant.h b/src/runtime/heuristics/matmul_native/IClMatMulNativeKernelVariant.h new file mode 100644 index 0000000000..eac41dd6a3 --- /dev/null +++ b/src/runtime/heuristics/matmul_native/IClMatMulNativeKernelVariant.h @@ -0,0 +1,135 @@ +/* + * Copyright (c) 2023 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_ICLMATMULNATIVEKERNELVARIANT_H +#define ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_ICLMATMULNATIVEKERNELVARIANT_H + +#include "arm_compute/core/CoreTypes.h" // DataType +#include "arm_compute/core/GPUTarget.h" +#include "arm_compute/core/ITensorInfo.h" +#include "arm_compute/function_info/ActivationLayerInfo.h" +#include "arm_compute/function_info/MatMulInfo.h" + +#include "src/core/common/Macros.h" + +#include <array> + +namespace arm_compute +{ +namespace cl_matmul +{ +enum class MatMulKernelType +{ + /** Native matrix multiplication for FP types */ + NATIVE_FP, + + /** Native matrix multiplication for quantized types */ + NATIVE_QUANTIZED, + + /** Native matrix multiplication using MMUL extension for FP types */ + NATIVE_MMUL_FP, + + /** Native matrix multiplication using MMUL extension for Quantized types */ + NATIVE_MMUL_QUANTIZED +}; + +/** Basic container for the OpenCL MatMul Native variant functions */ +template <class T> +class ClMatMulNativeVariantArray +{ +public: + /** Alias for Float index */ + static constexpr size_t DT_FLOAT = 0; + /** Alias for Quantized type index */ + static constexpr size_t DT_QUANTIZED = 1; + + /** Constructor + * + * @param[in] func_float Function to call for matmul native float (F32, F16) + * @param[in] func_quantized Function to call for matmul native quantized (QASYMM8, QASYMM8_SIGNED, QSYMM8_PER_CHANNEL) + * + */ + ClMatMulNativeVariantArray(T func_float, T func_quantized) : _configs{func_float, func_quantized} + { + } + + /** Method to return the matmul native variant function based on data type + * + * @param[in] data_type Input data type + * + * @return the valid function otherwise it returns nullptr if the data type is not valid + */ + T get_function(DataType data_type) + { + switch (data_type) + { + case DataType::F32: + case DataType::F16: + return _configs.at(DT_FLOAT); + case DataType::QASYMM8: + case DataType::QASYMM8_SIGNED: + case DataType::QSYMM8_PER_CHANNEL: + return _configs.at(DT_QUANTIZED); + default: + return nullptr; + } + } + +private: + std::array<T, 2> _configs; +}; + +/** Basic interface for the matmul native kernel variant + * This is the base class that chooses architecture specific kernel variants. 
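+ * Concrete implementations (for example ClMatMulNativeDefaultVariantValhall) map the
+ * input data type, the K dimension and the presence of a fused activation to one of
+ * the MatMulKernelType values above.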
+*/
+class IClMatMulNativeKernelVariant
+{
+public:
+    /** Constructor
+     *
+     * @param[in] arch GPU target
+     */
+    IClMatMulNativeKernelVariant(GPUTarget arch) : _target(arch)
+    {
+    }
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(IClMatMulNativeKernelVariant);
+    /** Virtual destructor */
+    virtual ~IClMatMulNativeKernelVariant() = default;
+    /** This method returns the @ref MatMulKernelType for the given inputs
+     *
+     * @param[in] lhs      LHS tensor
+     * @param[in] rhs      RHS tensor
+     * @param[in] info     MatMul info
+     * @param[in] act_info Activation layer info
+     */
+    virtual MatMulKernelType select_kernel(const ITensorInfo *lhs,
+                                           const ITensorInfo *rhs,
+                                           const MatMulInfo &info,
+                                           const ActivationLayerInfo &act_info) = 0;
+
+protected:
+    GPUTarget _target;
+};
+} // namespace cl_matmul
+} // namespace arm_compute
+#endif // ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_ICLMATMULNATIVEKERNELVARIANT_H
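For orientation, a minimal usage sketch of how the two matmul heuristic factories introduced above might be driven by a caller. This is not part of the patch: the tensor shapes, the G710 target, and the default-constructed MatMulInfo and ActivationLayerInfo are illustrative assumptions.

// Usage sketch (assumed shapes/target; not part of the patch above).
#include "arm_compute/core/Error.h"
#include "arm_compute/core/TensorInfo.h"
#include "src/runtime/heuristics/matmul_native/ClMatMulNativeKernelConfig.h"
#include "src/runtime/heuristics/matmul_native/ClMatMulNativeKernelVariant.h"

using namespace arm_compute;

void pick_matmul_heuristics()
{
    const GPUTarget gpu = GPUTarget::G710;

    // LHS is M x K, RHS is K x N with no adjoint and batch size 1, matching the
    // shape convention used by select_info() above.
    const TensorInfo lhs(TensorShape(64U, 128U, 1U), 1, DataType::F32); // K, M, B
    const TensorInfo rhs(TensorShape(32U, 64U, 1U), 1, DataType::F32);  // N, K, B

    const MatMulInfo          mm_info{};  // adj_lhs and adj_rhs default to false
    const ActivationLayerInfo act_info{}; // no fused activation

    // First choose the kernel variant (plain native vs. MMUL-based) ...
    auto variant = cl_matmul::ClMatMulNativeKernelVariantFactory::create(gpu);
    const cl_matmul::MatMulKernelType type = variant->select_kernel(&lhs, &rhs, mm_info, act_info);

    // ... then the blocking configuration (m0/n0/k0, cl_image2d export).
    auto config = cl_matmul::ClMatMulNativeKernelConfigurationFactory::create(gpu);
    const MatMulKernelInfo desc = config->configure(&lhs, &rhs, mm_info);

    ARM_COMPUTE_UNUSED(type, desc);
}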