517 files changed, 4903 insertions, 51791 deletions
diff --git a/arm_compute/core/AccessWindowAutoPadding.h b/arm_compute/core/AccessWindowAutoPadding.h
deleted file mode 100644
index 8a182c6eb4..0000000000
--- a/arm_compute/core/AccessWindowAutoPadding.h
+++ /dev/null
@@ -1,85 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_ACCESS_WINDOW_AUTO_PADDING_H
-#define ARM_COMPUTE_ACCESS_WINDOW_AUTO_PADDING_H
-
-#include "arm_compute/core/Coordinates.h"
-#include "arm_compute/core/IAccessWindow.h"
-#include "arm_compute/core/TensorShape.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-class Window;
-class ITensorInfo;
-
-/** Dummy access window.
- *
- * This implementation always uses the auto padding of the tensor info and
- * never updates the window. The valid region is always set to cover the entire
- * tensor.
- *
- * @note This access window is only used during the migration to the new
- *       padding system. It will be removed once all kernels have been ported.
- *
- * */
-class AccessWindowAutoPadding : public IAccessWindow
-{
-public:
-    /** Default constructor.
-     *
-     * @param[in,out] info Tensor info of the accessed kernel.
-     */
-    AccessWindowAutoPadding(ITensorInfo *info);
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    AccessWindowAutoPadding(const AccessWindowAutoPadding &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    AccessWindowAutoPadding &operator=(const AccessWindowAutoPadding &) = delete;
-    /** Allow instances of this class to be move constructed */
-    AccessWindowAutoPadding(AccessWindowAutoPadding &&) = default;
-    /** Allow instances of this class to be moved */
-    AccessWindowAutoPadding &operator=(AccessWindowAutoPadding &&) = default;
-    /** Default destructor */
-    ~AccessWindowAutoPadding() = default;
-
-    /** Set the valid region to match the entire tensor. */
-    void set_valid_region();
-
-    /** Return a valid region that spans across the entire tensor.
-     *
-     * @return a valid region.
-     *
-     */
-    ValidRegion compute_valid_region() const;
-
-    // Inherited methods overridden:
-    bool update_window_if_needed(Window &window) const override;
-    bool update_padding_if_needed(const Window &window) override;
-    ValidRegion compute_valid_region(const Window &window, ValidRegion input_valid_region, bool border_undefined, BorderSize border_size) const override;
-
-private:
-    ITensorInfo *_info;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_ACCESS_WINDOW_AUTO_PADDING_H*/
diff --git a/arm_compute/core/AccessWindowStatic.h b/arm_compute/core/AccessWindowStatic.h
deleted file mode 100644
index e40c188fcd..0000000000
--- a/arm_compute/core/AccessWindowStatic.h
+++ /dev/null
@@ -1,101 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_IACCESS_WINDOW_STATIC_H
-#define ARM_COMPUTE_IACCESS_WINDOW_STATIC_H
-
-#include "arm_compute/core/Coordinates.h"
-#include "arm_compute/core/IAccessWindow.h"
-#include "arm_compute/core/TensorShape.h"
-#include "arm_compute/core/Types.h"
-
-#include <array>
-
-namespace arm_compute
-{
-class Window;
-class ITensorInfo;
-
-/** Implementation of a static rectangular access pattern.
- *
- * In this implementation the access offsets and sizes are not relative to the
- * current element. Instead they are considered to be absolute coordinates
- * within the accessed tensor's shape.
- *
- * */
-class AccessWindowStatic : public IAccessWindow
-{
-public:
-    /** Constructor for a static access pattern.
-     *
-     * @param[in,out] info    Tensor info of the accessed kernel.
-     * @param[in]     start_x Start of the access in X direction.
-     * @param[in]     start_y Start of the access in Y direction.
-     * @param[in]     end_x   End of the access in X direction.
-     * @param[in]     end_y   End of the access in Y direction.
-     */
-    AccessWindowStatic(ITensorInfo *info, int start_x, int start_y, int end_x, int end_y);
-
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    AccessWindowStatic(const AccessWindowStatic &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    AccessWindowStatic &operator=(const AccessWindowStatic &) = delete;
-    /** Allow instances of this class to be move constructed */
-    AccessWindowStatic(AccessWindowStatic &&) = default;
-    /** Allow instances of this class to be moved */
-    AccessWindowStatic &operator=(AccessWindowStatic &&) = default;
-    /** Default destructor */
-    ~AccessWindowStatic() = default;
-
-    /** Set the valid region based on the static access pattern and valid
-     *  region of the inputs.
-     *
-     * @param[in] window             Execution window of the kernel.
-     * @param[in] input_valid_region Combined valid region of all inputs.
-     */
-    void set_valid_region(const Window &window, const ValidRegion &input_valid_region);
-
-    /** Compute the valid region based on the static access pattern and valid region of the inputs.
-     *
-     * @param[in] window             Execution window of the kernel.
-     * @param[in] input_valid_region Combined valid region of all inputs.
-     *
-     * @return a valid region.
-     *
-     */
-    ValidRegion compute_valid_region(const Window &window, ValidRegion input_valid_region) const;
-
-    // Inherited methods overriden:
-    bool update_window_if_needed(Window &window) const override;
-    bool update_padding_if_needed(const Window &window) override;
-    ValidRegion compute_valid_region(const Window &window, ValidRegion input_valid_region, bool border_undefined, BorderSize border_size) const override;
-
-private:
-    ITensorInfo *_info;
-    int          _start_x;
-    int          _start_y;
-    int          _end_x;
-    int          _end_y;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_IACCESS_WINDOW_STATIC_H*/
diff --git a/arm_compute/core/AccessWindowTranspose.h b/arm_compute/core/AccessWindowTranspose.h
deleted file mode 100644
index 16105bce7c..0000000000
--- a/arm_compute/core/AccessWindowTranspose.h
+++ /dev/null
@@ -1,48 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_IACCESS_WINDOW_TRANSPOSE_H
-#define ARM_COMPUTE_IACCESS_WINDOW_TRANSPOSE_H
-
-#include "arm_compute/core/Coordinates.h"
-#include "arm_compute/core/IAccessWindow.h"
-#include "arm_compute/core/TensorShape.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-class Window;
-class ITensorInfo;
-
-/** Implementation of a XY-transpose access pattern. */
-class AccessWindowTranspose : public AccessWindowRectangle
-{
-public:
-    using AccessWindowRectangle::AccessWindowRectangle;
-    bool update_window_if_needed(Window &window) const override;
-    bool update_padding_if_needed(const Window &window) override;
-    using AccessWindowRectangle::compute_valid_region;
-    ValidRegion compute_valid_region(const Window &window, ValidRegion input_valid_region, bool border_undefined, BorderSize border_size) const override;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_IACCESS_WINDOW_TRANSPOSE_H*/
diff --git a/arm_compute/core/CL/CLCompileContext.h b/arm_compute/core/CL/CLCompileContext.h
index 2b6d8cd2cb..dcd3b45670 100644
--- a/arm_compute/core/CL/CLCompileContext.h
+++ b/arm_compute/core/CL/CLCompileContext.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020 ARM Limited.
+ * Copyright (c) 2020-2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -77,6 +77,8 @@ public:
      */
     const StringSet &options() const;
 
+    bool operator==(const CLBuildOptions &other) const;
+
 private:
     StringSet _build_opts; /**< Build options set */
 };
@@ -118,6 +120,14 @@ public:
     {
         return _name;
     }
+    /** Returns program binary data.
+     *
+     * @return Program's binary data.
+     */
+    const std::vector<unsigned char> &binary() const
+    {
+        return _binary;
+    }
     /** User-defined conversion to the underlying CL program.
      *
      * @return The CL program object.
@@ -240,8 +250,12 @@ public:
      *
      * @return The created kernel.
      */
-    Kernel create_kernel(const std::string &kernel_name, const std::string &program_name, const std::string &program_source,
-                         const std::string &kernel_path, const StringSet &build_options_set, bool is_binary) const;
+    Kernel create_kernel(const std::string &kernel_name,
+                         const std::string &program_name,
+                         const std::string &program_source,
+                         const std::string &kernel_path,
+                         const StringSet   &build_options_set,
+                         bool               is_binary) const;
 
     /** Clear the library's cache of binary programs
      */
@@ -288,6 +302,24 @@ public:
      */
     bool int64_base_atomics_supported() const;
 
+    /** Returns true if the workgroup batch size modifier parameter is supported on the cl device
+     *
+     * @return true if the workgroup batch size modifier parameter is supported, false otherwise
+     */
+    bool is_wbsm_supported() const;
+
+    /** Return the DDK version. If the DDK version cannot be detected, return -1.
+     *
+     * @return The DDK version.
+     */
+    int32_t get_ddk_version() const;
+
+    /** Return the GPU target of the associated device
+     *
+     * @return GPUTarget
+     */
+    GPUTarget get_gpu_target() const;
+
 private:
     /** Load program and its dependencies.
      *
@@ -295,7 +327,8 @@ private:
      * @param[in] program_source Source of the program.
      * @param[in] is_binary      Flag to indicate if the program source is binary.
      */
-    const Program &load_program(const std::string &program_name, const std::string &program_source, bool is_binary) const;
+    const Program &
+    load_program(const std::string &program_name, const std::string &program_source, bool is_binary) const;
 
     /** Generates the build options given a string of user defined ones
      *
@@ -315,10 +348,11 @@ private:
      */
     std::string stringify_set(const StringSet &s, const std::string &kernel_path) const;
 
-    cl::Context _context;                                             /**< Underlying CL context. */
-    CLDevice    _device;                                              /**< Underlying CL device. */
+    cl::Context                                  _context;            /**< Underlying CL context. */
+    CLDevice                                     _device;             /**< Underlying CL device. */
     mutable std::map<std::string, const Program> _programs_map;       /**< Map with all already loaded program data. */
     mutable std::map<std::string, cl::Program>   _built_programs_map; /**< Map with all already built program data. */
+    bool _is_wbsm_supported; /**< True if the workgroup batch size modifier is supported on the cl device. */
 };
 } // namespace arm_compute
 #endif /* ARM_COMPUTE_CLCOMPILECONTEXT_H */
diff --git a/arm_compute/core/CL/CLCoreRuntimeContext.h b/arm_compute/core/CL/CLCoreRuntimeContext.h
deleted file mode 100644
index 2b2269dece..0000000000
--- a/arm_compute/core/CL/CLCoreRuntimeContext.h
+++ /dev/null
@@ -1,75 +0,0 @@
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLCORERUNTIME_CONTEXT_H
-#define ARM_COMPUTE_CLCORERUNTIME_CONTEXT_H
-
-#include "arm_compute/core/CL/OpenCL.h"
-
-namespace arm_compute
-{
-// Forward declarations
-class CLKernelLibrary;
-
-/** Core runtime context for OpenCL */
-class CLCoreRuntimeContext final
-{
-public:
-    /** Legacy constructor */
-    CLCoreRuntimeContext();
-
-    /** Constructor */
-    CLCoreRuntimeContext(CLKernelLibrary *kernel_lib, cl::Context ctx, cl::CommandQueue queue);
-    /** Destructor */
-    ~CLCoreRuntimeContext() = default;
-    /** Default copy constructor */
-    CLCoreRuntimeContext(const CLCoreRuntimeContext &) = default;
-    /** Default move constructor */
-    CLCoreRuntimeContext(CLCoreRuntimeContext &&) = default;
-    /** Default copy assignment */
-    CLCoreRuntimeContext &operator=(const CLCoreRuntimeContext &) = default;
-    /** Default move assignment operator */
-    CLCoreRuntimeContext &operator=(CLCoreRuntimeContext &&) = default;
-    /** Kernel Library accessor
-     *
-     * @return The kernel library instance used by the core context
-     */
-    CLKernelLibrary *kernel_library() const;
-    /** OpenCL context accessor
-     *
-     * @return The OpenCL context used by the core context
-     */
-    cl::Context context();
-    /** OpenCL command queue accessor
-     *
-     * @return The OpenCL queue used by the core context
-     */
-    cl::CommandQueue queue();
-
-private:
-    CLKernelLibrary *_kernel_lib{ nullptr };
-    cl::Context      _ctx{};
-    cl::CommandQueue _queue{};
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLCORERUNTIME_CONTEXT_H */
diff --git a/arm_compute/core/CL/CLDevice.h b/arm_compute/core/CL/CLDevice.h
index 812834743d..ded6bb8493 100644
--- a/arm_compute/core/CL/CLDevice.h
+++ b/arm_compute/core/CL/CLDevice.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020 ARM Limited.
+ * Copyright (c) 2020-2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -30,6 +30,7 @@
 #include "arm_compute/core/IDevice.h"
 
 #include <set>
+#include <sstream>
 #include <string>
 
 namespace arm_compute
@@ -43,8 +44,7 @@ class CLDevice : public IDevice
 {
 public:
     /** Default Constructor */
-    CLDevice()
-        : _device(cl::Device()), _options()
+    CLDevice() : _device(cl::Device()), _options()
     {
     }
 
@@ -52,8 +52,7 @@ public:
      *
      * @param[in] cl_device OpenCL device
      */
-    CLDevice(const cl::Device &cl_device)
-        : _device(), _options()
+    CLDevice(const cl::Device &cl_device) : _device(), _options()
     {
         _device = cl_device;
 
@@ -65,13 +64,13 @@ public:
         std::string extensions = _device.getInfo<CL_DEVICE_EXTENSIONS>();
 
         std::istringstream iss(extensions);
-        for(std::string s; iss >> s;)
+        for (std::string s; iss >> s;)
         {
             _options.extensions.insert(s);
         }
 
         // SW workaround for G76
-        if(_options.gpu_target == GPUTarget::G76)
+        if (_options.gpu_target == GPUTarget::G76)
         {
             _options.extensions.insert("cl_arm_integer_dot_product_int8");
         }
@@ -142,6 +141,32 @@ public:
         return _options.extensions.count(extension) != 0;
     }
 
+    /** Returns whether non-uniform workgroup is supported and the build options.
+     *
+     * If the feature is supported, the build options required to enable it are
+     * returned alongside the flag; otherwise the build options string is empty.
+     *
+     * @return A tuple (supported, build_options) indicating whether the feature
+     *         is supported and the corresponding build options to enable it.
+     */
+    std::tuple<bool, std::string> is_non_uniform_workgroup_supported() const
+    {
+        if (version() == CLVersion::CL30 && get_cl_non_uniform_work_group_supported(_device))
+        {
+            return {true, " -cl-std=CL3.0 "};
+        }
+        else if (version() == CLVersion::CL20)
+        {
+            return {true, " -cl-std=CL2.0 "};
+        }
+        else if (supported("cl_arm_non_uniform_work_group_size"))
+        {
+            return {true, " -cl-arm-non-uniform-work-group-size "};
+        }
+
+        return {false, ""};
+    }
+
 private:
     cl::Device             _device;  /**< OpenCL device. */
     struct CLDeviceOptions _options; /**< OpenCL device options */
diff --git a/arm_compute/core/CL/CLHelpers.h b/arm_compute/core/CL/CLHelpers.h
index fc3f4d5db0..1a639e47f9 100644
--- a/arm_compute/core/CL/CLHelpers.h
+++ b/arm_compute/core/CL/CLHelpers.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -26,13 +26,13 @@
 
 #include "arm_compute/core/CL/CLTypes.h"
 #include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Types.h"
 
 #include <set>
 #include <string>
 
 namespace arm_compute
 {
-class CLCoreRuntimeContext;
 class CLCompileContext;
 class CLBuildOptions;
 
@@ -41,6 +41,9 @@ enum class DataType;
 /** Max vector width of an OpenCL vector */
 static constexpr unsigned int max_cl_vector_width = 16;
 
+/** Max number of manual loop unrolling */
+static constexpr int max_manual_loop_unrolling = 128;
+
 /** Translates a tensor data type to the appropriate OpenCL type.
  *
  * @param[in] dt @ref DataType to be translated to OpenCL type.
@@ -97,14 +100,6 @@ std::string get_cl_dot8_acc_type_from_data_type(const DataType &dt);
  */
 std::string get_data_size_from_data_type(const DataType &dt);
 
-/** Translates fixed point tensor data type to the underlying OpenCL type.
- *
- * @param[in] dt @ref DataType to be translated to OpenCL type.
- *
- * @return The string specifying the underlying OpenCL type to be used.
- */
-std::string get_underlying_cl_type_from_data_type(const DataType &dt);
-
 /** Helper function to get the GPU target from CL device
  *
  * @param[in] device A CL device
@@ -129,6 +124,14 @@ CLVersion get_cl_version(const cl::Device &device);
  */
 size_t get_cl_image_pitch_alignment(const cl::Device &device);
 
+/** Helper function to check whether non-uniform work group is supported
+ *
+ * @param[in] device A CL device
+ *
+ * @return True if the feature is supported
+ */
+bool get_cl_non_uniform_work_group_supported(const cl::Device &device);
+
 /** Helper function to check whether a given extension is supported
  *
  * @param[in] device         A CL device
@@ -176,7 +179,9 @@ bool dot8_acc_supported(const cl::Device &device);
  *
  * @return True if the configuration is supported
  */
-bool cl_winograd_convolution_layer_supported(const Size2D &output_tile, const Size2D &kernel_size, DataLayout data_layout);
+bool cl_winograd_convolution_layer_supported(const Size2D &output_tile,
+                                             const Size2D &kernel_size,
+                                             DataLayout    data_layout);
 
 /** Helper function to get the preferred native vector width size for built-in scalar types that can be put into vectors
  *
@@ -204,16 +209,6 @@ bool preferred_dummy_work_items_support(const cl::Device &device);
  */
 bool image2d_from_buffer_supported(const cl::Device &device);
 
-/** Creates an opencl kernel
- *
- * @param[in] ctx         A context to be used to create the opencl kernel.
- * @param[in] kernel_name The kernel name.
- * @param[in] build_opts  The build options to be used for the opencl kernel compilation.
- *
- * @return An opencl kernel
- */
-cl::Kernel create_opencl_kernel(CLCoreRuntimeContext *ctx, const std::string &kernel_name, const CLBuildOptions &build_opts);
-
 /** Creates an opencl kernel using a compile context
  *
  * @param[in] ctx         A compile context to be used to create the opencl kernel.
@@ -222,7 +217,9 @@ cl::Kernel create_opencl_kernel(CLCoreRuntimeContext *ctx, const std::string &ke
  *
  * @return An opencl kernel
  */
-cl::Kernel create_kernel(const CLCompileContext &ctx, const std::string &kernel_name, const std::set<std::string> &build_opts = std::set<std::string>());
+cl::Kernel create_kernel(const CLCompileContext      &ctx,
+                         const std::string           &kernel_name,
+                         const std::set<std::string> &build_opts = std::set<std::string>());
 
 /** Creates a suitable LWS hint object for parallel implementations. Sets the number of WG based on the input size.
  *  If input width is smaller than 128 we can use fewer threads than 8.
@@ -234,5 +231,62 @@ cl::Kernel create_kernel(const CLCompileContext &ctx, const std::string &kernel_
  */
 cl::NDRange create_lws_hint_parallel_implementations(unsigned int input_dimension, unsigned int vector_size);
 
+/** Helper function to check if the workgroup batch size modifier parameter is supported on the cl device
+ *
+ * @param[in] device cl device to check for support
+ *
+ * @return true if the workgroup batch size modifier parameter is supported, false otherwise
+ */
+bool get_wbsm_support_info(const cl::Device &device);
+
+/** Helper function to set the workgroup batch size modifier parameter in the kernel
+ *
+ * @param[in] kernel    cl kernel to set the workgroup batch size modifier parameter
+ * @param[in] wbsm_hint workgroup batch size modifier to use
+ */
+void set_wbsm(cl::Kernel &kernel, cl_int wbsm_hint);
+
+/** Helper function to check if we can export the tensor to cl_image
+ *
+ * @param[in] tensor Tensor info of the tensor to check
+ *
+ * @return true if we can export the tensor to cl_image
+ */
+bool export_to_cl_image(const ITensorInfo *tensor);
+
+/** Helper function to force unroll with pragma when any of the input values (iterations) are greater than @ref max_manual_loop_unrolling
+ *
+ * This function passes UNROLL_WITH_PRAGMA at compile time when any of the input values are greater than @ref max_manual_loop_unrolling
+ *
+ * @param[in,out] built_opts OpenCL kernel build options
+ * @param[in]     values     Input values (iterations)
+ *
+ */
+void set_unroll_with_pragma(CLBuildOptions &built_opts, std::initializer_list<int> values);
+
+/** Helper function to check whether the cl_arm_matrix_multiply extension is supported
+ *
+ * @param[in] device A CL device
+ *
+ * @return True if the extension is supported
+ */
+bool arm_matrix_multiply_supported(const cl::Device &device);
+
+/** Check whether cl_khr_command_buffer extension is supported by the specified CL device.
+ *
+ * @param[in] device The CL device
+ *
+ * @return True if the extension is supported by the CL device.
+ */
+bool command_buffer_supported(const cl::Device &device);
+
+/** Check whether cl_khr_command_buffer_mutable_dispatch extension is supported by the specified CL device.
+ *
+ * @param[in] device The CL device
+ *
+ * @return True if the extension is supported by the CL device.
+ */
+bool command_buffer_mutable_dispatch_supported(const cl::Device &device);
+
 } // namespace arm_compute
 #endif /* ARM_COMPUTE_CLHELPERS_H */
diff --git a/arm_compute/core/CL/CLKernelLibrary.h b/arm_compute/core/CL/CLKernelLibrary.h
index 6c5df6cb08..527733ccf1 100644
--- a/arm_compute/core/CL/CLKernelLibrary.h
+++ b/arm_compute/core/CL/CLKernelLibrary.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -47,7 +47,7 @@ private:
 
 public:
     /** Access the KernelLibrary singleton.
-     * This method has been deprecated and will be removed in the next release.
+     * This method has been deprecated and will be removed in future releases.
      * @return The KernelLibrary instance.
      */
     static CLKernelLibrary &get();
@@ -148,6 +148,12 @@ public:
      */
     std::string get_program_name(const std::string &kernel_name) const;
 
+    /** Returns true if the workgroup batch size modifier parameter is supported on the cl device
+     *
+     * @return true if the workgroup batch size modifier parameter is supported, false otherwise
+     */
+    bool is_wbsm_supported();
+
     /** Sets the CL context used to create programs.
      *
      * @note Setting the context also resets the device to the
@@ -164,11 +170,7 @@ public:
     CLCompileContext &get_compile_context();
 
 private:
-    CLCompileContext _compile_context;                                   /**< Compile Context. */
-    std::string      _kernel_path;                                       /**< Path to the kernels folder. */
-    static const std::map<std::string, std::string> _kernel_program_map; /**< Map that associates kernel names with programs. */
-    static const std::map<std::string, std::string> _program_source_map; /**< Contains sources for all programs.
-                                                                            Used for compile-time kernel inclusion. >*/
+    CLCompileContext _compile_context; /**< Compile Context. */
 };
 } // namespace arm_compute
 #endif /* ARM_COMPUTE_CLKERNELLIBRARY_H */
diff --git a/arm_compute/core/CL/CLKernels.h b/arm_compute/core/CL/CLKernels.h
deleted file mode 100644
index cd26399390..0000000000
--- a/arm_compute/core/CL/CLKernels.h
+++ /dev/null
@@ -1,165 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLKERNELS_H
-#define ARM_COMPUTE_CLKERNELS_H
-
-/* Header regrouping all the CL kernels */
-#include "arm_compute/core/CL/kernels/CLAbsoluteDifferenceKernel.h"
-#include "arm_compute/core/CL/kernels/CLAccumulateKernel.h"
-#include "arm_compute/core/CL/kernels/CLActivationLayerKernel.h"
-#include "arm_compute/core/CL/kernels/CLArgMinMaxLayerKernel.h"
-#include "arm_compute/core/CL/kernels/CLBatchConcatenateLayerKernel.h"
-#include "arm_compute/core/CL/kernels/CLBatchNormalizationLayerKernel.h"
-#include "arm_compute/core/CL/kernels/CLBatchToSpaceLayerKernel.h"
-#include "arm_compute/core/CL/kernels/CLBitwiseAndKernel.h"
-#include "arm_compute/core/CL/kernels/CLBitwiseNotKernel.h"
-#include "arm_compute/core/CL/kernels/CLBitwiseOrKernel.h"
-#include "arm_compute/core/CL/kernels/CLBitwiseXorKernel.h"
-#include "arm_compute/core/CL/kernels/CLBoundingBoxTransformKernel.h"
-#include "arm_compute/core/CL/kernels/CLBox3x3Kernel.h"
-#include "arm_compute/core/CL/kernels/CLCannyEdgeKernel.h"
-#include "arm_compute/core/CL/kernels/CLChannelCombineKernel.h"
-#include "arm_compute/core/CL/kernels/CLChannelExtractKernel.h"
-#include "arm_compute/core/CL/kernels/CLChannelShuffleLayerKernel.h"
-#include "arm_compute/core/CL/kernels/CLCol2ImKernel.h"
-#include "arm_compute/core/CL/kernels/CLColorConvertKernel.h"
-#include "arm_compute/core/CL/kernels/CLComparisonKernel.h"
-#include "arm_compute/core/CL/kernels/CLConvertFullyConnectedWeightsKernel.h"
-#include "arm_compute/core/CL/kernels/CLConvolutionKernel.h"
-#include "arm_compute/core/CL/kernels/CLCopyKernel.h"
-#include "arm_compute/core/CL/kernels/CLCropKernel.h"
-#include "arm_compute/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h"
-#include "arm_compute/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h"
-#include "arm_compute/core/CL/kernels/CLDepthConcatenateLayerKernel.h"
-#include "arm_compute/core/CL/kernels/CLDepthConvertLayerKernel.h"
-#include "arm_compute/core/CL/kernels/CLDepthToSpaceLayerKernel.h"
-#include "arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.h"
-#include "arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.h"
-#include "arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h"
-#include "arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayerReshapeWeightsKernel.h"
-#include "arm_compute/core/CL/kernels/CLDequantizationLayerKernel.h"
-#include "arm_compute/core/CL/kernels/CLDerivativeKernel.h"
-#include "arm_compute/core/CL/kernels/CLDilateKernel.h"
-#include "arm_compute/core/CL/kernels/CLDirectConvolutionLayerKernel.h"
-#include "arm_compute/core/CL/kernels/CLElementWiseUnaryLayerKernel.h"
-#include "arm_compute/core/CL/kernels/CLElementwiseOperationKernel.h"
-#include "arm_compute/core/CL/kernels/CLErodeKernel.h"
-#include "arm_compute/core/CL/kernels/CLFFTDigitReverseKernel.h"
-#include "arm_compute/core/CL/kernels/CLFFTRadixStageKernel.h"
-#include "arm_compute/core/CL/kernels/CLFFTScaleKernel.h"
-#include "arm_compute/core/CL/kernels/CLFastCornersKernel.h"
-#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h"
-#include "arm_compute/core/CL/kernels/CLFlattenLayerKernel.h"
-#include "arm_compute/core/CL/kernels/CLFloorKernel.h"
-#include "arm_compute/core/CL/kernels/CLFuseBatchNormalizationKernel.h"
-#include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h"
-#include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.h"
-#include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h"
-#include "arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h"
-#include "arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h"
-#include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.h"
-#include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.h"
-#include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h"
-#include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h"
-#include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h"
-#include "arm_compute/core/CL/kernels/CLGEMMLowpReductionKernel.h"
-#include "arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h"
-#include "arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h"
-#include "arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyNativeKernel.h"
-#include "arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.h"
-#include "arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.h"
-#include "arm_compute/core/CL/kernels/CLGEMMMatrixVectorMultiplyKernel.h"
-#include "arm_compute/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.h"
-#include "arm_compute/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h"
-#include "arm_compute/core/CL/kernels/CLGatherKernel.h"
-#include "arm_compute/core/CL/kernels/CLGaussian3x3Kernel.h"
-#include "arm_compute/core/CL/kernels/CLGaussian5x5Kernel.h"
-#include "arm_compute/core/CL/kernels/CLGaussianPyramidKernel.h"
-#include "arm_compute/core/CL/kernels/CLGenerateProposalsLayerKernel.h"
-#include "arm_compute/core/CL/kernels/CLHOGDescriptorKernel.h"
-#include "arm_compute/core/CL/kernels/CLHOGDetectorKernel.h"
-#include "arm_compute/core/CL/kernels/CLHarrisCornersKernel.h"
-#include "arm_compute/core/CL/kernels/CLHeightConcatenateLayerKernel.h"
-#include "arm_compute/core/CL/kernels/CLHistogramKernel.h"
-#include "arm_compute/core/CL/kernels/CLIm2ColKernel.h"
-#include "arm_compute/core/CL/kernels/CLInstanceNormalizationLayerKernel.h"
-#include "arm_compute/core/CL/kernels/CLIntegralImageKernel.h"
-#include "arm_compute/core/CL/kernels/CLL2NormalizeLayerKernel.h"
-#include "arm_compute/core/CL/kernels/CLLKTrackerKernel.h"
-#include "arm_compute/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.h"
-#include "arm_compute/core/CL/kernels/CLMagnitudePhaseKernel.h"
-#include "arm_compute/core/CL/kernels/CLMeanStdDevKernel.h"
-#include "arm_compute/core/CL/kernels/CLMeanStdDevNormalizationKernel.h"
-#include "arm_compute/core/CL/kernels/CLMedian3x3Kernel.h"
-#include "arm_compute/core/CL/kernels/CLMemsetKernel.h"
-#include "arm_compute/core/CL/kernels/CLMinMaxLayerKernel.h"
-#include "arm_compute/core/CL/kernels/CLMinMaxLocationKernel.h"
-#include "arm_compute/core/CL/kernels/CLNonLinearFilterKernel.h"
-#include "arm_compute/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.h"
-#include "arm_compute/core/CL/kernels/CLNormalizationLayerKernel.h"
-#include "arm_compute/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h"
-#include "arm_compute/core/CL/kernels/CLPadLayerKernel.h"
-#include "arm_compute/core/CL/kernels/CLPermuteKernel.h"
-#include "arm_compute/core/CL/kernels/CLPixelWiseMultiplicationKernel.h"
-#include "arm_compute/core/CL/kernels/CLPoolingLayerKernel.h"
-#include "arm_compute/core/CL/kernels/CLPriorBoxLayerKernel.h"
-#include "arm_compute/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h"
-#include "arm_compute/core/CL/kernels/CLQuantizationLayerKernel.h"
-#include "arm_compute/core/CL/kernels/CLROIAlignLayerKernel.h"
-#include "arm_compute/core/CL/kernels/CLROIPoolingLayerKernel.h"
-#include "arm_compute/core/CL/kernels/CLRangeKernel.h"
-#include "arm_compute/core/CL/kernels/CLReductionOperationKernel.h"
-#include "arm_compute/core/CL/kernels/CLRemapKernel.h"
-#include "arm_compute/core/CL/kernels/CLReorgLayerKernel.h"
-#include "arm_compute/core/CL/kernels/CLReshapeLayerKernel.h"
-#include "arm_compute/core/CL/kernels/CLReverseKernel.h"
-#include "arm_compute/core/CL/kernels/CLScaleKernel.h"
-#include "arm_compute/core/CL/kernels/CLScharr3x3Kernel.h"
-#include "arm_compute/core/CL/kernels/CLSelectKernel.h"
-#include "arm_compute/core/CL/kernels/CLSobel3x3Kernel.h"
-#include "arm_compute/core/CL/kernels/CLSobel5x5Kernel.h"
-#include "arm_compute/core/CL/kernels/CLSobel7x7Kernel.h"
-#include "arm_compute/core/CL/kernels/CLSoftmaxLayerKernel.h"
-#include "arm_compute/core/CL/kernels/CLSpaceToBatchLayerKernel.h"
-#include "arm_compute/core/CL/kernels/CLSpaceToDepthLayerKernel.h"
-#include "arm_compute/core/CL/kernels/CLStackLayerKernel.h"
-#include "arm_compute/core/CL/kernels/CLStridedSliceKernel.h"
-#include "arm_compute/core/CL/kernels/CLTableLookupKernel.h"
-#include "arm_compute/core/CL/kernels/CLThresholdKernel.h"
-#include "arm_compute/core/CL/kernels/CLTileKernel.h"
-#include "arm_compute/core/CL/kernels/CLTransposeKernel.h"
-#include "arm_compute/core/CL/kernels/CLUpsampleLayerKernel.h"
-#include "arm_compute/core/CL/kernels/CLWarpAffineKernel.h"
-#include "arm_compute/core/CL/kernels/CLWarpPerspectiveKernel.h"
-#include "arm_compute/core/CL/kernels/CLWeightsReshapeKernel.h"
-#include "arm_compute/core/CL/kernels/CLWidthConcatenate2TensorsKernel.h"
-#include "arm_compute/core/CL/kernels/CLWidthConcatenate4TensorsKernel.h"
-#include "arm_compute/core/CL/kernels/CLWidthConcatenateLayerKernel.h"
-#include "arm_compute/core/CL/kernels/CLWinogradFilterTransformKernel.h"
-#include "arm_compute/core/CL/kernels/CLWinogradInputTransformKernel.h"
-#include "arm_compute/core/CL/kernels/CLWinogradOutputTransformKernel.h"
-#include "arm_compute/core/CL/kernels/CLYOLOLayerKernel.h"
-#include "arm_compute/core/CL/kernels/ICLDepthwiseConvolutionLayer3x3Kernel.h"
-
-#endif /* ARM_COMPUTE_CLKERNELS_H */
diff --git a/arm_compute/core/CL/CLTypes.h b/arm_compute/core/CL/CLTypes.h
index 3643b178d3..0f088e2b10 100644
--- a/arm_compute/core/CL/CLTypes.h
+++ b/arm_compute/core/CL/CLTypes.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -41,7 +41,8 @@ enum class CLVersion
     CL10,   /* the OpenCL 1.0 */
     CL11,   /* the OpenCL 1.1 */
     CL12,   /* the OpenCL 1.2 */
-    CL20,   /* the OpenCL 2.0 and above */
+    CL20,   /* the OpenCL 2.x */
+    CL30,   /* the OpenCL 3.x */
     UNKNOWN /* unkown version */
 };
 
@@ -62,18 +63,27 @@ struct CLDeviceOptions
 struct CLQuantization
 {
     /** Default Constructor */
-    CLQuantization()
-        : scale(nullptr), offset(nullptr) {};
+    CLQuantization() : scale(nullptr), offset(nullptr){};
     /** Constructor
      *
      * @param[in] scale  OpenCL scale array
      * @param[in] offset OpenCL offset array
      */
-    CLQuantization(const ICLFloatArray *scale, const ICLInt32Array *offset)
-        : scale(scale), offset(offset) {};
+    CLQuantization(const ICLFloatArray *scale, const ICLInt32Array *offset) : scale(scale), offset(offset){};
 
     const ICLFloatArray *scale;  /**< Quantization scale array */
     const ICLInt32Array *offset; /**< Quantization offset array */
 };
+
+enum CLKernelType
+{
+    UNKNOWN,     /**< Unknown CL kernel type */
+    DEPTHWISE,   /**< Depthwise CL kernel type */
+    DIRECT,      /**< Direct Convolution CL kernel type */
+    ELEMENTWISE, /**< Elementwise CL kernel type */
+    GEMM,        /**< GEMM CL kernel type */
+    POOL,        /**< Pool CL kernel type */
+    WINOGRAD     /**< Winograd CL kernel type */
+};
 } // namespace arm_compute
 #endif /* ARM_COMPUTE_CL_TYPES_H */
diff --git a/arm_compute/core/CL/CLValidate.h b/arm_compute/core/CL/CLValidate.h
deleted file mode 100644
index 8f1733dcfe..0000000000
--- a/arm_compute/core/CL/CLValidate.h
+++ /dev/null
@@ -1,61 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_VALIDATE_H
-#define ARM_COMPUTE_CL_VALIDATE_H
-
-#include "arm_compute/core/Validate.h"
-
-namespace arm_compute
-{
-#define ARM_COMPUTE_ERROR_ON_F16_UNSUPPORTED(tensor) \
-    ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_unsupported_fp16(__func__, __FILE__, __LINE__, tensor, CLKernelLibrary::get().fp16_supported()))
-
-#define ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(tensor) \
-    ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_unsupported_fp16(__func__, __FILE__, __LINE__, tensor, CLKernelLibrary::get().fp16_supported()))
-
-/** Return an error if int64_base_atomics extension is not supported by the device.
- *
- * @param[in] function Function in which the error occurred.
- * @param[in] file     Name of the file where the error occurred.
- * @param[in] line     Line on which the error occurred.
- *
- * @return Status
- */
-inline arm_compute::Status error_on_unsupported_int64_base_atomics(const char *function, const char *file, const int line)
-{
-    if(!CLKernelLibrary::get().int64_base_atomics_supported())
-    {
-        return ARM_COMPUTE_CREATE_ERROR_LOC(arm_compute::ErrorCode::UNSUPPORTED_EXTENSION_USE, function, file, line, "Atomic functions are not supported");
-    }
-    return arm_compute::Status{};
-}
-
-#define ARM_COMPUTE_ERROR_ON_INT64_BASE_ATOMICS_UNSUPPORTED() \
-    ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_unsupported_int64_base_atomics(__func__, __FILE__, __LINE__));
-
-#define ARM_COMPUTE_RETURN_ERROR_ON_INT64_BASE_ATOMICS_UNSUPPORTED() \
-    ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_unsupported_int64_base_atomics(__func__, __FILE__, __LINE__));
-
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_VALIDATE_H */
diff --git a/arm_compute/core/CL/ICLArray.h b/arm_compute/core/CL/ICLArray.h
index e11fb95bf8..a2b2baa5b3 100644
--- a/arm_compute/core/CL/ICLArray.h
+++ b/arm_compute/core/CL/ICLArray.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -40,8 +40,7 @@ public:
      * @param[in] max_num_values Maximum size of the array.
      *
      */
-    explicit ICLArray(size_t max_num_values)
-        : IArray<T>(max_num_values), _mapping(nullptr)
+    explicit ICLArray(size_t max_num_values) : IArray<T>(max_num_values), _mapping(nullptr)
     {
     }
 
@@ -66,8 +65,6 @@ public:
      * @param[in]     blocking If true, then the mapping will be ready to use by the time
      *                         this method returns, else it is the caller's responsibility
      *                         to flush the queue and wait for the mapping operation to have completed before using the returned mapping pointer.
-     *
-     * @return The mapping address.
      */
     void map(cl::CommandQueue &q, bool blocking = true)
     {
@@ -115,14 +112,6 @@ private:
     uint8_t *_mapping;
 };
 
-/** Interface for OpenCL Array of Key Points. */
-using ICLKeyPointArray = ICLArray<KeyPoint>;
-/** Interface for OpenCL Array of 2D Coordinates. */
-using ICLCoordinates2DArray = ICLArray<Coordinates2D>;
-/** Interface for OpenCL Array of Detection Windows. */
-using ICLDetectionWindowArray = ICLArray<DetectionWindow>;
-/** Interface for OpenCL Array of 2D Sizes. */
-using ICLSize2DArray = ICLArray<Size2D>;
 /** Interface for OpenCL Array of uint8s. */
 using ICLUInt8Array = ICLArray<cl_uchar>;
 /** Interface for OpenCL Array of uint16s. */
@@ -135,5 +124,5 @@ using ICLInt16Array = ICLArray<cl_short>;
 using ICLInt32Array = ICLArray<cl_int>;
 /** Interface for OpenCL Array of floats. */
 using ICLFloatArray = ICLArray<cl_float>;
-}
+} // namespace arm_compute
 #endif /*ARM_COMPUTE_ICLARRAY_H*/
diff --git a/arm_compute/core/CL/ICLDistribution1D.h b/arm_compute/core/CL/ICLDistribution1D.h
deleted file mode 100644
index a9bafe3d5a..0000000000
--- a/arm_compute/core/CL/ICLDistribution1D.h
+++ /dev/null
@@ -1,102 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_ICLDISTRIBUTION1D_H
-#define ARM_COMPUTE_ICLDISTRIBUTION1D_H
-
-#include "arm_compute/core/IDistribution1D.h"
-
-#include <cstddef>
-#include <cstdint>
-
-namespace cl
-{
-class Buffer;
-class CommandQueue;
-}
-
-namespace arm_compute
-{
-/** ICLDistribution1D interface class */
-class ICLDistribution1D : public IDistribution1D
-{
-public:
-    /** Constructor: Creates a 1D CLDistribution of a consecutive interval [offset, offset + range - 1]
-     *               defined by a start offset and valid range, divided equally into num_bins parts.
-     *
-     * @param[in] num_bins The number of bins the distribution is divided in.
-     * @param[in] offset   The start of the values to use.
-     * @param[in] range    The total number of the consecutive values of the distribution interval.
-     */
-    ICLDistribution1D(size_t num_bins, int32_t offset, uint32_t range);
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    ICLDistribution1D(const ICLDistribution1D &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    const ICLDistribution1D &operator=(const ICLDistribution1D &) = delete;
-    /** Enqueue a map operation of the allocated buffer on the given queue.
-     *
-     * @param[in,out] q        The CL command queue to use for the mapping operation.
-     * @param[in]     blocking If true, then the mapping will be ready to use by the time
-     *                         this method returns, else it is the caller's responsibility
-     *                         to flush the queue and wait for the mapping operation to have completed before using the returned mapping pointer.
-     */
-    void map(cl::CommandQueue &q, bool blocking = true);
-    /** Enqueue an unmap operation of the allocated and mapped buffer on the given queue.
-     *
-     * @note This method simply enqueues the unmap operation, it is the caller's responsibility to flush the queue and make sure the unmap is finished before
-     *       the memory is accessed by the device.
-     *
-     * @param[in,out] q The CL command queue to use for the mapping operation.
-     */
-    void unmap(cl::CommandQueue &q);
-    /** Interface to be implemented by the child class to return a reference to the OpenCL buffer containing the distribution's data.
-     *
-     * @return A reference to an OpenCL buffer containing the distribution's data.
-     */
-    virtual cl::Buffer &cl_buffer() = 0;
-    // Inherited methods overridden:
-    uint32_t *buffer() const override;
-
-protected:
-    /** Method to be implemented by the child class to map the OpenCL buffer
-     *
-     * @param[in,out] q        The CL command queue to use for the mapping operation.
-     * @param[in]     blocking If true, then the mapping will be ready to use by the time
-     *                         this method returns, else it is the caller's responsibility
-     *                         to flush the queue and wait for the mapping operation to have completed before using the returned mapping pointer.
-     */
-    virtual uint32_t *do_map(cl::CommandQueue &q, bool blocking) = 0;
-    /** Method to be implemented by the child class to unmap the OpenCL buffer
-     *
-     * @note This method simply enqueues the unmap operation, it is the caller's responsibility to flush the queue and make sure the unmap is finished before
-     *       the memory is accessed by the device.
-     *
-     * @param[in,out] q The CL command queue to use for the mapping operation.
-     */
-    virtual void do_unmap(cl::CommandQueue &q) = 0;
-
-protected:
-    uint32_t *_mapping; /**< The distribution data. */
-};
-}
-#endif /* ARM_COMPUTE_ICLDISTRIBUTION1D_H */
diff --git a/arm_compute/core/CL/ICLGEMMKernelConfiguration.h b/arm_compute/core/CL/ICLGEMMKernelConfiguration.h
deleted file mode 100644
index e5f4a78297..0000000000
--- a/arm_compute/core/CL/ICLGEMMKernelConfiguration.h
+++ /dev/null
@@ -1,68 +0,0 @@
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_ICLGEMMKERNELCONFIGURATION_H
-#define ARM_COMPUTE_ICLGEMMKERNELCONFIGURATION_H
-
-#include "arm_compute/core/GPUTarget.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-/** Basic interface for the GEMM kernel configuration */
-class ICLGEMMKernelConfiguration
-{
-public:
-    /** Constructor
-     *
-     * @param[in] arch GPU target
-     */
-    ICLGEMMKernelConfiguration(GPUTarget arch)
-        : _target(arch)
-    {
-    }
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    ICLGEMMKernelConfiguration(const ICLGEMMKernelConfiguration &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    ICLGEMMKernelConfiguration &operator=(const ICLGEMMKernelConfiguration &) = delete;
-    /** Default Move Constructor. */
-    ICLGEMMKernelConfiguration(ICLGEMMKernelConfiguration &&) = default;
-    /** Default move assignment operator */
-    ICLGEMMKernelConfiguration &operator=(ICLGEMMKernelConfiguration &&) = default;
-    /** Virtual destructor */
-    virtual ~ICLGEMMKernelConfiguration() = default;
-    /** Given M, N, K and B, this method returns the @ref GEMMLHSMatrixInfo and @ref GEMMRHSMatrixInfo to be used
-     *
-     * @param[in] m         Number of rows LHS matrix
-     * @param[in] n         Number of columns RHS matrix
-     * @param[in] k         Number of columns LHS matrix or number of rows RHS matrix
-     * @param[in] b         Batch size
-     * @param[in] data_type Data type
-     */
-    virtual std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) = 0;
-
-protected:
-    GPUTarget _target;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_ICLGEMMKERNELCONFIGURATION_H */
diff --git a/arm_compute/core/CL/ICLHOG.h b/arm_compute/core/CL/ICLHOG.h
deleted file mode 100644
index b42566ef11..0000000000
--- a/arm_compute/core/CL/ICLHOG.h
+++ /dev/null
@@ -1,113 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_ICLHOG_H
-#define ARM_COMPUTE_ICLHOG_H
-
-#include "arm_compute/core/IHOG.h"
-
-#include <cstdint>
-
-namespace cl
-{
-class Buffer;
-class CommandQueue;
-}
-
-namespace arm_compute
-{
-/** Interface for OpenCL HOG data-object */
-class ICLHOG : public IHOG
-{
-public:
-    /** Default constructor */
-    ICLHOG();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    ICLHOG(const ICLHOG &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    ICLHOG &operator=(const ICLHOG &) = delete;
-    /** Allow instances of this class to be moved */
-    ICLHOG(ICLHOG &&) = default;
-    /** Allow instances of this class to be moved */
-    ICLHOG &operator=(ICLHOG &&) = default;
-    /** Default destructor */
-    virtual ~ICLHOG() = default;
-
-    /** Interface to be implemented by the child class to return a reference to the OpenCL buffer containing the hog's descriptor
-     *
-     * @return A reference to an OpenCL buffer containing the hog's descriptor
-     */
-    virtual const cl::Buffer &cl_buffer() const = 0;
-
-    /** Enqueue a map operation of the allocated buffer on the given queue.
-     *
-     * @param[in,out] q        The CL command queue to use for the mapping operation.
-     * @param[in]     blocking If true, then the mapping will be ready to use by the time
-     *                         this method returns, else it is the caller's responsibility
-     *                         to flush the queue and wait for the mapping operation to have completed before using the returned mapping pointer.
-     *
-     * @return The mapping address.
-     */
-    void map(cl::CommandQueue &q, bool blocking = true);
-
-    /** Enqueue an unmap operation of the allocated and mapped buffer on the given queue.
-     *
-     * @note This method simply enqueues the unmap operation, it is the caller's responsibility to flush the queue and make sure the unmap is finished before
-     *       the memory is accessed by the device.
-     *
-     * @param[in,out] q The CL command queue to use for the mapping operation.
-     */
-    void unmap(cl::CommandQueue &q);
-
-    /** Interface to be implemented by the child class to free the allocated cl buffer.
-     *
-     * @warning The buffer must have been allocated previously. Otherwise calling the function will fail.
-     */
-    virtual void free() = 0;
-
-    // Inherited methods overridden:
-    float *descriptor() const override;
-
-protected:
-    /** Method to be implemented by the child class to map the OpenCL buffer
-     *
-     * @param[in,out] q        The CL command queue to use for the mapping operation.
-     * @param[in]     blocking If true, then the mapping will be ready to use by the time
-     *                         this method returns, else it is the caller's responsibility
-     *                         to flush the queue and wait for the mapping operation to have completed before using the returned mapping pointer.
-     */
-    virtual uint8_t *do_map(cl::CommandQueue &q, bool blocking) = 0;
-    /** Method to be implemented by the child class to unmap the OpenCL buffer
-     *
-     * @note This method simply enqueues the unmap operation, it is the caller's responsibility to flush the queue and make sure the unmap is finished before
-     *       the memory is accessed by the device.
-     *
-     * @param[in,out] q The CL command queue to use for the mapping operation.
-     */
-    virtual void do_unmap(cl::CommandQueue &q) = 0;
-
-private:
-    uint8_t *_mapping;
-};
-}
-#endif /*ARM_COMPUTE_ICLHOG_H */
diff --git a/arm_compute/core/CL/ICLKernel.h b/arm_compute/core/CL/ICLKernel.h
deleted file mode 100644
index 3e545c61aa..0000000000
--- a/arm_compute/core/CL/ICLKernel.h
+++ /dev/null
@@ -1,387 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_ICLKERNEL_H
-#define ARM_COMPUTE_ICLKERNEL_H
-
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLTypes.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/GPUTarget.h"
-#include "arm_compute/core/IKernel.h"
-
-#include <string>
-
-namespace arm_compute
-{
-template <typename T>
-class ICLArray;
-class ICLTensor;
-class Window;
-
-/** Common interface for all the OpenCL kernels */
-class ICLKernel : public IKernel
-{
-private:
-    /** Returns the number of arguments enqueued per array object.
-     *
-     * @return The number of arguments enqueued per array object.
-     */
-    template <unsigned int        dimension_size>
-    constexpr static unsigned int num_arguments_per_array()
-    {
-        return num_arguments_per_tensor<dimension_size>();
-    }
-    /** Returns the number of arguments enqueued per tensor object.
-     *
-     * @return The number of arguments enqueued per tensor object.
-     */
-    template <unsigned int        dimension_size>
-    constexpr static unsigned int num_arguments_per_tensor()
-    {
-        return 2 + 2 * dimension_size;
-    }
-    using IKernel::configure; //Prevent children from calling IKernel::configure() directly
-protected:
-    /** Configure the kernel's window and local workgroup size hint.
-     *
-     * @param[in] window   The maximum window which will be returned by window()
-     * @param[in] lws_hint (Optional) Local-Workgroup-Size to use.
-     */
-    void configure_internal(const Window &window, cl::NDRange lws_hint = CLKernelLibrary::get().default_ndrange())
-    {
-        _lws_hint = lws_hint;
-        IKernel::configure(window);
-    }
-
-public:
-    /** Constructor */
-    ICLKernel()
-        : _kernel(nullptr), _target(GPUTarget::MIDGARD), _config_id(arm_compute::default_config_id), _max_workgroup_size(0), _lws_hint()
-    {
-    }
-    /** Returns a reference to the OpenCL kernel of this object.
-     *
-     * @return A reference to the OpenCL kernel of this object.
-     */
-    cl::Kernel &kernel()
-    {
-        return _kernel;
-    }
-    /** Add the passed 1D array's parameters to the object's kernel's arguments starting from the index idx.
-     *
-     * @param[in,out] idx            Index at which to start adding the array's arguments. Will be incremented by the number of kernel arguments set.
-     * @param[in]     array          Array to set as an argument of the object's kernel.
-     * @param[in]     strides        @ref Strides object containing stride of each dimension in bytes.
-     * @param[in]     num_dimensions Number of dimensions of the @p array.
-     * @param[in]     window         Window the kernel will be executed on.
-     */
-    template <typename T>
-    void add_1D_array_argument(unsigned int &idx, const ICLArray<T> *array, const Strides &strides, unsigned int num_dimensions, const Window &window)
-    {
-        add_array_argument<T, 1>(idx, array, strides, num_dimensions, window);
-    }
-    /** Add the passed 1D tensor's parameters to the object's kernel's arguments starting from the index idx.
-     *
-     * @param[in,out] idx    Index at which to start adding the tensor's arguments. Will be incremented by the number of kernel arguments set.
-     * @param[in]     tensor Tensor to set as an argument of the object's kernel.
-     * @param[in]     window Window the kernel will be executed on.
-     */
-    void add_1D_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window)
-    {
-        add_tensor_argument<1>(idx, tensor, window);
-    }
-    /** Add the passed 1D tensor's parameters to the object's kernel's arguments starting from the index idx if the condition is true.
-     *
-     * @param[in]     cond   Condition to check
-     * @param[in,out] idx    Index at which to start adding the tensor's arguments. Will be incremented by the number of kernel arguments set.
-     * @param[in]     tensor Tensor to set as an argument of the object's kernel.
-     * @param[in]     window Window the kernel will be executed on.
-     */
-    void add_1D_tensor_argument_if(bool cond, unsigned int &idx, const ICLTensor *tensor, const Window &window)
-    {
-        if(cond)
-        {
-            add_1D_tensor_argument(idx, tensor, window);
-        }
-    }
-    /** Add the passed 2D tensor's parameters to the object's kernel's arguments starting from the index idx.
-     *
-     * @param[in,out] idx    Index at which to start adding the tensor's arguments. Will be incremented by the number of kernel arguments set.
-     * @param[in]     tensor Tensor to set as an argument of the object's kernel.
-     * @param[in]     window Window the kernel will be executed on.
-     */
-    void add_2D_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window)
-    {
-        add_tensor_argument<2>(idx, tensor, window);
-    }
-    /** Add the passed 2D tensor's parameters to the object's kernel's arguments starting from the index idx if the condition is true.
-     *
-     * @param[in]     cond   Condition to check
-     * @param[in,out] idx    Index at which to start adding the tensor's arguments. Will be incremented by the number of kernel arguments set.
-     * @param[in]     tensor Tensor to set as an argument of the object's kernel.
-     * @param[in]     window Window the kernel will be executed on.
-     */
-    void add_2D_tensor_argument_if(bool cond, unsigned int &idx, const ICLTensor *tensor, const Window &window)
-    {
-        if(cond)
-        {
-            add_2D_tensor_argument(idx, tensor, window);
-        }
-    }
-    /** Add the passed 3D tensor's parameters to the object's kernel's arguments starting from the index idx.
-     *
-     * @param[in,out] idx    Index at which to start adding the tensor's arguments. Will be incremented by the number of kernel arguments set.
-     * @param[in]     tensor Tensor to set as an argument of the object's kernel.
-     * @param[in]     window Window the kernel will be executed on.
-     */
-    void add_3D_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window)
-    {
-        add_tensor_argument<3>(idx, tensor, window);
-    }
-    /** Add the passed 4D tensor's parameters to the object's kernel's arguments starting from the index idx.
-     *
-     * @param[in,out] idx    Index at which to start adding the tensor's arguments. Will be incremented by the number of kernel arguments set.
-     * @param[in]     tensor Tensor to set as an argument of the object's kernel.
-     * @param[in]     window Window the kernel will be executed on.
-     */
-    void add_4D_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window)
-    {
-        add_tensor_argument<4>(idx, tensor, window);
-    }
-    /** Returns the number of arguments enqueued per 1D array object.
-     *
-     * @return The number of arguments enqueues per 1D array object.
-     */
-    constexpr static unsigned int num_arguments_per_1D_array()
-    {
-        return num_arguments_per_array<1>();
-    }
-    /** Returns the number of arguments enqueued per 1D tensor object.
-     *
-     * @return The number of arguments enqueues per 1D tensor object.
-     */
-    constexpr static unsigned int num_arguments_per_1D_tensor()
-    {
-        return num_arguments_per_tensor<1>();
-    }
-    /** Returns the number of arguments enqueued per 2D tensor object.
-     *
-     * @return The number of arguments enqueues per 2D tensor object.
-     */
-    constexpr static unsigned int num_arguments_per_2D_tensor()
-    {
-        return num_arguments_per_tensor<2>();
-    }
-    /** Returns the number of arguments enqueued per 3D tensor object.
-     *
-     * @return The number of arguments enqueues per 3D tensor object.
-     */
-    constexpr static unsigned int num_arguments_per_3D_tensor()
-    {
-        return num_arguments_per_tensor<3>();
-    }
-    /** Returns the number of arguments enqueued per 4D tensor object.
-     *
-     * @return The number of arguments enqueues per 4D tensor object.
-     */
-    constexpr static unsigned int num_arguments_per_4D_tensor()
-    {
-        return num_arguments_per_tensor<4>();
-    }
-    /** Enqueue the OpenCL kernel to process the given window  on the passed OpenCL command queue.
-     *
-     * @note The queue is *not* flushed by this method, and therefore the kernel will not have been executed by the time this method returns.
-     *
-     * @param[in]     window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
-     * @param[in,out] queue  Command queue on which to enqueue the kernel.
-     */
-    virtual void run(const Window &window, cl::CommandQueue &queue) = 0;
-    /** Add the passed parameters to the object's kernel's arguments starting from the index idx.
-     *
-     * @param[in,out] idx   Index at which to start adding the arguments. Will be incremented by the number of kernel arguments set.
-     * @param[in]     value Value to set as an argument of the object's kernel.
-     */
-    template <typename T>
-    void add_argument(unsigned int &idx, T value)
-    {
-        _kernel.setArg(idx++, value);
-    }
-
-    /** Set the Local-Workgroup-Size hint
-     *
-     * @note This method should be called after the configuration of the kernel
-     *
-     * @param[in] lws_hint Local-Workgroup-Size to use
-     */
-    void set_lws_hint(const cl::NDRange &lws_hint)
-    {
-        ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); // lws_hint will be overwritten by configure()
-        _lws_hint = lws_hint;
-    }
-
-    /** Return the Local-Workgroup-Size hint
-     *
-     * @return Current lws hint
-     */
-    cl::NDRange lws_hint() const
-    {
-        return _lws_hint;
-    }
-
-    /** Get the configuration ID
-     *
-     * @note The configuration ID can be used by the caller to distinguish different calls of the same OpenCL kernel
-     *       In particular, this method can be used by CLScheduler to keep track of the best LWS for each configuration of the same kernel.
-     *       The configuration ID should be provided only for the kernels potentially affected by the LWS geometry
-     *
-     * @note This method should be called after the configuration of the kernel
-     *
-     * @return configuration id string
-     */
-    const std::string &config_id() const
-    {
-        return _config_id;
-    }
-
-    /** Set the targeted GPU architecture
-     *
-     * @param[in] target The targeted GPU architecture
-     */
-    void set_target(GPUTarget target)
-    {
-        _target = target;
-    }
-
-    /** Set the targeted GPU architecture according to the CL device
-     *
-     * @param[in] device A CL device
-     */
-    void set_target(cl::Device &device);
-
-    /** Get the targeted GPU architecture
-     *
-     * @return The targeted GPU architecture.
-     */
-    GPUTarget get_target() const
-    {
-        return _target;
-    }
-
-    /** Get the maximum workgroup size for the device the CLKernelLibrary uses.
-     *
-     * @return The maximum workgroup size value.
-     */
-    size_t get_max_workgroup_size();
-    /** Get the global work size given an execution window
-     *
-     * @param[in] window Execution window
-     *
-     * @return Global work size of the given execution window
-     */
-    static cl::NDRange gws_from_window(const Window &window);
-
-private:
-    /** Add the passed array's parameters to the object's kernel's arguments starting from the index idx.
-     *
-     * @param[in,out] idx            Index at which to start adding the array's arguments. Will be incremented by the number of kernel arguments set.
-     * @param[in]     array          Array to set as an argument of the object's kernel.
-     * @param[in]     strides        @ref Strides object containing stride of each dimension in bytes.
-     * @param[in]     num_dimensions Number of dimensions of the @p array.
-     * @param[in]     window         Window the kernel will be executed on.
-     */
-    template <typename T, unsigned int dimension_size>
-    void add_array_argument(unsigned int &idx, const ICLArray<T> *array, const Strides &strides, unsigned int num_dimensions, const Window &window);
-    /** Add the passed tensor's parameters to the object's kernel's arguments starting from the index idx.
-     *
-     * @param[in,out] idx    Index at which to start adding the tensor's arguments. Will be incremented by the number of kernel arguments set.
-     * @param[in]     tensor Tensor to set as an argument of the object's kernel.
-     * @param[in]     window Window the kernel will be executed on.
-     */
-    template <unsigned int dimension_size>
-    void add_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window);
-
-protected:
-    cl::Kernel  _kernel;             /**< OpenCL kernel to run */
-    GPUTarget   _target;             /**< The targeted GPU */
-    std::string _config_id;          /**< Configuration ID */
-    size_t      _max_workgroup_size; /**< The maximum workgroup size for this kernel */
-private:
-    cl::NDRange _lws_hint; /**< Local workgroup size hint for the OpenCL kernel */
-};
-
-/** Add the kernel to the command queue with the given window.
- *
- * @note Depending on the size of the window, this might translate into several jobs being enqueued.
- *
- * @note If kernel->kernel() is empty then the function will return without adding anything to the queue.
- *
- * @param[in,out] queue                OpenCL command queue.
- * @param[in]     kernel               Kernel to enqueue
- * @param[in]     window               Window the kernel has to process.
- * @param[in]     lws_hint             (Optional) Local workgroup size requested. Default is based on the device target.
- * @param[in]     use_dummy_work_items (Optional) Use dummy work items in order to have two dimensional power of two NDRange. Default is false
- *                                     Note: it is kernel responsibility to check if the work-item is out-of-range
- *
- * @note If any dimension of the lws is greater than the global workgroup size then no lws will be passed.
- */
-void enqueue(cl::CommandQueue &queue, ICLKernel &kernel, const Window &window, const cl::NDRange &lws_hint = CLKernelLibrary::get().default_ndrange(), bool use_dummy_work_items = false);
-
-/** Add the passed array's parameters to the object's kernel's arguments starting from the index idx.
- *
- * @param[in,out] idx            Index at which to start adding the array's arguments. Will be incremented by the number of kernel arguments set.
- * @param[in]     array          Array to set as an argument of the object's kernel.
- * @param[in]     strides        @ref Strides object containing stride of each dimension in bytes.
- * @param[in]     num_dimensions Number of dimensions of the @p array.
- * @param[in]     window         Window the kernel will be executed on.
- */
-template <typename T, unsigned int dimension_size>
-void ICLKernel::add_array_argument(unsigned &idx, const ICLArray<T> *array, const Strides &strides, unsigned int num_dimensions, const Window &window)
-{
-    ARM_COMPUTE_ERROR_ON(array == nullptr);
-
-    // Calculate offset to the start of the window
-    unsigned int offset_first_element = 0;
-
-    for(unsigned int n = 0; n < num_dimensions; ++n)
-    {
-        offset_first_element += window[n].start() * strides[n];
-    }
-
-    unsigned int idx_start = idx;
-    _kernel.setArg(idx++, array->cl_buffer());
-
-    for(unsigned int dimension = 0; dimension < dimension_size; dimension++)
-    {
-        _kernel.setArg<cl_uint>(idx++, strides[dimension]);
-        _kernel.setArg<cl_uint>(idx++, strides[dimension] * window[dimension].step());
-    }
-
-    _kernel.setArg<cl_uint>(idx++, offset_first_element);
-
-    ARM_COMPUTE_ERROR_ON_MSG_VAR(idx_start + num_arguments_per_array<dimension_size>() != idx,
-                                 "add_%dD_array_argument() is supposed to add exactly %d arguments to the kernel", dimension_size, num_arguments_per_array<dimension_size>());
-    ARM_COMPUTE_UNUSED(idx_start);
-}
-}
-#endif /*ARM_COMPUTE_ICLKERNEL_H */
diff --git a/arm_compute/core/CL/ICLLut.h b/arm_compute/core/CL/ICLLut.h
deleted file mode 100644
index 430adb8727..0000000000
--- a/arm_compute/core/CL/ICLLut.h
+++ /dev/null
@@ -1,94 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_ICLLUT_H
-#define ARM_COMPUTE_ICLLUT_H
-
-#include "arm_compute/core/ILut.h"
-
-#include <cstdint>
-
-namespace cl
-{
-class Buffer;
-class CommandQueue;
-}
-
-namespace arm_compute
-{
-/** Interface for OpenCL LUT */
-class ICLLut : public ILut
-{
-public:
-    ICLLut();
-    ICLLut(const ICLLut &) = delete;
-    ICLLut &operator=(const ICLLut &) = delete;
-
-    /** Interface to be implemented by the child class to return a reference to the OpenCL buffer containing the lut's data.
-     *
-     * @return A reference to an OpenCL buffer containing the lut's data.
-     */
-    virtual const cl::Buffer &cl_buffer() const = 0;
-    /** Enqueue a map operation of the allocated buffer on the given queue.
-     *
-     * @param[in,out] q        The CL command queue to use for the mapping operation.
-     * @param[in]     blocking If true, then the mapping will be ready to use by the time
-     *                         this method returns, else it is the caller's responsibility
-     *                         to flush the queue and wait for the mapping operation to have completed before using the returned mapping pointer.
-     */
-    void map(cl::CommandQueue &q, bool blocking = true);
-    /** Enqueue an unmap operation of the allocated and mapped buffer on the given queue.
-     *
-     * @note This method simply enqueues the unmap operation, it is the caller's responsibility to flush the queue and make sure the unmap is finished before
-     *       the memory is accessed by the device.
-     *
-     * @param[in,out] q The CL command queue to use for the mapping operation.
-     */
-    void unmap(cl::CommandQueue &q);
-
-    // Inherited methods overridden:
-    uint8_t *buffer() const override;
-
-protected:
-    /** Method to be implemented by the child class to map the OpenCL buffer
-     *
-     * @param[in,out] q        The CL command queue to use for the mapping operation.
-     * @param[in]     blocking If true, then the mapping will be ready to use by the time
-     *                         this method returns, else it is the caller's responsibility
-     *                         to flush the queue and wait for the mapping operation to have completed before using the returned mapping pointer.
-     */
-    virtual uint8_t *do_map(cl::CommandQueue &q, bool blocking) = 0;
-    /** Method to be implemented by the child class to unmap the OpenCL buffer
-     *
-     * @note This method simply enqueues the unmap operation, it is the caller's responsibility to flush the queue and make sure the unmap is finished before
-     *       the memory is accessed by the device.
-     *
-     * @param[in,out] q The CL command queue to use for the mapping operation.
-     */
-    virtual void do_unmap(cl::CommandQueue &q) = 0;
-
-private:
-    uint8_t *_mapping;
-};
-}
-#endif /*ARM_COMPUTE_ICLLUT_H */
diff --git a/arm_compute/core/CL/ICLMultiHOG.h b/arm_compute/core/CL/ICLMultiHOG.h
deleted file mode 100644
index f9213018a2..0000000000
--- a/arm_compute/core/CL/ICLMultiHOG.h
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_ICLMULTIHOG_H
-#define ARM_COMPUTE_ICLMULTIHOG_H
-
-#include "arm_compute/core/CL/ICLHOG.h"
-#include "arm_compute/core/IMultiHOG.h"
-
-namespace arm_compute
-{
-/** Interface for storing multiple HOG data-objects */
-class ICLMultiHOG : public IMultiHOG
-{
-public:
-    /** Return a pointer to the requested OpenCL HOG model
-     *
-     * @param[in] index The index of the wanted OpenCL HOG model.
-     *
-     *  @return A pointer pointed to the HOG model
-     */
-    virtual ICLHOG *cl_model(size_t index) = 0;
-    /** Return a constant pointer to the requested OpenCL HOG model
-     *
-     * @param[in] index The index of the wanted OpenCL HOG model.
-     *
-     *  @return A constant pointer pointed to the OpenCL HOG model
-     */
-    virtual const ICLHOG *cl_model(size_t index) const = 0;
-
-    // Inherited methods overridden:
-    IHOG *model(size_t index) override;
-    const IHOG *model(size_t index) const override;
-};
-}
-#endif /*ARM_COMPUTE_ICLMULTIHOG_H */
diff --git a/arm_compute/core/CL/ICLMultiImage.h b/arm_compute/core/CL/ICLMultiImage.h
deleted file mode 100644
index 0233600e73..0000000000
--- a/arm_compute/core/CL/ICLMultiImage.h
+++ /dev/null
@@ -1,59 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_ICLMULTIIMAGE_H
-#define ARM_COMPUTE_ICLMULTIIMAGE_H
-
-#include "arm_compute/core/IMultiImage.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-/** Interface for OpenCL images */
-using ICLImage = ICLTensor;
-
-/** Interface for OpenCL multi-planar images */
-class ICLMultiImage : public IMultiImage
-{
-public:
-    /** Return a pointer to the requested OpenCL plane of the image.
-     *
-     * @param[in] index The index of the wanted planed.
-     *
-     * @return A pointer pointed to the OpenCL plane
-     */
-    virtual ICLImage *cl_plane(unsigned int index) = 0;
-    /** Return a constant pointer to the requested OpenCL plane of the image.
-     *
-     * @param[in] index The index of the wanted planed.
-     *
-     * @return A constant pointer pointed to the OpenCL plane
-     */
-    virtual const ICLImage *cl_plane(unsigned int index) const = 0;
-
-    // Inherited methods overridden:
-    IImage *plane(unsigned int index) override;
-    const IImage *plane(unsigned int index) const override;
-};
-}
-#endif /*ARM_COMPUTE_ICLMULTIIMAGE_H */
diff --git a/arm_compute/core/CL/ICLSimple2DKernel.h b/arm_compute/core/CL/ICLSimple2DKernel.h
deleted file mode 100644
index bd423303bb..0000000000
--- a/arm_compute/core/CL/ICLSimple2DKernel.h
+++ /dev/null
@@ -1,41 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_ICLSIMPLE2DKERNEL_H
-#define ARM_COMPUTE_ICLSIMPLE2DKERNEL_H
-
-#include "arm_compute/core/CL/ICLSimpleKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for simple OpenCL kernels having 1 tensor input and 1 tensor output. This interface can be used when the work-item processes a 2D tile */
-class ICLSimple2DKernel : public ICLSimpleKernel
-{
-public:
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-};
-}
-#endif /*ARM_COMPUTE_ICLSIMPLE2DKERNEL_H */
diff --git a/arm_compute/core/CL/ICLSimple3DKernel.h b/arm_compute/core/CL/ICLSimple3DKernel.h
deleted file mode 100644
index e25051f578..0000000000
--- a/arm_compute/core/CL/ICLSimple3DKernel.h
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_ICLSIMPLE3DKERNEL_H
-#define ARM_COMPUTE_ICLSIMPLE3DKERNEL_H
-
-#include "arm_compute/core/CL/ICLSimple2DKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for simple OpenCL kernels having 1 tensor input and 1 tensor output.
- *  Both input tensor and output tensor must have at least 3 dimensions.
- */
-class ICLSimple3DKernel : public ICLSimple2DKernel
-{
-public:
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-};
-}
-#endif /*ARM_COMPUTE_ICLSIMPLE3DKERNEL_H */
diff --git a/arm_compute/core/CL/ICLSimpleKernel.h b/arm_compute/core/CL/ICLSimpleKernel.h
deleted file mode 100644
index e8b6f0a81c..0000000000
--- a/arm_compute/core/CL/ICLSimpleKernel.h
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_ICLSIMPLEKERNEL_H
-#define ARM_COMPUTE_ICLSIMPLEKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Helpers.h"
-
-namespace arm_compute
-{
-/** Interface for simple OpenCL kernels having 1 tensor input and 1 tensor output */
-class ICLSimpleKernel : public ICLKernel
-{
-public:
-    /** Constructor. */
-    ICLSimpleKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    ICLSimpleKernel(const ICLSimpleKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    ICLSimpleKernel &operator=(const ICLSimpleKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    ICLSimpleKernel(ICLSimpleKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    ICLSimpleKernel &operator=(ICLSimpleKernel &&) = default;
-    /** Default destructor */
-    ~ICLSimpleKernel() = default;
-
-    /** Configure the kernel
-     *
-     * @param[in]  input                             Source tensor.
-     * @param[out] output                            Destination tensor.
-     * @param[in]  num_elems_processed_per_iteration Number of processed elements per iteration.
-     * @param[in]  border_undefined                  (Optional) True if the border mode is undefined. False if it's replicate or constant.
-     * @param[in]  border_size                       (Optional) Size of the border.
-     */
-    void configure(const ICLTensor *input, ICLTensor *output, unsigned int num_elems_processed_per_iteration, bool border_undefined = false, const BorderSize &border_size = BorderSize());
-
-protected:
-    const ICLTensor *_input;
-    ICLTensor       *_output;
-};
-}
-
-#endif /*ARM_COMPUTE_ICLSIMPLEKERNEL_H */
diff --git a/arm_compute/core/CL/ICLTensor.h b/arm_compute/core/CL/ICLTensor.h
index 001f892231..8de5423762 100644
--- a/arm_compute/core/CL/ICLTensor.h
+++ b/arm_compute/core/CL/ICLTensor.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,9 +24,8 @@
 #ifndef ARM_COMPUTE_ICLTENSOR_H
 #define ARM_COMPUTE_ICLTENSOR_H
 
-#include "arm_compute/core/ITensor.h"
-
 #include "arm_compute/core/CL/CLTypes.h"
+#include "arm_compute/core/ITensor.h"
 
 #include <cstdint>
 
@@ -34,7 +33,7 @@ namespace cl
 {
 class Buffer;
 class CommandQueue;
-}
+} // namespace cl
 
 namespace arm_compute
 {
@@ -71,8 +70,6 @@ public:
      * @param[in]     blocking If true, then the mapping will be ready to use by the time
      *                         this method returns, else it is the caller's responsibility
      *                         to flush the queue and wait for the mapping operation to have completed before using the returned mapping pointer.
-     *
-     * @return The mapping address.
      */
     void map(cl::CommandQueue &q, bool blocking = true);
     /** Enqueue an unmap operation of the allocated and mapped buffer on the given queue.
@@ -115,5 +112,5 @@ private:
 };
 
 using ICLImage = ICLTensor;
-}
+} // namespace arm_compute
 #endif /*ARM_COMPUTE_ICLTENSOR_H */
diff --git a/arm_compute/core/CL/OpenCL.h b/arm_compute/core/CL/OpenCL.h
index 72cbb3d2b2..8b5bf97099 100644
--- a/arm_compute/core/CL/OpenCL.h
+++ b/arm_compute/core/CL/OpenCL.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_OPENCL_H
-#define ARM_COMPUTE_OPENCL_H
+#ifndef ACL_ARM_COMPUTE_CORE_CL_OPENCL_H
+#define ACL_ARM_COMPUTE_CORE_CL_OPENCL_H
 
 #include <string>
 #include <utility>
@@ -31,8 +31,8 @@
 #ifndef ARM_COMPUTE_NO_EXCEPTIONS
 #define CL_HPP_ENABLE_EXCEPTIONS
 #endif // ARM_COMPUTE_NO_EXCEPTIONS
-#define CL_TARGET_OPENCL_VERSION 200
-#define CL_HPP_TARGET_OPENCL_VERSION 110
+#define CL_TARGET_OPENCL_VERSION      300
+#define CL_HPP_TARGET_OPENCL_VERSION  110
 #define CL_HPP_MINIMUM_OPENCL_VERSION 110
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Weffc++"
@@ -40,8 +40,8 @@
 #pragma GCC diagnostic ignored "-Wunused-parameter"
 #if defined(__GNUG__) && __GNUG__ >= 8
 #pragma GCC diagnostic ignored "-Wcatch-value"
-#endif // defined(__GNUG__) && __GNUG__ >= 8
-#include <CL/cl2.hpp>
+#endif                   // defined(__GNUG__) && __GNUG__ >= 8
+#include <CL/opencl.hpp> // include new hpp header instead of cl2.hpp
 #pragma GCC diagnostic pop
 
 namespace cl
@@ -73,25 +73,27 @@ public:
      * @return The static instance of CLSymbols.
      */
     static CLSymbols &get();
-    /** Load symbols from the given OpenCL library path.
+    /** This method attempts to load the OpenCL symbols from the first available library from the provided OpenCL libraries.
      *
-     * @param[in] library Path to the OpenCL library.
+     * @param[in] libraries_filenames Vector containing the filenames of the libraries to be loaded.
+     * @param[in] use_loader          Use symbol loader function loadOpenCLPointer.
      *
-     * @return True if loading the library is successful.
+     * @return True if loading the library is successful. False if all the provided libraries could not be loaded.
      */
-    bool load(const std::string &library);
+    bool load(const std::vector<std::string> &libraries_filenames, bool use_loader = false);
     /** Load symbols from any of the default OpenCL library names.
+     *  If all the default libraries could not be loaded, this method will print a warning message and return false.
      *
      * @return True if loading any library is successful.
      */
     bool load_default();
 
-#define DECLARE_FUNCTION_PTR(func_name) \
-    std::function<decltype(func_name)> func_name##_ptr = nullptr
+#define DECLARE_FUNCTION_PTR(func_name) std::function<decltype(func_name)> func_name##_ptr = nullptr
 
     DECLARE_FUNCTION_PTR(clCreateContext);
     DECLARE_FUNCTION_PTR(clCreateContextFromType);
     DECLARE_FUNCTION_PTR(clCreateCommandQueue);
+    DECLARE_FUNCTION_PTR(clCreateCommandQueueWithProperties);
     DECLARE_FUNCTION_PTR(clGetContextInfo);
     DECLARE_FUNCTION_PTR(clBuildProgram);
     DECLARE_FUNCTION_PTR(clEnqueueNDRangeKernel);
@@ -123,6 +125,7 @@ public:
     DECLARE_FUNCTION_PTR(clGetDeviceIDs);
     DECLARE_FUNCTION_PTR(clGetMemObjectInfo);
     DECLARE_FUNCTION_PTR(clRetainEvent);
+    DECLARE_FUNCTION_PTR(clGetPlatformInfo);
     DECLARE_FUNCTION_PTR(clGetPlatformIDs);
     DECLARE_FUNCTION_PTR(clGetKernelWorkGroupInfo);
     DECLARE_FUNCTION_PTR(clGetCommandQueueInfo);
@@ -135,6 +138,18 @@ public:
     DECLARE_FUNCTION_PTR(clEnqueueMarker);
     DECLARE_FUNCTION_PTR(clWaitForEvents);
     DECLARE_FUNCTION_PTR(clCreateImage);
+    DECLARE_FUNCTION_PTR(clSetKernelExecInfo);
+    DECLARE_FUNCTION_PTR(clGetExtensionFunctionAddressForPlatform);
+
+    // Command buffer and mutable dispatch command buffer extensions
+    DECLARE_FUNCTION_PTR(clCreateCommandBufferKHR);
+    DECLARE_FUNCTION_PTR(clRetainCommandBufferKHR);
+    DECLARE_FUNCTION_PTR(clReleaseCommandBufferKHR);
+    DECLARE_FUNCTION_PTR(clFinalizeCommandBufferKHR);
+    DECLARE_FUNCTION_PTR(clEnqueueCommandBufferKHR);
+    DECLARE_FUNCTION_PTR(clCommandNDRangeKernelKHR);
+
+    DECLARE_FUNCTION_PTR(clUpdateMutableCommandsKHR);
 
     // Third-party extensions
     DECLARE_FUNCTION_PTR(clImportMemoryARM);
@@ -145,4 +160,4 @@ private:
     std::pair<bool, bool> _loaded;
 };
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_OPENCL_H */
+#endif // ACL_ARM_COMPUTE_CORE_CL_OPENCL_H
diff --git a/arm_compute/core/CL/gemm/CLGEMMHelpers.h b/arm_compute/core/CL/gemm/CLGEMMHelpers.h
deleted file mode 100644
index dcda732c2d..0000000000
--- a/arm_compute/core/CL/gemm/CLGEMMHelpers.h
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGEMMHELPERS_H
-#define ARM_COMPUTE_CLGEMMHELPERS_H
-
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-namespace cl_gemm
-{
-/** Configure @ref GEMMLHSMatrixInfo and @ref GEMMRHSMatrixInfo
- *
- * @param[in] m              Number of rows (M) in the LHS matrix not reshaped
- * @param[in] n              Number of columns (N) in the RHS matrix not reshaped
- * @param[in] m0             Number of rows processed by each thread/work-item
- * @param[in] n0             Number of columns processed by each thread/work-item
- * @param[in] k0             Number of inner accumulation performed by each thread/work-item
- * @param[in] v0             Number of vertical blocks of size (m0xk0) stored on the same output row
- * @param[in] h0             Number of horizontal blocks of size (k0xn0) stored on the same output row
- * @param[in] lhs_interleave True if the v0 (m0xk0) blocks have to be interleaved in the output row
- * @param[in] rhs_interleave True if the h0 (k0xn0) blocks have to be interleaved in the output row
- * @param[in] lhs_transpose  True if the (m0xk0) block has to be transposed before been stored
- * @param[in] rhs_transpose  True if the (k0xn0) block has to be transposed before been stored
- *
- * @return @ref GEMMLHSMatrixInfo and @ref GEMMRHSMatrixInfo
- */
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_lhs_rhs_info(unsigned int m, unsigned int n, unsigned int m0, unsigned int n0, unsigned int k0, unsigned int v0, unsigned int h0,
-                                                                       bool lhs_interleave, bool rhs_interleave, bool lhs_transpose, bool rhs_transpose);
-} // namespace cl_gemm
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLGEMMHELPERS_H */
diff --git a/arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfiguration.h b/arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfiguration.h
deleted file mode 100644
index a6341e5094..0000000000
--- a/arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfiguration.h
+++ /dev/null
@@ -1,65 +0,0 @@
-/*
- * Copyright (c) 2019-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGEMMNATIVEKERNELCONFIGURATION_H
-#define ARM_COMPUTE_CLGEMMNATIVEKERNELCONFIGURATION_H
-
-#include "arm_compute/core/CL/ICLGEMMKernelConfiguration.h"
-#include "arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfigurationBifrost.h"
-#include "arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfigurationMidgard.h"
-#include "arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfigurationValhall.h"
-
-#include <memory>
-
-namespace arm_compute
-{
-namespace cl_gemm
-{
-/** CLGEMMNative factory class */
-class CLGEMMNativeKernelConfigurationFactory final
-{
-public:
-    /** Static method to construct CLGEMMNative kernel object accordingly with the GPU target
-     *
-     * @param[in] gpu GPU target
-     *
-     * @return CLGEMMNative kernel configuration class
-     */
-    static std::unique_ptr<ICLGEMMKernelConfiguration> create(GPUTarget gpu)
-    {
-        switch(get_arch_from_target(gpu))
-        {
-            case GPUTarget::MIDGARD:
-                return support::cpp14::make_unique<CLGEMMNativeKernelConfigurationMidgard>(gpu);
-            case GPUTarget::BIFROST:
-                return support::cpp14::make_unique<CLGEMMNativeKernelConfigurationBifrost>(gpu);
-            case GPUTarget::VALHALL:
-                return support::cpp14::make_unique<CLGEMMNativeKernelConfigurationValhall>(gpu);
-            default:
-                ARM_COMPUTE_ERROR("Not supported GPU target");
-        }
-    }
-};
-} // namespace cl_gemm
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLGEMMNATIVEKERNELCONFIGURATION_H */
diff --git a/arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfigurationBifrost.h b/arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfigurationBifrost.h
deleted file mode 100644
index 5b2abe6f0f..0000000000
--- a/arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfigurationBifrost.h
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * Copyright (c) 2019-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGEMMNATIVEKERNELCONFIGURATIONBIFROST_H
-#define ARM_COMPUTE_CLGEMMNATIVEKERNELCONFIGURATIONBIFROST_H
-
-#include "arm_compute/core/CL/ICLGEMMKernelConfiguration.h"
-
-namespace arm_compute
-{
-namespace cl_gemm
-{
-/** Bifrost based OpenCL GEMMNative configuration */
-class CLGEMMNativeKernelConfigurationBifrost final : public ICLGEMMKernelConfiguration
-{
-public:
-    /** Constructor
-     *
-     * @param[in] gpu GPU target
-     */
-    CLGEMMNativeKernelConfigurationBifrost(GPUTarget gpu);
-
-    // Inherited overridden method
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override;
-
-private:
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G71_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G71_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_default_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_default_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-};
-} // namespace cl_gemm
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLGEMMNATIVEKERNELCONFIGURATIONBIFROST_H */
diff --git a/arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfigurationMidgard.h b/arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfigurationMidgard.h
deleted file mode 100644
index 0e95a15613..0000000000
--- a/arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfigurationMidgard.h
+++ /dev/null
@@ -1,51 +0,0 @@
-/*
- * Copyright (c) 2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGEMMNATIVEKERNELCONFIGURATIONMIDGARD_H
-#define ARM_COMPUTE_CLGEMMNATIVEKERNELCONFIGURATIONMIDGARD_H
-
-#include "arm_compute/core/CL/ICLGEMMKernelConfiguration.h"
-
-namespace arm_compute
-{
-namespace cl_gemm
-{
-/** Midgard based OpenCL GEMMNative configuration */
-class CLGEMMNativeKernelConfigurationMidgard final : public ICLGEMMKernelConfiguration
-{
-public:
-    /** Constructor
-     *
-     * @param[in] gpu GPU target
-     */
-    CLGEMMNativeKernelConfigurationMidgard(GPUTarget gpu);
-
-    // Inherited overridden method
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override;
-
-private:
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> default_q8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-};
-} // namespace cl_gemm
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLGEMMNATIVEKERNELCONFIGURATIONMIDGARD_H */
diff --git a/arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfigurationValhall.h b/arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfigurationValhall.h
deleted file mode 100644
index e739997b3a..0000000000
--- a/arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfigurationValhall.h
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * Copyright (c) 2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGEMMNATIVEKERNELCONFIGURATIONVALHALL_H
-#define ARM_COMPUTE_CLGEMMNATIVEKERNELCONFIGURATIONVALHALL_H
-
-#include "arm_compute/core/CL/ICLGEMMKernelConfiguration.h"
-
-namespace arm_compute
-{
-namespace cl_gemm
-{
-/** Valhall based OpenCL GEMMNative configuration */
-class CLGEMMNativeKernelConfigurationValhall final : public ICLGEMMKernelConfiguration
-{
-public:
-    /** Constructor
-     *
-     * @param[in] gpu GPU target
-     */
-    CLGEMMNativeKernelConfigurationValhall(GPUTarget gpu);
-
-    // Inherited overridden method
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override;
-
-private:
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-};
-} // namespace cl_gemm
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLGEMMNATIVEKERNELCONFIGURATIONVALHALL_H */
diff --git a/arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfiguration.h b/arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfiguration.h
deleted file mode 100644
index 10dc9aefdb..0000000000
--- a/arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfiguration.h
+++ /dev/null
@@ -1,63 +0,0 @@
-/*
- * Copyright (c) 2019-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGEMMRESHAPEDKERNELCONFIGURATION_H
-#define ARM_COMPUTE_CLGEMMRESHAPEDKERNELCONFIGURATION_H
-
-#include "arm_compute/core/CL/ICLGEMMKernelConfiguration.h"
-#include "arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationBifrost.h"
-#include "arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationValhall.h"
-
-#include <memory>
-
-namespace arm_compute
-{
-namespace cl_gemm
-{
-/** CLGEMMReshaped factory class */
-class CLGEMMReshapedKernelConfigurationFactory final
-{
-public:
-    /** Static method to call the CLGEMMReshaped kernel configuration class accordingly with the GPU target
-     *
-     * @param[in] gpu GPU target
-     *
-     * @return CLGEMMReshaped kernel configuration class
-     */
-    static std::unique_ptr<ICLGEMMKernelConfiguration> create(GPUTarget gpu)
-    {
-        switch(get_arch_from_target(gpu))
-        {
-            case GPUTarget::MIDGARD:
-            case GPUTarget::BIFROST:
-                return support::cpp14::make_unique<CLGEMMReshapedKernelConfigurationBifrost>(gpu);
-            case GPUTarget::VALHALL:
-                return support::cpp14::make_unique<CLGEMMReshapedKernelConfigurationValhall>(gpu);
-            default:
-                ARM_COMPUTE_ERROR("Not supported GPU target");
-        }
-    }
-};
-} // namespace cl_gemm
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLGEMMRESHAPEDKERNELCONFIGURATION_H */
diff --git a/arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationBifrost.h b/arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationBifrost.h
deleted file mode 100644
index 55742e3e56..0000000000
--- a/arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationBifrost.h
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * Copyright (c) 2019-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGEMMRESHAPEDKERNELCONFIGURATIONBIFROST_H
-#define ARM_COMPUTE_CLGEMMRESHAPEDKERNELCONFIGURATIONBIFROST_H
-
-#include "arm_compute/core/CL/ICLGEMMKernelConfiguration.h"
-
-namespace arm_compute
-{
-namespace cl_gemm
-{
-/** Bifrost based OpenCL GEMMReshaped configuration */
-class CLGEMMReshapedKernelConfigurationBifrost final : public ICLGEMMKernelConfiguration
-{
-public:
-    /** Constructor
-     *
-     * @param[in] gpu GPU target
-     */
-    CLGEMMReshapedKernelConfigurationBifrost(GPUTarget gpu);
-
-    // Inherited overridden method
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override;
-
-private:
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G7x_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G7x_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G76_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G7x_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-};
-} // namespace cl_gemm
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLGEMMRESHAPEDKERNELCONFIGURATIONBIFROST_H */
diff --git a/arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationValhall.h b/arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationValhall.h
deleted file mode 100644
index e65974144d..0000000000
--- a/arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationValhall.h
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * Copyright (c) 2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGEMMRESHAPEDKERNELCONFIGURATIONVALHALL_H
-#define ARM_COMPUTE_CLGEMMRESHAPEDKERNELCONFIGURATIONVALHALL_H
-
-#include "arm_compute/core/CL/ICLGEMMKernelConfiguration.h"
-
-namespace arm_compute
-{
-namespace cl_gemm
-{
-/** Valhall based OpenCL GEMMReshaped configuration */
-class CLGEMMReshapedKernelConfigurationValhall final : public ICLGEMMKernelConfiguration
-{
-public:
-    /** Constructor
-     *
-     * @param[in] gpu GPU target
-     */
-    CLGEMMReshapedKernelConfigurationValhall(GPUTarget gpu);
-
-    // Inherited overridden method
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override;
-
-private:
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-};
-} // namespace cl_gemm
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLGEMMRESHAPEDKERNELCONFIGURATIONVALHALL_H */
diff --git a/arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfiguration.h b/arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfiguration.h
deleted file mode 100644
index 7909726164..0000000000
--- a/arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfiguration.h
+++ /dev/null
@@ -1,63 +0,0 @@
-/*
- * Copyright (c) 2019-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGEMMRESHAPEDONLYRHSKERNELCONFIGURATION_H
-#define ARM_COMPUTE_CLGEMMRESHAPEDONLYRHSKERNELCONFIGURATION_H
-
-#include "arm_compute/core/CL/ICLGEMMKernelConfiguration.h"
-#include "arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationBifrost.h"
-#include "arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationValhall.h"
-
-#include <memory>
-
-namespace arm_compute
-{
-namespace cl_gemm
-{
-/** CLGEMMReshapedOnlyRHS factory class */
-class CLGEMMReshapedOnlyRHSKernelConfigurationFactory final
-{
-public:
-    /** Static method to call the CLGEMMReshapedOnlyRHS kernel configuration class accordingly with the GPU target
-     *
-     * @param[in] gpu GPU target
-     *
-     * @return CLGEMMReshapedOnlyRHS kernel configuration class
-     */
-    static std::unique_ptr<ICLGEMMKernelConfiguration> create(GPUTarget gpu)
-    {
-        switch(get_arch_from_target(gpu))
-        {
-            case GPUTarget::MIDGARD:
-            case GPUTarget::BIFROST:
-                return support::cpp14::make_unique<CLGEMMReshapedOnlyRHSKernelConfigurationBifrost>(gpu);
-            case GPUTarget::VALHALL:
-                return support::cpp14::make_unique<CLGEMMReshapedOnlyRHSKernelConfigurationValhall>(gpu);
-            default:
-                ARM_COMPUTE_ERROR("Not supported GPU target");
-        }
-    }
-};
-} // namespace cl_gemm
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLGEMMRESHAPEDONLYRHSKERNELCONFIGURATION_H */
diff --git a/arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationBifrost.h b/arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationBifrost.h
deleted file mode 100644
index 044bdc7b18..0000000000
--- a/arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationBifrost.h
+++ /dev/null
@@ -1,59 +0,0 @@
-/*
- * Copyright (c) 2019-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGEMMRESHAPEDONLYRHSKERNELCONFIGURATIONBIFROST_H
-#define ARM_COMPUTE_CLGEMMRESHAPEDONLYRHSKERNELCONFIGURATIONBIFROST_H
-
-#include "arm_compute/core/CL/ICLGEMMKernelConfiguration.h"
-
-namespace arm_compute
-{
-namespace cl_gemm
-{
-/** Bifrost based OpenCL GEMMReshapedOnlyRHS configuration */
-class CLGEMMReshapedOnlyRHSKernelConfigurationBifrost final : public ICLGEMMKernelConfiguration
-{
-public:
-    /** Constructor
-     *
-     * @param[in] gpu GPU target
-     */
-    CLGEMMReshapedOnlyRHSKernelConfigurationBifrost(GPUTarget gpu);
-
-    // Inherited overridden method
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override;
-
-private:
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G7x_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G51_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G7x_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G76_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G51_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G7x_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G51_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-};
-} // namespace cl_gemm
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLGEMMRESHAPEDONLYRHSKERNELCONFIGURATIONBIFROST_H */
diff --git a/arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationValhall.h b/arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationValhall.h
deleted file mode 100644
index 6dba6fdb00..0000000000
--- a/arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationValhall.h
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * Copyright (c) 2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGEMMRESHAPEDONLYRHSKERNELCONFIGURATIONVALHALL_H
-#define ARM_COMPUTE_CLGEMMRESHAPEDONLYRHSKERNELCONFIGURATIONVALHALL_H
-
-#include "arm_compute/core/CL/ICLGEMMKernelConfiguration.h"
-
-namespace arm_compute
-{
-namespace cl_gemm
-{
-/** Valhall based OpenCL GEMMReshapedOnlyRHS configuration */
-class CLGEMMReshapedOnlyRHSKernelConfigurationValhall final : public ICLGEMMKernelConfiguration
-{
-public:
-    /** Constructor
-     *
-     * @param[in] gpu GPU target
-     */
-    CLGEMMReshapedOnlyRHSKernelConfigurationValhall(GPUTarget gpu);
-
-    // Inherited overridden method
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override;
-
-private:
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-};
-} // namespace cl_gemm
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLGEMMRESHAPEDONLYRHSKERNELCONFIGURATIONVALHALL_H */
diff --git a/arm_compute/core/CL/kernels/CLAbsoluteDifferenceKernel.h b/arm_compute/core/CL/kernels/CLAbsoluteDifferenceKernel.h
deleted file mode 100644
index 58dea3bdae..0000000000
--- a/arm_compute/core/CL/kernels/CLAbsoluteDifferenceKernel.h
+++ /dev/null
@@ -1,79 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLABSOLUTEDIFFERENCEKERNEL_H
-#define ARM_COMPUTE_CLABSOLUTEDIFFERENCEKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the absolute difference kernel.
- *
- * Absolute difference is computed by:
- * @f[ output(x,y) = | input1(x,y) - input2(x,y) | @f]
- */
-class CLAbsoluteDifferenceKernel : public ICLKernel
-{
-public:
-    /** Default constructor. */
-    CLAbsoluteDifferenceKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLAbsoluteDifferenceKernel(const CLAbsoluteDifferenceKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLAbsoluteDifferenceKernel &operator=(const CLAbsoluteDifferenceKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLAbsoluteDifferenceKernel(CLAbsoluteDifferenceKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLAbsoluteDifferenceKernel &operator=(CLAbsoluteDifferenceKernel &&) = default;
-    /** Default destructor */
-    ~CLAbsoluteDifferenceKernel() = default;
-
-    /** Set the inputs and output images.
-     *
-     * @param[in]  input1 Source tensor. Data types supported: U8/S16.
-     * @param[in]  input2 Source tensor. Data types supported: U8/S16.
-     * @param[out] output Destination tensor. Data types supported: U8/S16.
-     */
-    void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output);
-    /** Set the inputs and output images.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input1          Source tensor. Data types supported: U8/S16.
-     * @param[in]  input2          Source tensor. Data types supported: U8/S16.
-     * @param[out] output          Destination tensor. Data types supported: U8/S16.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_input1; /**< Source tensor 1. */
-    const ICLTensor *_input2; /**< Source tensor 2. */
-    ICLTensor       *_output; /**< Destination tensor. */
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLABSOLUTEDIFFERENCEKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLAccumulateKernel.h b/arm_compute/core/CL/kernels/CLAccumulateKernel.h
deleted file mode 100644
index f639148e25..0000000000
--- a/arm_compute/core/CL/kernels/CLAccumulateKernel.h
+++ /dev/null
@@ -1,114 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLACCUMULATEKERNEL_H
-#define ARM_COMPUTE_CLACCUMULATEKERNEL_H
-
-#include "arm_compute/core/CL/ICLSimple2DKernel.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the accumulate kernel.
- *
- * Accumulation is computed by:
- * @f[ accum(x,y) = accum(x,y) + input(x,y) @f]
- */
-class CLAccumulateKernel : public ICLSimple2DKernel
-{
-public:
-    /** Set the input and accumulation tensors.
-     *
-     * @param[in]  input Source tensor. Data types supported: U8.
-     * @param[out] accum Destination tensor. Data types supported: S16.
-     */
-    void configure(const ICLTensor *input, ICLTensor *accum);
-    /** Set the input and accumulation tensors.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Source tensor. Data types supported: U8.
-     * @param[out] accum           Destination tensor. Data types supported: S16.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *accum);
-};
-
-/** Interface for the accumulate weighted kernel.
- *
- * Weighted accumulation is computed:
- * @f[ accum(x,y) = (1 - \alpha)*accum(x,y) + \alpha*input(x,y) @f]
- *
- * Where @f$ 0 \le \alpha \le 1 @f$
- * Conceptually, the rounding for this is defined as:
- * @f[ output(x,y)= uint8( (1 - \alpha) * float32( int32( output(x,y) ) ) + \alpha * float32( int32( input(x,y) ) ) ) @f]
-*/
-class CLAccumulateWeightedKernel : public ICLSimple2DKernel
-{
-public:
-    /** Set the input and accumulation images, and the scale value.
-     *
-     * @param[in]     input Source tensor. Data types supported: U8.
-     * @param[in]     alpha Scalar value in the range [0, 1.0]. Data types supported: F32.
-     * @param[in,out] accum Accumulated tensor. Data types supported: U8.
-     */
-    void configure(const ICLTensor *input, float alpha, ICLTensor *accum);
-    /** Set the input and accumulation images, and the scale value.
-     *
-     * @param[in]     compile_context The compile context to be used.
-     * @param[in]     input           Source tensor. Data types supported: U8.
-     * @param[in]     alpha           Scalar value in the range [0, 1.0]. Data types supported: F32.
-     * @param[in,out] accum           Accumulated tensor. Data types supported: U8.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, float alpha, ICLTensor *accum);
-};
-
-/** Interface for the accumulate squared kernel.
- *
- * The accumulation of squares is computed:
- * @f[ accum(x,y) = saturate_{int16} ( (uint16) accum(x,y) + (((uint16)(input(x,y)^2)) >> (shift)) ) @f]
- *
- * Where @f$ 0 \le shift \le 15 @f$
-*/
-class CLAccumulateSquaredKernel : public ICLSimple2DKernel
-{
-public:
-    /** Set the input and accumulation tensors and the shift value.
-     *
-     * @param[in]     input Source tensor. Data types supported: U8.
-     * @param[in]     shift Shift value in the range of [0, 15]. Data types supported: U32.
-     * @param[in,out] accum Accumulated tensor. Data types supported: S16.
-     */
-    void configure(const ICLTensor *input, uint32_t shift, ICLTensor *accum);
-    /** Set the input and accumulation tensors and the shift value.
-     *
-     * @param[in]     compile_context The compile context to be used.
-     * @param[in]     input           Source tensor. Data types supported: U8.
-     * @param[in]     shift           Shift value in the range of [0, 15]. Data types supported: U32.
-     * @param[in,out] accum           Accumulated tensor. Data types supported: S16.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, uint32_t shift, ICLTensor *accum);
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLACCUMULATEKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLActivationLayerKernel.h b/arm_compute/core/CL/kernels/CLActivationLayerKernel.h
deleted file mode 100644
index 1e83a689cd..0000000000
--- a/arm_compute/core/CL/kernels/CLActivationLayerKernel.h
+++ /dev/null
@@ -1,89 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLACTIVATIONLAYERKERNEL_H
-#define ARM_COMPUTE_CLACTIVATIONLAYERKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-/** Interface for the activation layer kernel. */
-class CLActivationLayerKernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLActivationLayerKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLActivationLayerKernel(const CLActivationLayerKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLActivationLayerKernel &operator=(const CLActivationLayerKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLActivationLayerKernel(CLActivationLayerKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLActivationLayerKernel &operator=(CLActivationLayerKernel &&) = default;
-    /** Default destructor */
-    ~CLActivationLayerKernel() = default;
-    /** Set the input and output tensor.
-     *
-     * @note If the output tensor is a nullptr, the activation function will be performed in-place
-     *
-     * @param[in, out] input    Source tensor. In case of @p output tensor = nullptr, this tensor will store the result
-     *                          of the activation function. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM16/F16/F32.
-     * @param[out]     output   Destination tensor. Data type supported: same as @p input
-     * @param[in]      act_info Activation layer information.
-     */
-    void configure(ICLTensor *input, ICLTensor *output, ActivationLayerInfo act_info);
-    /** Set the input and output tensor.
-     *
-     * @note If the output tensor is a nullptr, the activation function will be performed in-place
-     *
-     * @param[in]      compile_context The compile context to be used.
-     * @param[in, out] input           Source tensor. In case of @p output tensor = nullptr, this tensor will store the result
-     *                                 of the activation function. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM16/F16/F32.
-     * @param[out]     output          Destination tensor. Data type supported: same as @p input
-     * @param[in]      act_info        Activation layer information.
-     */
-    void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, ActivationLayerInfo act_info);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLActivationLayerKernel
-     *
-     * @param[in] input    Source tensor info. In case of @p output tensor info = nullptr, this tensor will store the result
-     *                     of the activation function. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM16/F16/F32.
-     * @param[in] output   Destination tensor info. Data type supported: same as @p input
-     * @param[in] act_info Activation layer information.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    ICLTensor *_input;
-    ICLTensor *_output;
-    bool       _run_in_place;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLACTIVATIONLAYERKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLArgMinMaxLayerKernel.h b/arm_compute/core/CL/kernels/CLArgMinMaxLayerKernel.h
deleted file mode 100644
index 94e8baed13..0000000000
--- a/arm_compute/core/CL/kernels/CLArgMinMaxLayerKernel.h
+++ /dev/null
@@ -1,106 +0,0 @@
-/*
- * Copyright (c) 2019-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLARGMINMAXLAYERKERNEL_H
-#define ARM_COMPUTE_CLARGMINMAXLAYERKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the reduction operation kernel
- *
- * @note The default data type for an uninitialized output tensor is
- *       signed 32-bit integer (S32). It is the user's responsibility to check
- *       that the results do not overflow because the indices are computed
- *       in unsigned 32-bit (U32).
- */
-class CLArgMinMaxLayerKernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLArgMinMaxLayerKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLArgMinMaxLayerKernel(const CLArgMinMaxLayerKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLArgMinMaxLayerKernel &operator=(const CLArgMinMaxLayerKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLArgMinMaxLayerKernel(CLArgMinMaxLayerKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLArgMinMaxLayerKernel &operator=(CLArgMinMaxLayerKernel &&) = default;
-    /** Default destructor */
-    ~CLArgMinMaxLayerKernel() = default;
-
-    /** Set the input and output tensors.
-     *
-     * @param[in]  input       Source tensor. Data types supported: S32/F16/F32.
-     * @param[in]  prev_output Destination tensor of the previous iterations of @ref CLArgMinMaxLayerKernel. Data types supported: U32/S32
-     *                         Has to be nullptr for the first iteration
-     * @param[out] output      Destination tensor. Data types supported: U32/S32
-     *                         Output will have the same number of dimensions as input.
-     * @param[in]  axis        Axis along which to reduce. Supported reduction axis : 0,1,2,3
-     * @param[in]  op          Reduction operation to perform. Only ArgMin and ArgMax are supported.
-     */
-    void configure(const ICLTensor *input, const ICLTensor *prev_output, ICLTensor *output, unsigned int axis, ReductionOperation op);
-    /** Set the input and output tensors.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Source tensor. Data types supported: S32/F16/F32.
-     * @param[in]  prev_output     Destination tensor of the previous iterations of @ref CLArgMinMaxLayerKernel. Data types supported: U32/S32
-     *                             Has to be nullptr for the first iteration
-     * @param[out] output          Destination tensor. Data types supported: U32/S32
-     *                             Output will have the same number of dimensions as input.
-     * @param[in]  axis            Axis along which to reduce. Supported reduction axis : 0,1,2,3
-     * @param[in]  op              Reduction operation to perform. Only ArgMin and ArgMax are supported.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *prev_output, ICLTensor *output, unsigned int axis, ReductionOperation op);
-
-    /** Static function to check if given info will lead to a valid configuration of @ref CLArgMinMaxLayerKernel.
-     *
-     * @param[in] input       Source tensor info. Data types supported: S32/F16/F32.
-     * @param[in] prev_output Destination tensor info of the previous iterations. Data types supported: U32/S32
-     *                        Has to be nullptr for the first iteration
-     * @param[in] output      Destination tensor info. Data types supported: U32/S32
-     *                        Output will have the same number of dimensions as input.
-     * @param[in] axis        Axis along which to reduce. Supported reduction axis : 0,1,2,3
-     * @param[in] op          Reduction operation to perform.  Only ArgMin and ArgMax are supported.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *prev_output, const ITensorInfo *output, unsigned int axis, ReductionOperation op);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor   *_input;
-    const ICLTensor   *_prev_output;
-    ICLTensor         *_output;
-    unsigned int       _reduction_axis;
-    ReductionOperation _op;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLARGMINMAXLAYERKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLBatchConcatenateLayerKernel.h b/arm_compute/core/CL/kernels/CLBatchConcatenateLayerKernel.h
deleted file mode 100644
index 163666853c..0000000000
--- a/arm_compute/core/CL/kernels/CLBatchConcatenateLayerKernel.h
+++ /dev/null
@@ -1,95 +0,0 @@
-/*
- * Copyright (c) 2019-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef ARM_COMPUTE_CLBATCHCONCATENATEKERNEL_H
-#define ARM_COMPUTE_CLBATCHCONCATENATEKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the batch concatenate kernel.
- *  The input tensor will be concatenated into the output tensor.
- */
-class CLBatchConcatenateLayerKernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLBatchConcatenateLayerKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLBatchConcatenateLayerKernel(const CLBatchConcatenateLayerKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLBatchConcatenateLayerKernel &operator=(const CLBatchConcatenateLayerKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLBatchConcatenateLayerKernel(CLBatchConcatenateLayerKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLBatchConcatenateLayerKernel &operator=(CLBatchConcatenateLayerKernel &&) = default;
-    /** Default destructor */
-    ~CLBatchConcatenateLayerKernel() = default;
-    /** Initialise the kernel's inputs and output
-     *
-     * @param[in]     input        Input tensor. Data types supported: All.
-     * @param[in]     batch_offset The offset on axis # 3.
-     * @param[in,out] output       Output tensor. Data types supported: Same as @p input.
-     *
-     * @note: The output tensor's low two dimensions can't be smaller than the input one's.
-     * @note: The gaps between the two lowest dimensions of input and output need to be divisible by 2.
-     *
-     */
-    void configure(const ICLTensor *input, unsigned int batch_offset, ICLTensor *output);
-    /** Initialise the kernel's inputs and output
-     *
-     * @param[in]     compile_context The compile context to be used.
-     * @param[in]     input           Input tensor. Data types supported: All.
-     * @param[in]     batch_offset    The offset on axis # 3.
-     * @param[in,out] output          Output tensor. Data types supported: Same as @p input.
-     *
-     * @note: The output tensor's low two dimensions can't be smaller than the input one's.
-     * @note: The gaps between the two lowest dimensions of input and output need to be divisible by 2.
-     *
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, unsigned int batch_offset, ICLTensor *output);
-    /**  Static function to check if given info will lead to a valid configuration of @ref CLBatchConcatenateLayerKernel
-     *
-     * @param[in] input        Input tensor info. Data types supported: All.
-     * @param[in] batch_offset The offset on axis # 3.
-     * @param[in] output       Output tensor info. Data types supported: Same as @p input.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, unsigned int batch_offset, const ITensorInfo *output);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_input;
-    ICLTensor       *_output;
-    unsigned int     _batch_offset;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLBATCHCONCATENATEKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLBatchNormalizationLayerKernel.h b/arm_compute/core/CL/kernels/CLBatchNormalizationLayerKernel.h
deleted file mode 100644
index 8eaaca845a..0000000000
--- a/arm_compute/core/CL/kernels/CLBatchNormalizationLayerKernel.h
+++ /dev/null
@@ -1,120 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLBATCHNORMALIZATIONLAYERKERNEL_H
-#define ARM_COMPUTE_CLBATCHNORMALIZATIONLAYERKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the BatchNormalization layer kernel.
- */
-class CLBatchNormalizationLayerKernel : public ICLKernel
-{
-public:
-    /** Constructor */
-    CLBatchNormalizationLayerKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLBatchNormalizationLayerKernel(const CLBatchNormalizationLayerKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLBatchNormalizationLayerKernel &operator=(const CLBatchNormalizationLayerKernel &) = delete;
-    /** Default Move Constructor. */
-    CLBatchNormalizationLayerKernel(CLBatchNormalizationLayerKernel &&) = default;
-    /** Default move assignment operator */
-    CLBatchNormalizationLayerKernel &operator=(CLBatchNormalizationLayerKernel &&) = default;
-    /** Default destructor */
-    ~CLBatchNormalizationLayerKernel() = default;
-
-    /** Set the input and output tensors.
-     *
-     * @note If the output tensor is a nullptr, the batch normalization function will be performed in-place
-     *
-     * @param[in, out] input    Source tensor. In case of @p output tensor = nullptr, this tensor will store the result.
-     *                          3 lower dimensions represent a single input with dimensions [width, height, FM].
-     *                          The rest are optional and used for representing batches. Data types supported: F16/F32. Data layouts supported: NCHW/NHWC.
-     * @param[out]     output   Destination tensor. Output will have the same number of dimensions as input. Data type supported: same as @p input
-     * @param[in]      mean     Mean values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
-     * @param[in]      var      Variance values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
-     * @param[in]      beta     (Optional) Beta values tensor info. 1 dimension with size equal to the feature maps [FM]. If not provided, default value for beta is 0. Data types supported: Same as @p input
-     * @param[in]      gamma    (Optional) Gamma values tensor info. 1 dimension with size equal to the feature maps [FM]. If not provided, default value for gamma is 1. Data types supported: Same as @p input
-     * @param[in]      epsilon  (Optional) Small value to avoid division with zero. Default value is 0.001f.
-     * @param[in]      act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported.
-     */
-    void configure(ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta = nullptr, const ICLTensor *gamma = nullptr, float epsilon = 0.001f,
-                   ActivationLayerInfo act_info = ActivationLayerInfo());
-    /** Set the input and output tensors.
-     *
-     * @note If the output tensor is a nullptr, the batch normalization function will be performed in-place
-     *
-     * @param[in]      compile_context The compile context to be used.
-     * @param[in, out] input           Source tensor. In case of @p output tensor = nullptr, this tensor will store the result.
-     *                                 3 lower dimensions represent a single input with dimensions [width, height, FM].
-     *                                 The rest are optional and used for representing batches. Data types supported: F16/F32. Data layouts supported: NCHW/NHWC.
-     * @param[out]     output          Destination tensor. Output will have the same number of dimensions as input. Data type supported: same as @p input
-     * @param[in]      mean            Mean values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
-     * @param[in]      var             Variance values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
-     * @param[in]      beta            (Optional) Beta values tensor info. 1 dimension with size equal to the feature maps [FM]. If not provided, default value for beta is 0. Data types supported: Same as @p input
-     * @param[in]      gamma           (Optional) Gamma values tensor info. 1 dimension with size equal to the feature maps [FM]. If not provided, default value for gamma is 1. Data types supported: Same as @p input
-     * @param[in]      epsilon         (Optional) Small value to avoid division with zero. Default value is 0.001f.
-     * @param[in]      act_info        (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported.
-     */
-    void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta = nullptr,
-                   const ICLTensor *gamma = nullptr, float epsilon = 0.001f, ActivationLayerInfo act_info = ActivationLayerInfo());
-    /** Static function to check if given info will lead to a valid configuration of @ref CLBatchNormalizationLayerKernel
-     *
-     * @param[in] input    Source tensor info. In case of @p output tensor info = nullptr, this tensor will store the result.
-     *                     3 lower dimensions represent a single input with dimensions [width, height, FM].
-     *                     The rest are optional and used for representing batches. Data types supported: F16/F32. Data layouts supported: NCHW/NHWC.
-     * @param[in] output   Destination tensor info. Output will have the same number of dimensions as input. Data type supported: same as @p input
-     * @param[in] mean     Mean values tensor info. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
-     * @param[in] var      Variance values tensor info. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
-     * @param[in] beta     (Optional) Beta values tensor info. 1 dimension with size equal to the feature maps [FM]. If not provided, default value for beta is 0. Data types supported: Same as @p input
-     * @param[in] gamma    (Optional) Gamma values tensor info. 1 dimension with size equal to the feature maps [FM]. If not provided, default value for gamma is 1. Data types supported: Same as @p input
-     * @param[in] epsilon  (Optional) Small value to avoid division with zero. Default value is 0.001f.
-     * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output,
-                           const ITensorInfo *mean, const ITensorInfo *var,
-                           const ITensorInfo *beta = nullptr, const ITensorInfo *gamma = nullptr,
-                           float epsilon = 0.001f, ActivationLayerInfo act_info = ActivationLayerInfo());
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    ICLTensor       *_input;
-    ICLTensor       *_output;
-    const ICLTensor *_mean;
-    const ICLTensor *_var;
-    const ICLTensor *_beta;
-    const ICLTensor *_gamma;
-    float            _epsilon;
-    bool             _run_in_place;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLBATCHNORMALIZATIONLAYERKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLBatchToSpaceLayerKernel.h b/arm_compute/core/CL/kernels/CLBatchToSpaceLayerKernel.h
deleted file mode 100644
index 2b12ad094a..0000000000
--- a/arm_compute/core/CL/kernels/CLBatchToSpaceLayerKernel.h
+++ /dev/null
@@ -1,111 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLBATCHTOSPACELAYERKERNEL_H
-#define ARM_COMPUTE_CLBATCHTOSPACELAYERKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the batch to space kernel */
-class CLBatchToSpaceLayerKernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLBatchToSpaceLayerKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLBatchToSpaceLayerKernel(const CLBatchToSpaceLayerKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLBatchToSpaceLayerKernel &operator=(const CLBatchToSpaceLayerKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLBatchToSpaceLayerKernel(CLBatchToSpaceLayerKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLBatchToSpaceLayerKernel &operator=(CLBatchToSpaceLayerKernel &&) = default;
-    /** Default destructor */
-    ~CLBatchToSpaceLayerKernel() = default;
-    /** Initialise the kernel's inputs and output.
-     *
-     * @param[in]  input       Tensor input. Supported tensor rank: 4. Data types supported: All.
-     * @param[in]  block_shape 1-D tensor with shape [M]. Data types supported: S32
-     * @param[out] output      Tensor output. Data types supported: same as @p input
-     */
-    void configure(const ICLTensor *input, const ICLTensor *block_shape, ICLTensor *output);
-    /** Initialise the kernel's inputs and output.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Tensor input. Supported tensor rank: 4. Data types supported: All.
-     * @param[in]  block_shape     1-D tensor with shape [M]. Data types supported: S32
-     * @param[out] output          Tensor output. Data types supported: same as @p input
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *block_shape, ICLTensor *output);
-    /** Initialise the kernel's inputs and output (Static block shape).
-     *
-     * @param[in]  input         Tensor input. Supported tensor rank: 4. Data types supported: All.
-     * @param[in]  block_shape_x Block shape x value.
-     * @param[in]  block_shape_y Block shape y value.
-     * @param[out] output        Tensor output. Data types supported: same as @p input
-     */
-    void configure(const ICLTensor *input, const int32_t block_shape_x, const int32_t block_shape_y, ICLTensor *output);
-    /** Initialise the kernel's inputs and output (Static block shape).
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Tensor input. Supported tensor rank: 4. Data types supported: All.
-     * @param[in]  block_shape_x   Block shape x value.
-     * @param[in]  block_shape_y   Block shape y value.
-     * @param[out] output          Tensor output. Data types supported: same as @p input
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, const int32_t block_shape_x, const int32_t block_shape_y, ICLTensor *output);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLBatchToSpaceLayerKernel
-     *
-     * @param[in] input       Tensor input. Supported tensor rank: 4. Data types supported: All.
-     * @param[in] block_shape 1-D tensor with shape [M]. Data types supported: S32
-     * @param[in] output      Tensor output. Data types supported: same as @p input
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *output);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLBatchToSpaceLayerKernel (Static block shape).
-     *
-     * @param[in] input         Tensor input. Supported tensor rank: 4. Data types supported: All.
-     * @param[in] block_shape_x Block shape x value.
-     * @param[in] block_shape_y Block shape y value.
-     * @param[in] output        Tensor output. Data types supported: same as @p input
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const int32_t block_shape_x, const int32_t block_shape_y, const ITensorInfo *output);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_input;       /**< Source tensor */
-    const ICLTensor *_block_shape; /**< Block shape tensor */
-    ICLTensor       *_output;      /**< Destination tensor */
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLBATCHTOSPACELAYERKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLBitwiseAndKernel.h b/arm_compute/core/CL/kernels/CLBitwiseAndKernel.h
deleted file mode 100644
index 8defe32862..0000000000
--- a/arm_compute/core/CL/kernels/CLBitwiseAndKernel.h
+++ /dev/null
@@ -1,76 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLBITWISEANDKERNEL_H
-#define ARM_COMPUTE_CLBITWISEANDKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the bitwise AND operation kernel.
- *
- * Result is computed by:
- * @f[ output(x,y) = input1(x,y) \land input2(x,y) @f]
- */
-class CLBitwiseAndKernel : public ICLKernel
-{
-public:
-    /** Default constructor. */
-    CLBitwiseAndKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLBitwiseAndKernel(const CLBitwiseAndKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLBitwiseAndKernel &operator=(const CLBitwiseAndKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLBitwiseAndKernel(CLBitwiseAndKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLBitwiseAndKernel &operator=(CLBitwiseAndKernel &&) = default;
-    /** Set the inputs and output images
-     *
-     * @param[in]  input1 Source tensor. Data types supported: U8.
-     * @param[in]  input2 Source tensor. Data types supported: U8.
-     * @param[out] output Destination tensor. Data types supported: U8.
-     */
-    void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output);
-    /** Set the inputs and output images
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input1          Source tensor. Data types supported: U8.
-     * @param[in]  input2          Source tensor. Data types supported: U8.
-     * @param[out] output          Destination tensor. Data types supported: U8.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_input1; /**< Source tensor 1 */
-    const ICLTensor *_input2; /**< Source tensor 2 */
-    ICLTensor       *_output; /**< Destination tensor */
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLBITWISEANDKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLBitwiseNotKernel.h b/arm_compute/core/CL/kernels/CLBitwiseNotKernel.h
deleted file mode 100644
index b86ce7f173..0000000000
--- a/arm_compute/core/CL/kernels/CLBitwiseNotKernel.h
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLBITWISENOTKERNEL_H
-#define ARM_COMPUTE_CLBITWISENOTKERNEL_H
-
-#include "arm_compute/core/CL/ICLSimple2DKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the bitwise NOT operation kernel.
- *
- * Result is computed by:
- * @f[ output(x,y) = \lnot input(x,y) @f]
- */
-class CLBitwiseNotKernel : public ICLSimple2DKernel
-{
-public:
-    /** Set the inputs and output images.
-     *
-     * @param[in]  input  Source tensor. Data types supported: U8.
-     * @param[out] output Destination tensor. Data types supported: U8.
-     */
-    void configure(const ICLTensor *input, ICLTensor *output);
-    /** Set the inputs and output images.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Source tensor. Data types supported: U8.
-     * @param[out] output          Destination tensor. Data types supported: U8.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output);
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLBITWISENOTKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLBitwiseOrKernel.h b/arm_compute/core/CL/kernels/CLBitwiseOrKernel.h
deleted file mode 100644
index 65eb50f0fd..0000000000
--- a/arm_compute/core/CL/kernels/CLBitwiseOrKernel.h
+++ /dev/null
@@ -1,76 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLBITWISEORKERNEL_H
-#define ARM_COMPUTE_CLBITWISEORKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the bitwise OR operation kernel.
- *
- * Result is computed by:
- * @f[ output(x,y) = input1(x,y) \lor input2(x,y) @f]
- */
-class CLBitwiseOrKernel : public ICLKernel
-{
-public:
-    /** Default constructor. */
-    CLBitwiseOrKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLBitwiseOrKernel(const CLBitwiseOrKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLBitwiseOrKernel &operator=(const CLBitwiseOrKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLBitwiseOrKernel(CLBitwiseOrKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLBitwiseOrKernel &operator=(CLBitwiseOrKernel &&) = default;
-    /** Set the inputs and output images
-     *
-     * @param[in]  input1 Source tensor. Data types supported: U8.
-     * @param[in]  input2 Source tensor. Data types supported: U8.
-     * @param[out] output Destination tensor. Data types supported: U8.
-     */
-    void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output);
-    /** Set the inputs and output images
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input1          Source tensor. Data types supported: U8.
-     * @param[in]  input2          Source tensor. Data types supported: U8.
-     * @param[out] output          Destination tensor. Data types supported: U8.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_input1; /**< Source tensor 1 */
-    const ICLTensor *_input2; /**< Source tensor 2 */
-    ICLTensor       *_output; /**< Destination tensor */
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLBITWISEORKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLBitwiseXorKernel.h b/arm_compute/core/CL/kernels/CLBitwiseXorKernel.h
deleted file mode 100644
index 5c63a7f22c..0000000000
--- a/arm_compute/core/CL/kernels/CLBitwiseXorKernel.h
+++ /dev/null
@@ -1,76 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLBITWISEXORKERNEL_H
-#define ARM_COMPUTE_CLBITWISEXORKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the bitwise XOR operation kernel.
- *
- * Result is computed by:
- * @f[ output(x,y) = input1(x,y) \oplus input2(x,y) @f]
- */
-class CLBitwiseXorKernel : public ICLKernel
-{
-public:
-    /** Default constructor. */
-    CLBitwiseXorKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLBitwiseXorKernel(const CLBitwiseXorKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLBitwiseXorKernel &operator=(const CLBitwiseXorKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLBitwiseXorKernel(CLBitwiseXorKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLBitwiseXorKernel &operator=(CLBitwiseXorKernel &&) = default;
-    /** Set the inputs and output images
-     *
-     * @param[in]  input1 Source tensor. Data types supported: U8.
-     * @param[in]  input2 Source tensor. Data types supported: U8.
-     * @param[out] output Destination tensor. Data types supported: U8.
-     */
-    void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output);
-    /** Set the inputs and output images
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input1          Source tensor. Data types supported: U8.
-     * @param[in]  input2          Source tensor. Data types supported: U8.
-     * @param[out] output          Destination tensor. Data types supported: U8.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_input1; /**< Source tensor 1 */
-    const ICLTensor *_input2; /**< Source tensor 2 */
-    ICLTensor       *_output; /**< Destination tensor */
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLBITWISEXORKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLBoundingBoxTransformKernel.h b/arm_compute/core/CL/kernels/CLBoundingBoxTransformKernel.h
deleted file mode 100644
index bbe11562ed..0000000000
--- a/arm_compute/core/CL/kernels/CLBoundingBoxTransformKernel.h
+++ /dev/null
@@ -1,99 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLBOUNDINGBOXTRANSFORMKERNEL_H
-#define ARM_COMPUTE_CLBOUNDINGBOXTRANSFORMKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the bounding box kernel */
-class CLBoundingBoxTransformKernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLBoundingBoxTransformKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLBoundingBoxTransformKernel(const CLBoundingBoxTransformKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLBoundingBoxTransformKernel &operator=(const CLBoundingBoxTransformKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLBoundingBoxTransformKernel(CLBoundingBoxTransformKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLBoundingBoxTransformKernel &operator=(CLBoundingBoxTransformKernel &&) = default;
-    /** Default destructor */
-    ~CLBoundingBoxTransformKernel() = default;
-
-    /** Set the input and output tensors.
-     *
-     * @param[in]  boxes      Source tensor. Bounding box proposals in pixel coordinates. Size(M, 4), format [x1, y1, x2, y2]. Data types supported: QASYMM16/F16/F32.
-     * @param[out] pred_boxes Destination tensor. Pixel coordinates of the transformed bounding boxes. Size (M, 4*K), format [x1, y1, x2, y2]. Data types supported: Same as @p input
-     * @param[in]  deltas     Bounding box translations and scales. Size (M, 4*K), format [dx, dy, dw, dh], K  is the number of classes.
-     *                        Data types supported: QASYMM8 if @p input is QASYMM16, otherwise same as @p input
-     * @param[in]  info       Contains BoundingBox operation information described in @ref BoundingBoxTransformInfo.
-     *
-     * @note Only single image prediction is supported. Height and Width (and scale) of the image will be contained in the BoundingBoxTransformInfo struct.
-     *
-     */
-    void configure(const ICLTensor *boxes, ICLTensor *pred_boxes, const ICLTensor *deltas, const BoundingBoxTransformInfo &info);
-    /** Set the input and output tensors.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  boxes           Source tensor. Bounding box proposals in pixel coordinates. Size(M, 4), format [x1, y1, x2, y2]. Data types supported: QASYMM16/F16/F32.
-     * @param[out] pred_boxes      Destination tensor. Pixel coordinates of the transformed bounding boxes. Size (M, 4*K), format [x1, y1, x2, y2]. Data types supported: Same as @p input
-     * @param[in]  deltas          Bounding box translations and scales. Size (M, 4*K), format [dx, dy, dw, dh], K  is the number of classes.
-     *                             Data types supported: QASYMM8 if @p input is QASYMM16, otherwise same as @p input
-     * @param[in]  info            Contains BoundingBox operation information described in @ref BoundingBoxTransformInfo.
-     *
-     * @note Only single image prediction is supported. Height and Width (and scale) of the image will be contained in the BoundingBoxTransformInfo struct.
-     *
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *boxes, ICLTensor *pred_boxes, const ICLTensor *deltas, const BoundingBoxTransformInfo &info);
-
-    /** Static function to check if given info will lead to a valid configuration of @ref CLBoundingBoxTransform
-     *
-     * @param[in] boxes      Source tensor info. Bounding box proposals in pixel coordinates. Size(M, 4), format [x1, y1, x2, y2]. Data types supported: QASYMM16/F16/F32.
-     * @param[in] pred_boxes Destination tensor info. Pixel coordinates of the transformed bounding boxes. Size (M, 4*K), format [x1, y1, x2, y2]. Data types supported: Same as @p input
-     * @param[in] deltas     Bounding box translations and scales. Size (M, 4*K), format [dx, dy, dw, dh], K  is the number of classes.
-     *                       Data types supported: QASYMM8 if @p input is QASYMM16, otherwise same as @p input
-     * @param[in] info       Contains BoundingBox operation information described in @ref BoundingBoxTransformInfo.
-     *
-     * @note Only single image prediction is supported. Height and Width (and scale) of the image will be contained in the BoundingBoxTransformInfo struct.
-     *
-     * @return a Status
-     */
-    static Status validate(const ITensorInfo *boxes, const ITensorInfo *pred_boxes, const ITensorInfo *deltas, const BoundingBoxTransformInfo &info);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_boxes;
-    ICLTensor       *_pred_boxes;
-    const ICLTensor *_deltas;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLBOUNDINGBOXTRANSFORMKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLBox3x3Kernel.h b/arm_compute/core/CL/kernels/CLBox3x3Kernel.h
deleted file mode 100644
index ea3c1c1f3e..0000000000
--- a/arm_compute/core/CL/kernels/CLBox3x3Kernel.h
+++ /dev/null
@@ -1,59 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLBOX3X3KERNEL_H
-#define ARM_COMPUTE_CLBOX3X3KERNEL_H
-
-#include "arm_compute/core/CL/ICLSimple2DKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the box 3x3 filter kernel.
- *
- */
-class CLBox3x3Kernel : public ICLSimple2DKernel
-{
-public:
-    /**Initialise the kernel's input and output.
-     *
-     * @param[in]  input            An input tensor. Data types supported: U8
-     * @param[out] output           The output tensor. Data types supported: U8.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const ICLTensor *input, ICLTensor *output, bool border_undefined);
-    /**Initialise the kernel's input and output.
-     *
-     * @param[in]  compile_context  The compile context to be used.
-     * @param[in]  input            An input tensor. Data types supported: U8
-     * @param[out] output           The output tensor. Data types supported: U8.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, bool border_undefined);
-
-    //Inherited methods overriden:
-    BorderSize border_size() const override;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLBOX3X3KERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLCannyEdgeKernel.h b/arm_compute/core/CL/kernels/CLCannyEdgeKernel.h
deleted file mode 100644
index 40ad4dcd84..0000000000
--- a/arm_compute/core/CL/kernels/CLCannyEdgeKernel.h
+++ /dev/null
@@ -1,187 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLCANNYEDGEKERNEL_H
-#define ARM_COMPUTE_CLCANNYEDGEKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel to perform Gradient computation.
- */
-class CLGradientKernel : public ICLKernel
-{
-public:
-    /** Constructor */
-    CLGradientKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLGradientKernel(const CLGradientKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLGradientKernel &operator=(const CLGradientKernel &) = delete;
-    /** Initialise the kernel's sources, destinations and border mode.
-     *
-     * @note gx, gy and mag must all be the same size (either 16 or 32).
-     *
-     * @param[in]  gx        Source tensor - Gx component. Data types supported: S16/S32.
-     * @param[in]  gy        Source tensor - Gy component. Data types supported: Same as gx.
-     * @param[out] magnitude Destination tensor - Magnitude. Data types supported: U16/U32. Must match the pixel size of gx, gy.
-     * @param[out] phase     Destination tensor - Quantized phase. Data types supported: U8.
-     * @param[in]  norm_type Normalization type. if 1, L1-Norm otherwise L2-Norm.
-     */
-    void configure(const ICLTensor *gx, const ICLTensor *gy, ICLTensor *magnitude, ICLTensor *phase, int32_t norm_type);
-    /** Initialise the kernel's sources, destinations and border mode.
-     *
-     * @note gx, gy and mag must all be the same size (either 16 or 32).
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  gx              Source tensor - Gx component. Data types supported: S16/S32.
-     * @param[in]  gy              Source tensor - Gy component. Data types supported: Same as gx.
-     * @param[out] magnitude       Destination tensor - Magnitude. Data types supported: U16/U32. Must match the pixel size of gx, gy.
-     * @param[out] phase           Destination tensor - Quantized phase. Data types supported: U8.
-     * @param[in]  norm_type       Normalization type. if 1, L1-Norm otherwise L2-Norm.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *gx, const ICLTensor *gy, ICLTensor *magnitude, ICLTensor *phase, int32_t norm_type);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_gx;        /**< Source tensor - Gx component */
-    const ICLTensor *_gy;        /**< Source tensor - Gy component */
-    ICLTensor       *_magnitude; /**< Destination tensor - Magnitude */
-    ICLTensor       *_phase;     /**< Destination tensor - Quantized phase */
-};
-
-/** OpenCL kernel to perform Non-Maxima suppression for Canny Edge.
- *
- * @note This kernel is meant to be used alongside CannyEdge and performs a non-maxima suppression using magnitude and phase of input
- *       to characterize points as possible edges. The output buffer needs to be cleared before this kernel is executed.
- *
- * @note Hysteresis is computed in @ref CLEdgeTraceKernel
- */
-class CLEdgeNonMaxSuppressionKernel : public ICLKernel
-{
-public:
-    /** Constructor */
-    CLEdgeNonMaxSuppressionKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLEdgeNonMaxSuppressionKernel(const CLEdgeNonMaxSuppressionKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLEdgeNonMaxSuppressionKernel &operator=(const CLEdgeNonMaxSuppressionKernel &) = delete;
-    /** Initialise the kernel's sources, destination and border mode.
-     *
-     * @param[in]  magnitude        Source tensor - Magnitude. Data types supported: U16/U32.
-     * @param[in]  phase            Source tensor - Quantized phase. Data types supported: U8.
-     * @param[out] output           Destination tensor. Data types supported: U16/U32.
-     * @param[in]  lower_thr        Lower threshold.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const ICLTensor *magnitude, const ICLTensor *phase, ICLTensor *output, int32_t lower_thr, bool border_undefined);
-    /** Initialise the kernel's sources, destination and border mode.
-     *
-     * @param[in]  compile_context  The compile context to be used.
-     * @param[in]  magnitude        Source tensor - Magnitude. Data types supported: U16/U32.
-     * @param[in]  phase            Source tensor - Quantized phase. Data types supported: U8.
-     * @param[out] output           Destination tensor. Data types supported: U16/U32.
-     * @param[in]  lower_thr        Lower threshold.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *magnitude, const ICLTensor *phase, ICLTensor *output, int32_t lower_thr, bool border_undefined);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-    BorderSize border_size() const override;
-
-private:
-    const ICLTensor *_magnitude; /**< Source tensor - Magnitude. */
-    const ICLTensor *_phase;     /**< Source tensor - Quantized phase. */
-    ICLTensor       *_output;    /**< Destination tensor. */
-};
-
-/** OpenCL kernel to perform Edge tracing.
- */
-class CLEdgeTraceKernel : public ICLKernel
-{
-public:
-    /** Constructor */
-    CLEdgeTraceKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLEdgeTraceKernel(const CLEdgeTraceKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLEdgeTraceKernel &operator=(const CLEdgeTraceKernel &) = delete;
-    /** Initialise the kernel's source, destination and border mode.
-     *
-     * @param[in]     input            Source tensor. Data types supported: U8.
-     * @param[out]    output           Destination tensor. Data types supported: U8.
-     * @param[in]     upper_thr        Upper threshold used for the hysteresis
-     * @param[in]     lower_thr        Lower threshold used for the hysteresis
-     * @param[in,out] visited          Tensor for keeping the visited pixels. Data types supported: U32.
-     *                                 Expected to be initialized to 0 before each run.
-     * @param[in,out] recorded         Tensor for keeping the recorded pixels. Data types supported: U32
-     *                                 Expected to be initialized to 0 before each run.
-     * @param[in,out] l1_stack         Tensor with the L1 stack for each pixel. Data types supported: S32.
-     *                                 Expected to be initialized to 0 before each run.
-     * @param[in,out] l1_stack_counter Tensor for counting the elements in the L1 stack of each pixel. Data types supported: U8.
-     *                                              Expected to be initialized to 0 before each run.
-     */
-    void configure(const ICLTensor *input, ICLTensor *output, int32_t upper_thr, int32_t lower_thr,
-                   ICLTensor *visited, ICLTensor *recorded, ICLTensor *l1_stack, ICLTensor *l1_stack_counter);
-    /** Initialise the kernel's source, destination and border mode.
-     *
-     * @param[in]     compile_context  The compile context to be used.
-     * @param[in]     input            Source tensor. Data types supported: U8.
-     * @param[out]    output           Destination tensor. Data types supported: U8.
-     * @param[in]     upper_thr        Upper threshold used for the hysteresis
-     * @param[in]     lower_thr        Lower threshold used for the hysteresis
-     * @param[in,out] visited          Tensor for keeping the visited pixels. Data types supported: U32.
-     *                                 Expected to be initialized to 0 before each run.
-     * @param[in,out] recorded         Tensor for keeping the recorded pixels. Data types supported: U32
-     *                                 Expected to be initialized to 0 before each run.
-     * @param[in,out] l1_stack         Tensor with the L1 stack for each pixel. Data types supported: S32.
-     *                                 Expected to be initialized to 0 before each run.
-     * @param[in,out] l1_stack_counter Tensor for counting the elements in the L1 stack of each pixel. Data types supported: U8.
-     *                                              Expected to be initialized to 0 before each run.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t upper_thr, int32_t lower_thr,
-                   ICLTensor *visited, ICLTensor *recorded, ICLTensor *l1_stack, ICLTensor *l1_stack_counter);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_input;            /**< Source tensor. */
-    ICLTensor       *_output;           /**< Destination tensor. */
-    int32_t          _lower_thr;        /**< Lower threshold used for the hysteresis. */
-    int32_t          _upper_thr;        /**< Upper threshold used for the hysteresis. */
-    ICLTensor       *_visited;          /**< Marks visited elements */
-    ICLTensor       *_recorded;         /**< Marks recorded elements */
-    ICLTensor       *_l1_stack;         /**< L1 hysteris stack */
-    ICLTensor       *_l1_stack_counter; /**< L1 hysteris stack counter */
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLCANNYEDGEKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLChannelCombineKernel.h b/arm_compute/core/CL/kernels/CLChannelCombineKernel.h
deleted file mode 100644
index 32ddf152c3..0000000000
--- a/arm_compute/core/CL/kernels/CLChannelCombineKernel.h
+++ /dev/null
@@ -1,102 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLCHANNELCOMBINEKERNEL_H
-#define ARM_COMPUTE_CLCHANNELCOMBINEKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-#include <array>
-#include <cstdint>
-
-namespace arm_compute
-{
-class ICLMultiImage;
-class ICLTensor;
-using ICLImage = ICLTensor;
-
-/** Interface for the channel combine kernel */
-class CLChannelCombineKernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLChannelCombineKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLChannelCombineKernel(const CLChannelCombineKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLChannelCombineKernel &operator=(const CLChannelCombineKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLChannelCombineKernel(CLChannelCombineKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLChannelCombineKernel &operator=(CLChannelCombineKernel &&) = default;
-    /** Default destructor */
-    ~CLChannelCombineKernel() = default;
-    /** Configure function's inputs and outputs.
-     *
-     * @param[in]  plane0 The 2D plane that forms channel 0. Must be of U8 format.
-     * @param[in]  plane1 The 2D plane that forms channel 1. Must be of U8 format.
-     * @param[in]  plane2 The 2D plane that forms channel 2. Must be of U8 format.
-     * @param[in]  plane3 The 2D plane that forms channel 3. Must be of U8 format.
-     * @param[out] output The single planar output tensor.
-     */
-    void configure(const ICLTensor *plane0, const ICLTensor *plane1, const ICLTensor *plane2, const ICLTensor *plane3, ICLTensor *output);
-    /** Configure function's inputs and outputs.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  plane0          The 2D plane that forms channel 0. Must be of U8 format.
-     * @param[in]  plane1          The 2D plane that forms channel 1. Must be of U8 format.
-     * @param[in]  plane2          The 2D plane that forms channel 2. Must be of U8 format.
-     * @param[in]  plane3          The 2D plane that forms channel 3. Must be of U8 format.
-     * @param[out] output          The single planar output tensor.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *plane0, const ICLTensor *plane1, const ICLTensor *plane2, const ICLTensor *plane3, ICLTensor *output);
-    /** Configure function's inputs and outputs.
-     *
-     * @param[in]  plane0 The 2D plane that forms channel 0. Must be of U8 format.
-     * @param[in]  plane1 The 2D plane that forms channel 1. Must be of U8 format.
-     * @param[in]  plane2 The 2D plane that forms channel 2. Must be of U8 format.
-     * @param[out] output The multi planar output tensor.
-     */
-    void configure(const ICLImage *plane0, const ICLImage *plane1, const ICLImage *plane2, ICLMultiImage *output);
-    /** Configure function's inputs and outputs.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  plane0          The 2D plane that forms channel 0. Must be of U8 format.
-     * @param[in]  plane1          The 2D plane that forms channel 1. Must be of U8 format.
-     * @param[in]  plane2          The 2D plane that forms channel 2. Must be of U8 format.
-     * @param[out] output          The multi planar output tensor.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLImage *plane0, const ICLImage *plane1, const ICLImage *plane2, ICLMultiImage *output);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    std::array<const ICLTensor *, 4> _planes;
-    ICLTensor     *_output;
-    ICLMultiImage *_output_multi;
-    std::array<uint32_t, 3> _x_subsampling;
-    std::array<uint32_t, 3> _y_subsampling;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLCHANNELCOMBINEKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLChannelExtractKernel.h b/arm_compute/core/CL/kernels/CLChannelExtractKernel.h
deleted file mode 100644
index 6a0c4bb94e..0000000000
--- a/arm_compute/core/CL/kernels/CLChannelExtractKernel.h
+++ /dev/null
@@ -1,95 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLCHANNELEXTRACTKERNEL_H
-#define ARM_COMPUTE_CLCHANNELEXTRACTKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/core/Types.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-class ICLMultiImage;
-class ICLTensor;
-using ICLImage = ICLTensor;
-
-/** Interface for the channel extract kernel */
-class CLChannelExtractKernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLChannelExtractKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLChannelExtractKernel(const CLChannelExtractKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLChannelExtractKernel &operator=(const CLChannelExtractKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLChannelExtractKernel(CLChannelExtractKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLChannelExtractKernel &operator=(CLChannelExtractKernel &&) = default;
-    /** Default destructor */
-    ~CLChannelExtractKernel() = default;
-    /** Set the input and output of the kernel
-     *
-     * @param[in]  input   Source tensor. Formats supported: RGB888/RGBA8888/YUYV422/UYVY422
-     * @param[in]  channel Channel to extract.
-     * @param[out] output  Destination tensor. Must be of U8 format.
-     */
-    void configure(const ICLTensor *input, Channel channel, ICLTensor *output);
-    /** Set the input and output of the kernel
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Source tensor. Formats supported: RGB888/RGBA8888/YUYV422/UYVY422
-     * @param[in]  channel         Channel to extract.
-     * @param[out] output          Destination tensor. Must be of U8 format.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, Channel channel, ICLTensor *output);
-    /** Set the input and output of the kernel
-     *
-     * @param[in]  input   Multi-planar source image. Formats supported: NV12/NV21/IYUV/YUV444
-     * @param[in]  channel Channel to extract.
-     * @param[out] output  Single-planar 2D destination image. Must be of U8 format.
-     */
-    void configure(const ICLMultiImage *input, Channel channel, ICLImage *output);
-    /** Set the input and output of the kernel
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Multi-planar source image. Formats supported: NV12/NV21/IYUV/YUV444
-     * @param[in]  channel         Channel to extract.
-     * @param[out] output          Single-planar 2D destination image. Must be of U8 format.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLMultiImage *input, Channel channel, ICLImage *output);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_input;
-    ICLTensor       *_output;
-    uint32_t         _num_elems_processed_per_iteration;
-    uint32_t         _subsampling;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLCHANNELEXTRACTKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLChannelShuffleLayerKernel.h b/arm_compute/core/CL/kernels/CLChannelShuffleLayerKernel.h
deleted file mode 100644
index 14b59d325f..0000000000
--- a/arm_compute/core/CL/kernels/CLChannelShuffleLayerKernel.h
+++ /dev/null
@@ -1,82 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLCHANNELSHUFFLELAYERKERNEL_H
-#define ARM_COMPUTE_CLCHANNELSHUFFLELAYERKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the channel shuffle kernel */
-class CLChannelShuffleLayerKernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLChannelShuffleLayerKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLChannelShuffleLayerKernel(const CLChannelShuffleLayerKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLChannelShuffleLayerKernel &operator=(const CLChannelShuffleLayerKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLChannelShuffleLayerKernel(CLChannelShuffleLayerKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLChannelShuffleLayerKernel &operator=(CLChannelShuffleLayerKernel &&) = default;
-    /** Default destructor */
-    ~CLChannelShuffleLayerKernel() = default;
-    /** Configure function's inputs and outputs.
-     *
-     * @param[in]  input      Input tensor. Data types supported: All.
-     * @param[out] output     Output tensor. Data type supported: Same as @p input
-     * @param[in]  num_groups Number of groups. Must be greater than 1 and the number of channels of the tensors must be a multiple of the number of groups.
-     */
-    void configure(const ICLTensor *input, ICLTensor *output, unsigned int num_groups);
-    /** Configure function's inputs and outputs.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Input tensor. Data types supported: All.
-     * @param[out] output          Output tensor. Data type supported: Same as @p input
-     * @param[in]  num_groups      Number of groups. Must be greater than 1 and the number of channels of the tensors must be a multiple of the number of groups.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, unsigned int num_groups);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLChannelShuffleLayerKernel
-     *
-     * @param[in] input      Input tensor info. Data types supported: All.
-     * @param[in] output     Output tensor info. Data type supported: Same as @p input
-     * @param[in] num_groups Number of groups. Must be greater than 1 and the number of channels of the tensors must be a multiple of the number of groups.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int num_groups);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_input;
-    ICLTensor       *_output;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLCHANNELSHUFFLELAYERKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLCol2ImKernel.h b/arm_compute/core/CL/kernels/CLCol2ImKernel.h
deleted file mode 100644
index d0528ed21a..0000000000
--- a/arm_compute/core/CL/kernels/CLCol2ImKernel.h
+++ /dev/null
@@ -1,106 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLCOL2IMKERNEL_H
-#define ARM_COMPUTE_CLCOL2IMKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the col2im reshaping kernel.
- *
- * Rearranges each matrix column into image blocks. It's the inverse operation of @ref CLIm2ColKernel.
- *
- * For example, a vector of 9 elements can be reshaped to a block(image) of 3x3:
- *
- * @f[
- * \left( \begin{array}{ccccccccc}
- * a0 & a1 & a2 & a3 & a4 & a5 & a6 & a7 & a8 \\
- * \end{array} \right)
- * \rightarrow
- * \left( \begin{array}{ccc}
- * a0 & a1 & a2 \\
- * a3 & a4 & a5 \\
- * a6 & a7 & a8 \\
- * \end{array} \right)
- * @f]
- */
-class CLCol2ImKernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLCol2ImKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLCol2ImKernel(const CLCol2ImKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLCol2ImKernel &operator=(const CLCol2ImKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLCol2ImKernel(CLCol2ImKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLCol2ImKernel &operator=(CLCol2ImKernel &&) = default;
-    /** Default destructor */
-    ~CLCol2ImKernel() = default;
-    /** Set the input and output of the kernel.
-     *
-     * @param[in]  input          The input tensor to convert. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32
-     * @param[out] output         The output tensor. 3 lower dimensions represent a single output [width, height, OFM],
-     *                            while the rest represent batch of outputs. Data types supported: Same as @p input. Data layout: NCHW
-     * @param[in]  convolved_dims Output convolved dimensions.
-     * @param[in]  num_groups     (Optional) Number of groups when performing a grouped convolution
-     */
-    void configure(const ICLTensor *input, ICLTensor *output, const Size2D &convolved_dims, unsigned int num_groups = 1);
-    /** Set the input and output of the kernel.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           The input tensor to convert. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32
-     * @param[out] output          The output tensor. 3 lower dimensions represent a single output [width, height, OFM],
-     *                             while the rest represent batch of outputs. Data types supported: Same as @p input. Data layout: NCHW
-     * @param[in]  convolved_dims  Output convolved dimensions.
-     * @param[in]  num_groups      (Optional) Number of groups when performing a grouped convolution
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const Size2D &convolved_dims, unsigned int num_groups = 1);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLCol2ImKernel
-     *
-     * @param[in] input          The input tensor to convert. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32
-     * @param[in] output         The output tensor. 3 lower dimensions represent a single output [width, height, OFM],
-     *                           while the rest represent batch of outputs. Data types supported: Same as @p input. Data layout: NCHW
-     * @param[in] convolved_dims Output convolved dimensions.
-     * @param[in] num_groups     (Optional) Number of groups when performing a grouped convolution
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &convolved_dims, unsigned int num_groups = 1);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-public:
-    const ICLTensor *_input;
-    ICLTensor       *_output;
-    Size2D           _convolved_dims;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLCOL2IMKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLColorConvertKernel.h b/arm_compute/core/CL/kernels/CLColorConvertKernel.h
deleted file mode 100644
index 2bcd141863..0000000000
--- a/arm_compute/core/CL/kernels/CLColorConvertKernel.h
+++ /dev/null
@@ -1,121 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLCOLORCONVERTKERNEL_H
-#define ARM_COMPUTE_CLCOLORCONVERTKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLMultiImage;
-class ICLTensor;
-using ICLImage = ICLTensor;
-
-/** Interface for the color convert kernel.
- *
- */
-class CLColorConvertKernel : public ICLKernel
-{
-public:
-    /** Default constructor. */
-    CLColorConvertKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLColorConvertKernel(const CLColorConvertKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLColorConvertKernel &operator=(const CLColorConvertKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLColorConvertKernel(CLColorConvertKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLColorConvertKernel &operator=(CLColorConvertKernel &&) = default;
-    /** Default destructor. */
-    ~CLColorConvertKernel() = default;
-
-    /** Set the input and output of the kernel
-     *
-     * @param[in]  input  Source tensor. Formats supported: RGBA8888/UYVY422/YUYV422/RGB888
-     * @param[out] output Destination tensor. Formats supported: RGB888 (if the formats of @p input are RGBA8888/UYVY422/YUYV422),
-     *                                                          RGBA8888 (if the formats of @p input are UYVY422/YUYV422/RGB888/),
-     *                                                          U8 (if the formats of @p input is RGB888)
-     */
-    void configure(const ICLTensor *input, ICLTensor *output);
-    /** Set the input and output of the kernel
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Source tensor. Formats supported: RGBA8888/UYVY422/YUYV422/RGB888
-     * @param[out] output          Destination tensor. Formats supported: RGB888 (if the formats of @p input are RGBA8888/UYVY422/YUYV422),
-     *                                                          RGBA8888 (if the formats of @p input are UYVY422/YUYV422/RGB888/),
-     *                                                          U8 (if the formats of @p input is RGB888)
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output);
-    /** Set the input and output of the kernel
-     *
-     * @param[in]  input  Multi-planar source image. Formats supported: NV12/NV21/IYUV
-     * @param[out] output Single-planar destination image. Formats supported: RGB888/RGBA8888
-     */
-    void configure(const ICLMultiImage *input, ICLImage *output);
-    /** Set the input and output of the kernel
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Multi-planar source image. Formats supported: NV12/NV21/IYUV
-     * @param[out] output          Single-planar destination image. Formats supported: RGB888/RGBA8888
-     */
-    void configure(const CLCompileContext &compile_context, const ICLMultiImage *input, ICLImage *output);
-    /** Set the input and output of the kernel
-     *
-     * @param[in]  input  Single-planar source image. Formats supported: RGB888/RGBA8888/UYVY422/YUYV422
-     * @param[out] output Multi-planar destination image. Formats supported: NV12/IYUV/YUV444 (if the formats of @p input are RGB888/RGB8888)
-     */
-    void configure(const ICLImage *input, ICLMultiImage *output);
-    /** Set the input and output of the kernel
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Single-planar source image. Formats supported: RGB888/RGBA8888/UYVY422/YUYV422
-     * @param[out] output          Multi-planar destination image. Formats supported: NV12/IYUV/YUV444 (if the formats of @p input are RGB888/RGB8888)
-     */
-    void configure(const CLCompileContext &compile_context, const ICLImage *input, ICLMultiImage *output);
-    /** Set the input and output of the kernel
-     *
-     * @param[in]  input  Multi-planar source image. Formats supported: NV12/NV21/IYUV
-     * @param[out] output Multi-planar destination image. Formats supported: YUV444/IYUV (if the formats of @p input are NV12/NV21)/NV12 (if the format of  @p input is IYUV)
-     */
-    void configure(const ICLMultiImage *input, ICLMultiImage *output);
-    /** Set the input and output of the kernel
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Multi-planar source image. Formats supported: NV12/NV21/IYUV
-     * @param[out] output          Multi-planar destination image. Formats supported: YUV444/IYUV (if the formats of @p input are NV12/NV21)/NV12 (if the format of  @p input is IYUV)
-     */
-    void configure(const CLCompileContext &compile_context, const ICLMultiImage *input, ICLMultiImage *output);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor     *_input;        /*pointer to single planar tensor input */
-    ICLTensor           *_output;       /*pointer to single planar tensor output */
-    const ICLMultiImage *_multi_input;  /*pointer to multi-planar input */
-    ICLMultiImage       *_multi_output; /*pointer to multi-planar output */
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLCOLORCONVERTKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLComparisonKernel.h b/arm_compute/core/CL/kernels/CLComparisonKernel.h
deleted file mode 100644
index d5c5297c61..0000000000
--- a/arm_compute/core/CL/kernels/CLComparisonKernel.h
+++ /dev/null
@@ -1,89 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLCOMPARISONKERNEL_H
-#define ARM_COMPUTE_CLCOMPARISONKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-// Forward declarations
-class ICLTensor;
-
-/** Interface for the comparison kernel. */
-class CLComparisonKernel : public ICLKernel
-{
-public:
-    /** Default constructor. */
-    CLComparisonKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLComparisonKernel(const CLComparisonKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLComparisonKernel &operator=(const CLComparisonKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLComparisonKernel(CLComparisonKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLComparisonKernel &operator=(CLComparisonKernel &&) = default;
-    /** Default destructor */
-    ~CLComparisonKernel() = default;
-    /** Set the inputs and output tensors
-     *
-     * @param[in]  input1    Source tensor. Data types supported: All.
-     * @param[in]  input2    Source tensor. Data types supported: Same as @p input1.
-     * @param[out] output    Destination tensor. Data types supported: U8.
-     * @param[in]  operation Comparison operation to use.
-     */
-    void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ComparisonOperation operation);
-    /** Set the inputs and output tensors
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input1          Source tensor. Data types supported: All.
-     * @param[in]  input2          Source tensor. Data types supported: Same as @p input1.
-     * @param[out] output          Destination tensor. Data types supported: U8.
-     * @param[in]  operation       Comparison operation to use.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ComparisonOperation operation);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLComparisonKernel
-     *
-     * @param[in] input1    Source tensor. Data types supported: All.
-     * @param[in] input2    Source tensor. Data types supported: Same as @p input1.
-     * @param[in] output    Destination tensor. Data types supported: U8.
-     * @param[in] operation Comparison operation to use.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ComparisonOperation operation);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-    BorderSize border_size() const override;
-
-private:
-    const ICLTensor *_input1; /**< Source tensor 1 */
-    const ICLTensor *_input2; /**< Source tensor 2 */
-    ICLTensor       *_output; /**< Destination tensor */
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLCOMPARISONKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLConvertFullyConnectedWeightsKernel.h b/arm_compute/core/CL/kernels/CLConvertFullyConnectedWeightsKernel.h
deleted file mode 100644
index d3e57a6738..0000000000
--- a/arm_compute/core/CL/kernels/CLConvertFullyConnectedWeightsKernel.h
+++ /dev/null
@@ -1,90 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLCONVERTFULLYCONNECTEDWEIGHTSKERNEL_H
-#define ARM_COMPUTE_CLCONVERTFULLYCONNECTEDWEIGHTSKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface to convert the 2D Fully Connected weights from NCHW to NHWC or vice versa.
- *
- * @note This function can be applied to the 2D weights used by a Fully Connected layer if:
- *       - It follows a Convolution layer
- *       - The data layout used by the network does not match the one the model has been trained in.
- *
- * @note This function assumes the weights are already reshaped (transposed)
- */
-class CLConvertFullyConnectedWeightsKernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLConvertFullyConnectedWeightsKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLConvertFullyConnectedWeightsKernel(const CLConvertFullyConnectedWeightsKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLConvertFullyConnectedWeightsKernel &operator=(const CLConvertFullyConnectedWeightsKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLConvertFullyConnectedWeightsKernel(CLConvertFullyConnectedWeightsKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLConvertFullyConnectedWeightsKernel &operator=(CLConvertFullyConnectedWeightsKernel &&) = default;
-    /** Default destructor */
-    ~CLConvertFullyConnectedWeightsKernel() = default;
-    /** Set the input and output tensor.
-     *
-     * @param[in]  input                Source weights tensor to convert. Must be 2 dimensional. Data types supported: All.
-     * @param[out] output               The converted weights tensor. Shape and Data Type: Same as @p input.
-     * @param[in]  original_input_shape Shape of the original input tensor (the one entering fully connected layer).
-     * @param[in]  data_layout          The data layout the weights have been trained in.
-     */
-    void configure(const ICLTensor *input, ICLTensor *output, const TensorShape &original_input_shape, DataLayout data_layout);
-    /** Set the input and output tensor.
-     *
-     * @param[in]  compile_context      The compile context to be used.
-     * @param[in]  input                Source weights tensor to convert. Must be 2 dimensional. Data types supported: All.
-     * @param[out] output               The converted weights tensor. Shape and Data Type: Same as @p input.
-     * @param[in]  original_input_shape Shape of the original input tensor (the one entering fully connected layer).
-     * @param[in]  data_layout          The data layout the weights have been trained in.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const TensorShape &original_input_shape, DataLayout data_layout);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLConvertFullyConnectedWeightsKernel
-     *
-     * @param[in] input                Source weights tensor info to convert. Must be 2 dimensional. Data types supported: All.
-     * @param[in] output               The converted weights tensor info. Shape and Data Type: Same as @p input.
-     * @param[in] original_input_shape Shape of the original input tensor (the one entering fully connected layer).
-     * @param[in] data_layout          The data layout the weights have been trained in.
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const TensorShape &original_input_shape, DataLayout data_layout);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_input;
-    ICLTensor       *_output;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLCONVERTFULLYCONNECTEDWEIGHTSKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLConvolutionKernel.h b/arm_compute/core/CL/kernels/CLConvolutionKernel.h
deleted file mode 100644
index b6fe51dbaa..0000000000
--- a/arm_compute/core/CL/kernels/CLConvolutionKernel.h
+++ /dev/null
@@ -1,224 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLCONVOLUTIONKERNEL_H
-#define ARM_COMPUTE_CLCONVOLUTIONKERNEL_H
-
-#include "arm_compute/core/CL/ICLSimple2DKernel.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/****************************************************************************************\
- *                                    Square Convolution                                *
-\****************************************************************************************/
-
-/** Interface for the kernel to run an arbitrary size convolution on a tensor. (Currently supports 3x3, 5x5, 7x7 and 9x9).
- * The client can supply a convolution matrix \f$ C_{m,n} \f$.
- * @f{eqnarray}{
- *  k_0 &=& \frac{m}{2}  \\
- *  l_0 &=& \frac{n}{2}  \\
- *  sum &=& \sum_{k=0,l=0}^{k=m-1,l=n-1} input(x+k-k_0, y+l-l_0) C_{k,l}
- *  @f}
- *
- * @note The above equation for this function is similar to the default OpenCV Filter2D function,
- *       which actually computes a correlation and not a convolution.
- *       In case of a real convolution the convolution matrix should be flipped both horizontally and vertically.
- */
-template <unsigned int matrix_size>
-class CLConvolutionKernel : public ICLSimple2DKernel
-{
-public:
-    /** Initialise the kernel's input, output and border mode.
-     *
-     * @param[in]  input            Source tensor. Data types supported: U8.
-     * @param[out] output           Destination tensor, Data types supported: U8, S16.
-     * @param[in]  conv             Convolution matrix to apply to the input tensor.
-     * @param[in]  scale            Scale of the convolution matrix. If 0 is passed, it will be set to the sum of the coefficients of the convolution or 1 if they add up to 0.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t scale, bool border_undefined);
-    /** Initialise the kernel's input, output and border mode.
-     *
-     * @param[in]  compile_context  The compile context to be used.
-     * @param[in]  input            Source tensor. Data types supported: U8.
-     * @param[out] output           Destination tensor, Data types supported: U8, S16.
-     * @param[in]  conv             Convolution matrix to apply to the input tensor.
-     * @param[in]  scale            Scale of the convolution matrix. If 0 is passed, it will be set to the sum of the coefficients of the convolution or 1 if they add up to 0.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t scale, bool border_undefined);
-
-    // Inherited methods overridden:
-    BorderSize border_size() const override;
-};
-
-/** Interface for the kernel which applies a 3x3 convolution to a tensor. */
-using CLConvolution3x3Kernel = CLConvolutionKernel<3>;
-/** Interface for the kernel which applies a 5x5 convolution to a tensor. */
-using CLConvolution5x5Kernel = CLConvolutionKernel<5>;
-/** Interface for the kernel which applies a 7x7 convolution to a tensor. */
-using CLConvolution7x7Kernel = CLConvolutionKernel<7>;
-/** Interface for the kernel which applies a 9x9 convolution to a tensor. */
-using CLConvolution9x9Kernel = CLConvolutionKernel<9>;
-
-/****************************************************************************************\
- *                              Separable Square Convolution                            *
-\****************************************************************************************/
-
-/** Kernel for the Horizontal pass of a Separable Convolution. Currently support 5x5, 7x7, 9x9 */
-template <unsigned int matrix_size>
-class CLSeparableConvolutionHorKernel : public ICLSimple2DKernel
-{
-public:
-    /** Default Constructor */
-    CLSeparableConvolutionHorKernel();
-    /** Initialise the kernel's input, output and border mode.
-     *
-     * @param[in]  input            Source tensor. Data types supported: U8.
-     * @param[out] output           Destination tensor, Data types supported: S16.
-     * @param[in]  conv             Convolution matrix to apply to the input tensor.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const ICLTensor *input, ICLTensor *output, const int16_t *conv, bool border_undefined);
-    /** Initialise the kernel's input, output and border mode.
-     *
-     * @param[in]  compile_context  The compile context to be used.
-     * @param[in]  input            Source tensor. Data types supported: U8.
-     * @param[out] output           Destination tensor, Data types supported: S16.
-     * @param[in]  conv             Convolution matrix to apply to the input tensor.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const int16_t *conv, bool border_undefined);
-
-    // Inherited methods overridden:
-    BorderSize border_size() const override;
-
-private:
-    BorderSize _border_size; /**< Border size */
-};
-
-/** Interface for the kernel which applies a horizontal pass of 5x5 convolution to a tensor. */
-using CLSeparableConvolution5x5HorKernel = CLSeparableConvolutionHorKernel<5>;
-/** Interface for the kernel which applies a horizontal pass of 7x7 convolution to a tensor. */
-using CLSeparableConvolution7x7HorKernel = CLSeparableConvolutionHorKernel<7>;
-/** Interface for the kernel which applies a horizontal pass of 9x9 convolution to a tensor. */
-using CLSeparableConvolution9x9HorKernel = CLSeparableConvolutionHorKernel<9>;
-
-/** Kernel for the Vertical pass of a Separable Convolution. Currently supports 5x5, 7x7, 9x9 */
-template <unsigned int matrix_size>
-class CLSeparableConvolutionVertKernel : public ICLSimple2DKernel
-{
-public:
-    /** Initialise the kernel's input, output and border mode.
-     *
-     * @param[in]  input            Source tensor. Data types supported: S16.
-     * @param[out] output           Destination tensor, Data types supported: U8, S16.
-     * @param[in]  conv             Convolution matrix to apply to the input tensor.
-     * @param[in]  scale            Scale of the convolution matrix.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     * @param[in]  data_type        Data type to use for intermeidate result. @sa data_type_for_convolution
-     */
-    void configure(const ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t scale, bool border_undefined, DataType data_type = DataType::S32);
-    /** Initialise the kernel's input, output and border mode.
-     *
-     * @param[in]  compile_context  The compile context to be used.
-     * @param[in]  input            Source tensor. Data types supported: S16.
-     * @param[out] output           Destination tensor, Data types supported: U8, S16.
-     * @param[in]  conv             Convolution matrix to apply to the input tensor.
-     * @param[in]  scale            Scale of the convolution matrix.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     * @param[in]  data_type        Data type to use for intermeidate result. @sa data_type_for_convolution
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t scale, bool border_undefined, DataType data_type = DataType::S32);
-
-    // Inherited methods overridden:
-    BorderSize border_size() const override;
-};
-
-/** Interface for the kernel which applies a vertical pass of 5x5 convolution to a tensor. */
-using CLSeparableConvolution5x5VertKernel = CLSeparableConvolutionVertKernel<5>;
-/** Interface for the kernel which applies a vertical pass of 7x7 convolution to a tensor. */
-using CLSeparableConvolution7x7VertKernel = CLSeparableConvolutionVertKernel<7>;
-/** Interface for the kernel which applies a vertical pass of 9x9 convolution to a tensor. */
-using CLSeparableConvolution9x9VertKernel = CLSeparableConvolutionVertKernel<9>;
-
-/****************************************************************************************\
- *                                 Rectangle Convolution                                *
-\****************************************************************************************/
-
-/** Kernel for the running convolution on a rectangle matrix.
- *
- * @note Supports combinations of 3,5,7 and 9.
- */
-class CLConvolutionRectangleKernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLConvolutionRectangleKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLConvolutionRectangleKernel(const CLConvolutionRectangleKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLConvolutionRectangleKernel &operator=(const CLConvolutionRectangleKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLConvolutionRectangleKernel(CLConvolutionRectangleKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLConvolutionRectangleKernel &operator=(CLConvolutionRectangleKernel &&) = default;
-    /** Initialise the kernel's input, output and border mode.
-     *
-     * @param[in]  input            Source tensor. Data types supported: U8.
-     * @param[out] output           Destination tensor, Data types supported: U8, S16.
-     * @param[in]  conv             Convolution matrix to apply to the input tensor.
-     * @param[in]  width            Width of convolution matrix (Number of columns)
-     * @param[in]  height           Height of convolution matrix (Number of rows)
-     * @param[in]  scale            Scale of the convolution matrix. If 0 is passed, it will be set to the sum of the coefficients of the convolution or 1 if they add up to 0.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t width, uint32_t height, uint32_t scale, bool border_undefined);
-    /** Initialise the kernel's input, output and border mode.
-     *
-     * @param[in]  compile_context  The compile context to be used.
-     * @param[in]  input            Source tensor. Data types supported: U8.
-     * @param[out] output           Destination tensor, Data types supported: U8, S16.
-     * @param[in]  conv             Convolution matrix to apply to the input tensor.
-     * @param[in]  width            Width of convolution matrix (Number of columns)
-     * @param[in]  height           Height of convolution matrix (Number of rows)
-     * @param[in]  scale            Scale of the convolution matrix. If 0 is passed, it will be set to the sum of the coefficients of the convolution or 1 if they add up to 0.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t width, uint32_t height, uint32_t scale, bool border_undefined);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-    BorderSize border_size() const override;
-
-private:
-    BorderSize       _border_size;
-    const ICLTensor *_input;
-    ICLTensor       *_output;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLCONVOLUTIONKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLCopyKernel.h b/arm_compute/core/CL/kernels/CLCopyKernel.h
deleted file mode 100644
index 05dff8ed0c..0000000000
--- a/arm_compute/core/CL/kernels/CLCopyKernel.h
+++ /dev/null
@@ -1,86 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLCOPYKERNEL_H
-#define ARM_COMPUTE_CLCOPYKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel to perform a copy between two tensors */
-class CLCopyKernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLCopyKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers). */
-    CLCopyKernel(const CLCopyKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers). */
-    CLCopyKernel &operator=(const CLCopyKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLCopyKernel(CLCopyKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLCopyKernel &operator=(CLCopyKernel &&) = default;
-    /** Initialize the kernel's input, output.
-     *
-     * @param[in]  input         Source tensor. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
-     * @param[out] output        Destination tensor. Data types supported: same as @p input.
-     * @param[in]  padding       (Optional) Padding to be applied to the input tensor
-     * @param[in]  output_window (Optional) Window to be used in case only copying into part of a tensor. Default is nullptr.
-     */
-    void configure(const ICLTensor *input, ICLTensor *output, const PaddingList &padding = PaddingList(), Window *output_window = nullptr);
-    /** Initialize the kernel's input, output.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Source tensor. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
-     * @param[out] output          Destination tensor. Data types supported: same as @p input.
-     * @param[in]  padding         (Optional) Padding to be applied to the input tensor
-     * @param[in]  output_window   (Optional) Window to be used in case only copying into part of a tensor. Default is nullptr.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const PaddingList &padding = PaddingList(), Window *output_window = nullptr);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLCopyKernel
-     *
-     * @param[in] input         Source tensor info. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
-     * @param[in] output        Destination tensor info. Data types supported: same as @p input.
-     * @param[in] padding       (Optional) Padding to be applied to the input tensor
-     * @param[in] output_window (Optional) Window to be used in case only copying into part of a tensor. Default is nullptr.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding = PaddingList(), Window *output_window = nullptr);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_input;
-    ICLTensor       *_output;
-    Window           _output_window;
-    bool             _has_output_window;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLCOPYKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLCropKernel.h b/arm_compute/core/CL/kernels/CLCropKernel.h
deleted file mode 100644
index cbc2338940..0000000000
--- a/arm_compute/core/CL/kernels/CLCropKernel.h
+++ /dev/null
@@ -1,103 +0,0 @@
-/*
- * Copyright (c) 2019-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLCROPKERNEL_H
-#define ARM_COMPUTE_CLCROPKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel to perform a copy between two tensors */
-class CLCropKernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLCropKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers). */
-    CLCropKernel(const CLCropKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers). */
-    CLCropKernel &operator=(const CLCropKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLCropKernel(CLCropKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLCropKernel &operator=(CLCropKernel &&) = default;
-    /** Configure kernel
-     *
-     * @note Supported tensor rank: up to 4
-     *
-     * @param[in]  input               Source tensor. Data type supported: All. Data layouts supported: NHWC.
-     * @param[out] output              Destination tensor. Data type supported: F32
-     * @param[in]  start               Coordinates of where to start cropping the image.
-     * @param[in]  end                 Coordinates of where to end cropping the image.
-     * @param[in]  batch_index         Fourth dimension index of the 3D image to crop in @p input.
-     * @param[in]  extrapolation_value Value to be used for values outside of the image. Default is 0.
-     * @param[in]  output_window       Output window to be used in case cropped image is being copied into a tensor. Default is nullptr.
-     */
-    void configure(const ICLTensor *input, ICLTensor *output, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value = 0, Window *output_window = nullptr);
-    /** Configure kernel
-     *
-     * @note Supported tensor rank: up to 4
-     *
-     * @param[in]  compile_context     The compile context to be used.
-     * @param[in]  input               Source tensor. Data type supported: All. Data layouts supported: NHWC.
-     * @param[out] output              Destination tensor. Data type supported: F32
-     * @param[in]  start               Coordinates of where to start cropping the image.
-     * @param[in]  end                 Coordinates of where to end cropping the image.
-     * @param[in]  batch_index         Fourth dimension index of the 3D image to crop in @p input.
-     * @param[in]  extrapolation_value Value to be used for values outside of the image. Default is 0.
-     * @param[in]  output_window       Output window to be used in case cropped image is being copied into a tensor. Default is nullptr.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value = 0,
-                   Window *output_window = nullptr);
-
-    /** Static function to check if given info will lead to a valid configuration of @ref CLStridedSliceKernel
-     *
-     * @note Supported tensor rank: up to 4
-     *
-     * @param[in] input               Source tensor info. Data type supported: All. Data layouts supported: NHWC.
-     * @param[in] output              Destination tensor info. Data type supported: F32
-     * @param[in] start               Coordinates of where to start cropping the image.
-     * @param[in] end                 Coordinates of where to end cropping the image.
-     * @param[in] batch_index         Fourth dimension index of the 3D image to crop in @p input.
-     * @param[in] extrapolation_value Value to be used for values outside of the image. Default is 0.
-     * @param[in] output_window       Output window to be used in case cropped image is being copied into a tensor. Default is nullptr.
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value = 0,
-                           Window *output_window = nullptr);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_input;
-    ICLTensor       *_output;
-    Coordinates2D    _start;
-    uint32_t         _batch_index;
-    float            _extrapolation_value;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLCROPKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h b/arm_compute/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h
deleted file mode 100644
index 0c65f519cc..0000000000
--- a/arm_compute/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h
+++ /dev/null
@@ -1,86 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLDECONVOLUTIONLAYERUPSAMPLEKERNEL_H
-#define ARM_COMPUTE_CLDECONVOLUTIONLAYERUPSAMPLEKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the Deconvolution layer kernel on OpenCL.
- */
-class CLDeconvolutionLayerUpsampleKernel : public ICLKernel
-{
-public:
-    /** Constructor */
-    CLDeconvolutionLayerUpsampleKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLDeconvolutionLayerUpsampleKernel(const CLDeconvolutionLayerUpsampleKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLDeconvolutionLayerUpsampleKernel &operator=(const CLDeconvolutionLayerUpsampleKernel &) = delete;
-    /** Default Move Constructor. */
-    CLDeconvolutionLayerUpsampleKernel(CLDeconvolutionLayerUpsampleKernel &&) = default;
-    /** Default move assignment operator */
-    CLDeconvolutionLayerUpsampleKernel &operator=(CLDeconvolutionLayerUpsampleKernel &&) = default;
-    /** Default destructor */
-    ~CLDeconvolutionLayerUpsampleKernel() = default;
-
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  input  Source tensor. Data types supported: All.
-     * @param[out] output Destination tensor. Data types supported: same as @p input. All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
-     * @param[in]  info   Contains padding and stride information described in @ref PadStrideInfo.
-     */
-    void configure(const ICLTensor *input, ICLTensor *output, const PadStrideInfo &info);
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Source tensor. Data types supported: All.
-     * @param[out] output          Destination tensor. Data types supported: same as @p input. All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
-     * @param[in]  info            Contains padding and stride information described in @ref PadStrideInfo.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const PadStrideInfo &info);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLDeconvolutionLayerUpsample
-     *
-     * @param[in] input  Source tensor info. Data types supported: All.
-     * @param[in] output Destination tensor info. Data types supported: same as @p input. All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
-     * @param[in] info   Contains padding and stride information described in @ref PadStrideInfo.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const PadStrideInfo &info);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_input;
-    ICLTensor       *_output;
-    PadStrideInfo    _info;
-    DataLayout       _data_layout;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLDECONVOLUTIONLAYERUPSAMPLEKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h b/arm_compute/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h
deleted file mode 100644
index 292c561e46..0000000000
--- a/arm_compute/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h
+++ /dev/null
@@ -1,106 +0,0 @@
-/*
- * Copyright (c) 2019-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLDECONVOLUTIONLAYERRESHAPEOUTPUTKERNEL_H
-#define ARM_COMPUTE_CLDECONVOLUTIONLAYERRESHAPEOUTPUTKERNEL_H
-
-#include "arm_compute/core/CL/ICLSimpleKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the OpenCL kernel to be used for reshaping the tensor before returning the result of deconvolution.
- *
- * The input tensor to this OpenCL kernel is expected to be the result of a @ref CLGEMM operation between the Deconvolution input and the Deconvolution filter.
- *
- * The input tensor should have the following shape: [filter_width * filter_height * ofms, width, height, batch_size]
- *
- * The output tensor should have the following shape: [stride_x * (input_width - 1) + filter_width - 2 * padx, stride_y * (input_height - 1) + filter_height - 2 * pady, ofms, batch_size]
- *
- * For example, given a tensor with dimensions [4, 2, 2] this function returns a tensor with dimensions [1, 4, 4].
- *
- */
-class CLDeconvolutionReshapeOutputKernel : public ICLSimpleKernel
-{
-public:
-    /** Default constructor */
-    CLDeconvolutionReshapeOutputKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLDeconvolutionReshapeOutputKernel(const CLDeconvolutionReshapeOutputKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLDeconvolutionReshapeOutputKernel &operator=(const CLDeconvolutionReshapeOutputKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLDeconvolutionReshapeOutputKernel(CLDeconvolutionReshapeOutputKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLDeconvolutionReshapeOutputKernel &operator=(CLDeconvolutionReshapeOutputKernel &&) = default;
-    /** Default destructor */
-    ~CLDeconvolutionReshapeOutputKernel() = default;
-
-    /** Initialise the kernel's source and destination.
-     *
-     * @param[in]  input        Input tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/S32/F16/F32.
-     * @param[in]  bias         Bias tensor to be added directly during the reshape operation. Supported data types: same as @p input.  Supported data layouts: same as @p input.
-     * @param[out] output       Output tensor with the following shape: [stride_x * (input_width - 1) + filter_width - 2 * padx, stride_y * (input_height - 1) + filter_height - 2 * pady, ofms, batch_size]
-     *                          Supported data types: same as @p input.  Supported data layouts: same as @p input.
-     * @param[in]  input_info   Deconvolution input tensor info. Supported data types: same as @p input.  Supported data layouts: same as @p input.
-     * @param[in]  weights_info Deconvolution weights tensor info. Supported data types: same as @p input.  Supported data layouts: same as @p input.
-     * @param[in]  deconv_info  Contains padding and policies to be used in the deconvolution, this is described in @ref PadStrideInfo. This kernel supports only stride_x = weights.width && stride_y = weights.height. Moreover, padding is not supported.
-     */
-    void configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const ITensorInfo *input_info, const ITensorInfo *weights_info, const PadStrideInfo &deconv_info);
-    /** Initialise the kernel's source and destination.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Input tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/S32/F16/F32.
-     * @param[in]  bias            Bias tensor to be added directly during the reshape operation. Supported data types: same as @p input.  Supported data layouts: same as @p input.
-     * @param[out] output          Output tensor with the following shape: [stride_x * (input_width - 1) + filter_width - 2 * padx, stride_y * (input_height - 1) + filter_height - 2 * pady, ofms, batch_size]
-     *                             Supported data types: same as @p input.  Supported data layouts: same as @p input.
-     * @param[in]  input_info      Deconvolution input tensor info. Supported data types: same as @p input.  Supported data layouts: same as @p input.
-     * @param[in]  weights_info    Deconvolution weights tensor info. Supported data types: same as @p input.  Supported data layouts: same as @p input.
-     * @param[in]  deconv_info     Contains padding and policies to be used in the deconvolution, this is described in @ref PadStrideInfo. This kernel supports only stride_x = weights.width && stride_y = weights.height. Moreover, padding is not supported.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const ITensorInfo *input_info, const ITensorInfo *weights_info,
-                   const PadStrideInfo &deconv_info);
-
-    /** Static function to check if given info will lead to a valid configuration of @ref  CLDeconvolutionReshapeOutputKernel.
-     *
-     * @param[in] input        GEMM output tensor info to be reshaped. Supported data types: QASYMM8/QASYMM8_SIGNED/S32/F16/F32.
-     * @param[in] bias         (Optional) Optional bias tensor info to be added directly during the reshape operation. Supported data types: same as @p input.  Supported data layouts: same as @p input.
-     * @param[in] output       Reshaped output tensor info. Supported data types: same as @p input.  Supported data layouts: same as @p input.
-     * @param[in] input_info   Original input tensor info. Supported data types: same as @p input.  Supported data layouts: same as @p input.
-     * @param[in] weights_info Original weights tensor info output. Supported data types: same as @p input.  Supported data layouts: same as @p input.
-     * @param[in] deconv_info  Contains padding and policies to be used in the deconvolution, this is described in @ref PadStrideInfo. This kernel supports only stride_x = weights.width && stride_y = weights.height. Moreover, padding is not supported.
-     *
-     * @return a Status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const ITensorInfo *input_info, const ITensorInfo *weights_info, const PadStrideInfo &deconv_info);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    bool             _add_bias;
-    const ICLTensor *_bias;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLDECONVOLUTIONLAYERRESHAPEOUTPUTKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLDepthConcatenateLayerKernel.h b/arm_compute/core/CL/kernels/CLDepthConcatenateLayerKernel.h
deleted file mode 100644
index 5fe826d090..0000000000
--- a/arm_compute/core/CL/kernels/CLDepthConcatenateLayerKernel.h
+++ /dev/null
@@ -1,95 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef ARM_COMPUTE_CLDEPTHCONCATENATEKERNEL_H
-#define ARM_COMPUTE_CLDEPTHCONCATENATEKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the depth concatenate kernel.
- *  The input tensor will be concatenated into the output tensor.
- */
-class CLDepthConcatenateLayerKernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLDepthConcatenateLayerKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLDepthConcatenateLayerKernel(const CLDepthConcatenateLayerKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLDepthConcatenateLayerKernel &operator=(const CLDepthConcatenateLayerKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLDepthConcatenateLayerKernel(CLDepthConcatenateLayerKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLDepthConcatenateLayerKernel &operator=(CLDepthConcatenateLayerKernel &&) = default;
-    /** Default destructor */
-    ~CLDepthConcatenateLayerKernel() = default;
-    /** Initialise the kernel's inputs and output
-     *
-     * @param[in]     input        Input tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
-     * @param[in]     depth_offset The offset on the Z axis.
-     * @param[in,out] output       Output tensor. Data types supported: Same as @p input.
-     *
-     * @note: The output tensor's low two dimensions can't be smaller than the input one's.
-     * @note: The gaps between the two lowest dimensions of input and output need to be divisible by 2.
-     *
-     */
-    void configure(const ICLTensor *input, unsigned int depth_offset, ICLTensor *output);
-    /** Initialise the kernel's inputs and output
-     *
-     * @param[in]     compile_context The compile context to be used.
-     * @param[in]     input           Input tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
-     * @param[in]     depth_offset    The offset on the Z axis.
-     * @param[in,out] output          Output tensor. Data types supported: Same as @p input.
-     *
-     * @note: The output tensor's low two dimensions can't be smaller than the input one's.
-     * @note: The gaps between the two lowest dimensions of input and output need to be divisible by 2.
-     *
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, unsigned int depth_offset, ICLTensor *output);
-    /**  Static function to check if given info will lead to a valid configuration of @ref CLDepthConcatenateLayerKernel
-     *
-     * @param[in] input        Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32
-     * @param[in] depth_offset The offset on the Z axis.
-     * @param[in] output       Output tensor info. Data types supported: Same as @p input.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, unsigned int depth_offset, const ITensorInfo *output);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_input;
-    ICLTensor       *_output;
-    unsigned int     _depth_offset;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLDEPTHCONCATENATEKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLDepthConvertLayerKernel.h b/arm_compute/core/CL/kernels/CLDepthConvertLayerKernel.h
deleted file mode 100644
index 66eb6222b2..0000000000
--- a/arm_compute/core/CL/kernels/CLDepthConvertLayerKernel.h
+++ /dev/null
@@ -1,91 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLDEPTHCONVERTKERNEL_H
-#define ARM_COMPUTE_CLDEPTHCONVERTKERNEL_H
-
-#include "arm_compute/core/CL/ICLSimple3DKernel.h"
-#include "arm_compute/core/Types.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the depth conversion kernel. */
-class CLDepthConvertLayerKernel : public ICLSimple3DKernel
-{
-public:
-    /** Set the input and output of the kernel.
-     *
-     * Valid conversions Input -> Output :
-     *
-     *   - QSYMM8_PER_CHANNEL -> QASYMM8 (ATTENTION: it is the user's responsibility to keep track of the quantization info in the TensorInfo meta-data)
-     *   - U8  -> S8, U16, S16, U32, S32, F16, F32
-     *   - U16 -> U8, S8, S16, U32, S32, F16, F32
-     *   - S16 -> U8, S8, U16, U32, S32, F16, F32
-     *   - U32 -> U8, S8, U16, S16, S32, F16, F32
-     *   - S32 -> U8, S8, U16, S16, U32, F16, F32
-     *   - F16 -> U8, S8, U16, S16, U32, F32
-     *   - F32 -> U8, S8, U16, S16, U32, F16
-     *
-     * @param[in]  input  The input tensor to convert. Data types supported: U8/S8/QSYMM8_PER_CHANNEL/U16/S16/U32/S32/F16/F32.
-     * @param[out] output The output tensor. Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32.
-     * @param[in]  policy Conversion policy
-     * @param[in]  shift  Value for down/up conversions. Must be 0 <= shift < 8.
-     */
-    void configure(const ICLTensor *input, ICLTensor *output, ConvertPolicy policy, uint32_t shift);
-    /** Set the input and output of the kernel.
-     *
-     * Valid conversions Input -> Output :
-     *
-     *   - QSYMM8_PER_CHANNEL -> QASYMM8 (ATTENTION: it is the user's responsibility to keep track of the quantization info in the TensorInfo meta-data)
-     *   - U8  -> S8, U16, S16, U32, S32, F16, F32
-     *   - U16 -> U8, S8, S16, U32, S32, F16, F32
-     *   - S16 -> U8, S8, U16, U32, S32, F16, F32
-     *   - U32 -> U8, S8, U16, S16, S32, F16, F32
-     *   - S32 -> U8, S8, U16, S16, U32, F16, F32
-     *   - F16 -> U8, S8, U16, S16, U32, F32
-     *   - F32 -> U8, S8, U16, S16, U32, F16
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           The input tensor to convert. Data types supported: U8/S8/QSYMM8_PER_CHANNEL/U16/S16/U32/S32/F16/F32.
-     * @param[out] output          The output tensor. Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32.
-     * @param[in]  policy          Conversion policy
-     * @param[in]  shift           Value for down/up conversions. Must be 0 <= shift < 8.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, ConvertPolicy policy, uint32_t shift);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLDepthConvertLayerKernel
-     *
-     * @param[in] input  Source tensor info. Data types supported: U8/S8/QSYMM8_PER_CHANNEL/U16/S16/U32/S32/F16/F32.
-     * @param[in] output Destination tensor info. Data type supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32.
-     * @param[in] policy Conversion policy
-     * @param[in] shift  Value for down/up conversions. Must be 0 <= shift < 8.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy, uint32_t shift);
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLDEPTHCONVERTKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLDepthToSpaceLayerKernel.h b/arm_compute/core/CL/kernels/CLDepthToSpaceLayerKernel.h
deleted file mode 100644
index 87ac3c1ec1..0000000000
--- a/arm_compute/core/CL/kernels/CLDepthToSpaceLayerKernel.h
+++ /dev/null
@@ -1,84 +0,0 @@
-/*
- * Copyright (c) 2019-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLDEPTHTOSPACELAYERKERNEL_H
-#define ARM_COMPUTE_CLDEPTHTOSPACELAYERKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the depth to space kernel */
-class CLDepthToSpaceLayerKernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLDepthToSpaceLayerKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLDepthToSpaceLayerKernel(const CLDepthToSpaceLayerKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLDepthToSpaceLayerKernel &operator=(const CLDepthToSpaceLayerKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLDepthToSpaceLayerKernel(CLDepthToSpaceLayerKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLDepthToSpaceLayerKernel &operator=(CLDepthToSpaceLayerKernel &&) = default;
-    /** Default destructor */
-    ~CLDepthToSpaceLayerKernel() = default;
-    /** Initialise the kernel's inputs and output.
-     *
-     * @param[in]  input       Tensor input. Supported tensor rank: 4. Data types supported: All.
-     * @param[out] output      Tensor output. Data types supported: same as @p input
-     * @param[in]  block_shape Block shape value.
-     */
-    void configure(const ICLTensor *input, ICLTensor *output, int32_t block_shape);
-    /** Initialise the kernel's inputs and output.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Tensor input. Supported tensor rank: 4. Data types supported: All.
-     * @param[out] output          Tensor output. Data types supported: same as @p input
-     * @param[in]  block_shape     Block shape value.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t block_shape);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLDepthToSpaceLayerKernel.
-     *
-     * @param[in] input       Tensor input info. Supported tensor rank: 4. Data types supported: All.
-     * @param[in] output      Tensor output info. Data types supported: same as @p input
-     * @param[in] block_shape Block shape value.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_input;       /**< Source tensor */
-    ICLTensor       *_output;      /**< Destination tensor */
-    int32_t          _block_shape; /**< Block shape */
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLDEPTHTOSPACELAYERKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.h b/arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.h
deleted file mode 100644
index 6cf0326467..0000000000
--- a/arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.h
+++ /dev/null
@@ -1,114 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLDEPTHWISECONVOLUTIONNCHWKERNEL3x3_H
-#define ARM_COMPUTE_CLDEPTHWISECONVOLUTIONNCHWKERNEL3x3_H
-
-#include "arm_compute/core/CL/kernels/ICLDepthwiseConvolutionLayer3x3Kernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the kernel to run a 3x3 depthwise convolution on a tensor when the data layout is NCHW.
- */
-class CLDepthwiseConvolutionLayer3x3NCHWKernel : public ICLDepthwiseConvolutionLayer3x3Kernel
-{
-public:
-    /** Default constructor */
-    CLDepthwiseConvolutionLayer3x3NCHWKernel();
-    /** Initialize the function's source, destination, conv and border_size.
-     *
-     * @param[in]  input              Source tensor. DataType supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
-     * @param[in]  weights            Weights tensor. A 3D tensor with dimensions [3, 3, IFM].
-     *                                Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8/QASYMM8_SIGNED.
-     * @param[in]  biases             Biases tensor. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed.
-     *                                Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
-     * @param[out] output             Destination tensor. Data type supported: Same as @p input.
-     * @param[in]  conv_info          Padding and stride information to use for the convolution.
-     * @param[in]  depth_multiplier   (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
-     * @param[in]  act_info           (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU for QASYMM8 supported.
-     * @param[in]  dilation           (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
-     * @param[in]  output_multipliers (Optional) Output multipliers tensor for quantized computations. In case of per-channel quantization,
-     *                                the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
-     * @param[in]  output_shifts      (Optional) Output shifts tensor for quantized computations. In case of per-channel quantization,
-     *                                the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
-     */
-    void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
-                   unsigned int depth_multiplier = 1, ActivationLayerInfo act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U),
-                   const ICLTensor *output_multipliers = nullptr, const ICLTensor *output_shifts = nullptr) override;
-    /** Initialize the function's source, destination, conv and border_size.
-     *
-     * @param[in]  compile_context    The compile context to be used.
-     * @param[in]  input              Source tensor. DataType supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
-     * @param[in]  weights            Weights tensor. A 3D tensor with dimensions [3, 3, IFM].
-     *                                Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8/QASYMM8_SIGNED.
-     * @param[in]  biases             Biases tensor. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed.
-     *                                Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
-     * @param[out] output             Destination tensor. Data type supported: Same as @p input.
-     * @param[in]  conv_info          Padding and stride information to use for the convolution.
-     * @param[in]  depth_multiplier   (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
-     * @param[in]  act_info           (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU for QASYMM8 supported.
-     * @param[in]  dilation           (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
-     * @param[in]  output_multipliers (Optional) Output multipliers tensor for quantized computations. In case of per-channel quantization,
-     *                                the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
-     * @param[in]  output_shifts      (Optional) Output shifts tensor for quantized computations. In case of per-channel quantization,
-     *                                the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
-                   unsigned int depth_multiplier = 1, ActivationLayerInfo act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U),
-                   const ICLTensor *output_multipliers = nullptr, const ICLTensor *output_shifts = nullptr) override;
-    /** Static function to check if given info will lead to a valid configuration of @ref CLDepthwiseConvolutionLayer3x3NCHWKernel
-     *
-     * @param[in] input              Source tensor info. DataType supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
-     * @param[in] weights            Weights tensor info. A 3D tensor with dimensions [3, 3, IFM].
-     *                               Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8/QASYMM8_SIGNED.
-     * @param[in] biases             Biases tensor info. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed.
-     *                               Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
-     * @param[in] output             Destination tensor. Data type supported: Same as @p input.
-     * @param[in] conv_info          Padding and stride information to use for the convolution.
-     * @param[in] depth_multiplier   (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
-     * @param[in] act_info           (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU are supported.
-     * @param[in] gpu_target         (Optional) GPU target to validate the kernel for. Defaults to midgard.
-     * @param[in] dilation           (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
-     * @param[in] output_multipliers (Optional) Output multipliers tensor info for quantized computations. In case of per-channel quantization,
-     *                               the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
-     * @param[in] output_shifts      (Optional) Output shifts tensor for quantized computations. In case of per-channel quantization,
-     *                               the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
-                           unsigned int depth_multiplier = 1, ActivationLayerInfo act_info = ActivationLayerInfo(), GPUTarget gpu_target = GPUTarget::MIDGARD,
-                           const Size2D &dilation = Size2D(1U, 1U), const ITensorInfo *output_multipliers = nullptr, const ITensorInfo *output_shifts = nullptr);
-
-    void run(const Window &window, cl::CommandQueue &queue) override;
-    BorderSize border_size() const override;
-
-private:
-    unsigned int _conv_stride_x;
-    unsigned int _conv_pad_top;
-    unsigned int _conv_pad_left;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLDEPTHWISECONVOLUTIONNCHWKERNEL3x3_H */
diff --git a/arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.h b/arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.h
deleted file mode 100644
index e564cf6fe0..0000000000
--- a/arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.h
+++ /dev/null
@@ -1,114 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLDEPTHWISECONVOLUTIONNHWCKERNEL3x3_H
-#define ARM_COMPUTE_CLDEPTHWISECONVOLUTIONNHWCKERNEL3x3_H
-
-#include "arm_compute/core/CL/kernels/ICLDepthwiseConvolutionLayer3x3Kernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the kernel to run a 3x3 depthwise convolution on a tensor when the data layout is NHWC.
- */
-class CLDepthwiseConvolutionLayer3x3NHWCKernel : public ICLDepthwiseConvolutionLayer3x3Kernel
-{
-public:
-    /** Default constructor */
-    CLDepthwiseConvolutionLayer3x3NHWCKernel();
-    /** Default move assignment operator. */
-    /** Initialize the function's source, destination, conv and border_size.
-     *
-     * @param[in]  input              Source tensor. DataType supported: QASYMM8/QASYMM8_SIGNED.
-     * @param[in]  weights            Weights tensor. A 3D tensor with dimensions [IFM, 3, 3].
-     *                                Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8/QASYMM8_SIGNED.
-     * @param[in]  biases             Biases tensor. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed.
-     *                                Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
-     * @param[out] output             Destination tensor. Data type supported: Same as @p input.
-     * @param[in]  conv_info          Padding and stride information to use for the convolution.
-     * @param[in]  depth_multiplier   (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
-     * @param[in]  act_info           (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU are supported.
-     * @param[in]  dilation           (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
-     * @param[in]  output_multipliers (Optional) Output multipliers tensor for quantized computations. In case of per-channel quantization,
-     *                                the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
-     * @param[in]  output_shifts      (Optional) Output shifts tensor for quantized computations. In case of per-channel quantization,
-     *                                the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
-     */
-    void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
-                   unsigned int depth_multiplier = 1, ActivationLayerInfo act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U),
-                   const ICLTensor *output_multipliers = nullptr, const ICLTensor *output_shifts = nullptr) override;
-    /** Initialize the function's source, destination, conv and border_size.
-     *
-     * @param[in]  compile_context    The compile context to be used.
-     * @param[in]  input              Source tensor. DataType supported: QASYMM8/QASYMM8_SIGNED.
-     * @param[in]  weights            Weights tensor. A 3D tensor with dimensions [IFM, 3, 3].
-     *                                Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8/QASYMM8_SIGNED.
-     * @param[in]  biases             Biases tensor. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed.
-     *                                Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
-     * @param[out] output             Destination tensor. Data type supported: Same as @p input.
-     * @param[in]  conv_info          Padding and stride information to use for the convolution.
-     * @param[in]  depth_multiplier   (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
-     * @param[in]  act_info           (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU are supported.
-     * @param[in]  dilation           (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
-     * @param[in]  output_multipliers (Optional) Output multipliers tensor for quantized computations. In case of per-channel quantization,
-     *                                the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
-     * @param[in]  output_shifts      (Optional) Output shifts tensor for quantized computations. In case of per-channel quantization,
-     *                                the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
-                   unsigned int depth_multiplier = 1, ActivationLayerInfo act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U),
-                   const ICLTensor *output_multipliers = nullptr, const ICLTensor *output_shifts = nullptr) override;
-    /** Static function to check if given info will lead to a valid configuration of @ref CLDepthwiseConvolutionLayer3x3NHWCKernel
-     *
-     * @param[in] input              Source tensor info. DataType supported: QASYMM8/QASYMM8_SIGNED.
-     * @param[in] weights            Weights tensor info. A 3D tensor with dimensions [IFM, 3, 3].
-     *                               Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8/QASYMM8_SIGNED.
-     * @param[in] biases             Biases tensor info. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed.
-     *                               Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
-     * @param[in] output             Destination tensor info. Data type supported: Same as @p input.
-     * @param[in] conv_info          Padding and stride information to use for the convolution.
-     * @param[in] depth_multiplier   (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
-     * @param[in] act_info           (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU are supported.
-     * @param[in] dilation           (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
-     * @param[in] output_multipliers (Optional) Output multipliers tensor info for quantized computations. In case of per-channel quantization,
-     *                               the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
-     * @param[in] output_shifts      (Optional) Output shifts tensor for quantized computations. In case of per-channel quantization,
-     *                               the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
-                           unsigned int depth_multiplier = 1, ActivationLayerInfo act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U),
-                           const ITensorInfo *output_multipliers = nullptr, const ITensorInfo *output_shifts = nullptr);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-    BorderSize border_size() const override;
-
-private:
-    unsigned int _num_rows_processed_per_iteration;
-    unsigned int _num_planes_processed_per_iteration;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLDEPTHWISECONVOLUTIONNHWCKERNEL3x3_H */
diff --git a/arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h b/arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h
deleted file mode 100644
index 8847cf9c46..0000000000
--- a/arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h
+++ /dev/null
@@ -1,131 +0,0 @@
-/*
- * Copyright (c) 2019-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLDEPTHWISECONVOLUTIONLAYERNATIVEKERNEL_H
-#define ARM_COMPUTE_CLDEPTHWISECONVOLUTIONLAYERNATIVEKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-#include "arm_compute/core/KernelDescriptors.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the kernel to run a MxN depthwise convolution. M and N are respectively the rows and columns of the filter
-    This kernel assumes that tensor for the weights is NOT reshaped (Native version) */
-class CLDepthwiseConvolutionLayerNativeKernel : public ICLKernel
-{
-public:
-    /** Default Constructor */
-    CLDepthwiseConvolutionLayerNativeKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLDepthwiseConvolutionLayerNativeKernel(const CLDepthwiseConvolutionLayerNativeKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLDepthwiseConvolutionLayerNativeKernel &operator=(const CLDepthwiseConvolutionLayerNativeKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLDepthwiseConvolutionLayerNativeKernel(CLDepthwiseConvolutionLayerNativeKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLDepthwiseConvolutionLayerNativeKernel &operator=(CLDepthwiseConvolutionLayerNativeKernel &&) = default;
-    /** Initialize the function's source, destination and parameters
-     *
-     * @param[in]  input              Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/FP32/FP16. Data layout supported: NHWC
-     * @param[in]  weights            Weights tensor. A 3D tensor with dimensions [IFM, N, M].
-     *                                Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8.
-     * @param[in]  biases             Biases tensor. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed.
-     *                                Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
-     * @param[out] output             Destination tensor. Data type supported: Same as @p input.
-     * @param[in]  dwc_weights_info   Depthwise convolution layer weights info to retrieve the number of output elements processed by each thread
-     * @param[in]  dwc_info           Depthwise convolution layer info
-     * @param[in]  conv_info          Padding and stride information to use for the convolution.
-     * @param[in]  depth_multiplier   (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
-     * @param[in]  dilation           (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
-     * @param[in]  output_multipliers (Optional) Output multipliers tensor for quantized computations. In case of per-channel quantization,
-     *                                the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
-     * @param[in]  output_shifts      (Optional) Output shifts tensor for quantized computations. In case of per-channel quantization,
-     *                                the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
-     */
-    void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const DWCWeightsKernelInfo &dwc_weights_info,
-                   const DWCKernelInfo &dwc_info, const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1, const Size2D &dilation = Size2D(1U, 1U),
-                   const ICLTensor *output_multipliers = nullptr, const ICLTensor *output_shifts = nullptr);
-    /** Initialize the function's source, destination and parameters
-     *
-     * @param[in]  compile_context    The compile context to be used.
-     * @param[in]  input              Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/FP32/FP16. Data layout supported: NHWC
-     * @param[in]  weights            Weights tensor. A 3D tensor with dimensions [IFM, N, M].
-     *                                Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8.
-     * @param[in]  biases             Biases tensor. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed.
-     *                                Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
-     * @param[out] output             Destination tensor. Data type supported: Same as @p input.
-     * @param[in]  dwc_weights_info   Depthwise convolution layer weights info to retrieve the number of output elements processed by each thread
-     * @param[in]  dwc_info           Depthwise convolution layer info
-     * @param[in]  conv_info          Padding and stride information to use for the convolution.
-     * @param[in]  depth_multiplier   (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
-     * @param[in]  dilation           (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
-     * @param[in]  output_multipliers (Optional) Output multipliers tensor for quantized computations. In case of per-channel quantization,
-     *                                the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
-     * @param[in]  output_shifts      (Optional) Output shifts tensor for quantized computations. In case of per-channel quantization,
-     *                                the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const DWCWeightsKernelInfo &dwc_weights_info,
-                   const DWCKernelInfo &dwc_info, const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1, const Size2D &dilation = Size2D(1U, 1U),
-                   const ICLTensor *output_multipliers = nullptr, const ICLTensor *output_shifts = nullptr);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLDepthwiseConvolutionLayerNativeKernel
-     *
-     * @param[in] input              Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/FP32/FP16. Data layout supported: NHWC
-     * @param[in] weights            Weights tensor info. A 3D tensor with dimensions [IFM, N, M].
-     *                               Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8.
-     * @param[in] biases             Biases tensor info. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed.
-     *                               Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
-     * @param[in] output             Destination tensor info. Data type supported: Same as @p input.
-     * @param[in] dwc_weights_info   Depthwise convolution layer weights info to retrieve the number of output elements processed by each thread
-     * @param[in] dwc_info           Depthwise convolution layer info
-     * @param[in] conv_info          Padding and stride information to use for the convolution.
-     * @param[in] depth_multiplier   (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
-     * @param[in] dilation           (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
-     * @param[in] output_multipliers (Optional) Output multipliers tensor for quantized computations. In case of per-channel quantization,
-     *                               the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
-     * @param[in] output_shifts      (Optional) Output shifts tensor for quantized computations. In case of per-channel quantization,
-     *                               the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const DWCWeightsKernelInfo &dwc_weights_info,
-                           const DWCKernelInfo &dwc_info, const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1, const Size2D &dilation = Size2D(1U, 1U),
-                           const ITensorInfo *output_multipliers = nullptr, const ITensorInfo *output_shifts = nullptr);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_input;
-    const ICLTensor *_weights;
-    const ICLTensor *_biases;
-    ICLTensor       *_output;
-    unsigned int     _depth_multiplier;
-    const ICLTensor *_output_multipliers;
-    const ICLTensor *_output_shifts;
-    bool             _is_quantized;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLDEPTHWISECONVOLUTIONLAYERNATIVEKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayerReshapeWeightsKernel.h b/arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayerReshapeWeightsKernel.h
deleted file mode 100644
index 8dc5d32e4f..0000000000
--- a/arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayerReshapeWeightsKernel.h
+++ /dev/null
@@ -1,85 +0,0 @@
-/*
- * Copyright (c) 2019-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLDEPTHWISECONVOLUTIONLAYERRESHAPEWEIGHTSKERNEL_H
-#define ARM_COMPUTE_CLDEPTHWISECONVOLUTIONLAYERRESHAPEWEIGHTSKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the kernel to reshape the weights of depthwise convolution. */
-class CLDepthwiseConvolutionLayerReshapeWeightsKernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLDepthwiseConvolutionLayerReshapeWeightsKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLDepthwiseConvolutionLayerReshapeWeightsKernel(const CLDepthwiseConvolutionLayerReshapeWeightsKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLDepthwiseConvolutionLayerReshapeWeightsKernel &operator=(const CLDepthwiseConvolutionLayerReshapeWeightsKernel &) = delete;
-    /** Default Move Constructor. */
-    CLDepthwiseConvolutionLayerReshapeWeightsKernel(CLDepthwiseConvolutionLayerReshapeWeightsKernel &&) = default;
-    /** Default move assignment operator */
-    CLDepthwiseConvolutionLayerReshapeWeightsKernel &operator=(CLDepthwiseConvolutionLayerReshapeWeightsKernel &&) = default;
-
-    /** Initialize the function's source and destination.
-     *
-     * @param[in]  input  The input tensor of dimension [IFM, W, H]. Data types supported: All. Data layouts supported: NHWC
-     * @param[out] output The output tensor of dimension [W*H*C0, ceil(IFM/C0)]. C0 is the number of channels read by each thread. Data types supported: same as @p weights.
-     * @param[in]  info   Depthwise convolution information to reshape the input tensor.
-     */
-    void configure(const ICLTensor *input, ICLTensor *output, const DepthwiseConvolutionReshapeInfo &info);
-    /** Initialize the function's source and destination.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           The input tensor of dimension [IFM, W, H]. Data types supported: All. Data layouts supported: NHWC
-     * @param[out] output          The output tensor of dimension [W*H*C0, ceil(IFM/C0)]. C0 is the number of channels read by each thread. Data types supported: same as @p weights.
-     * @param[in]  info            Depthwise convolution information to reshape the input tensor.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const DepthwiseConvolutionReshapeInfo &info);
-
-    /** Static function to check if given info will lead to a valid configuration of @ref CLDepthwiseConvolutionLayer3x3NHWCKernel
-     *
-     * @param[in] input  The input tensor info of dimension [IFM, W, H]. Data types supported: All. Data layouts supported: NHWC
-     * @param[in] output The output tensor info of dimension [W*H*C0, ceil(IFM/C0)]. C0 is the number of channels read by each thread. Data types supported: same as @p weights.
-     * @param[in] info   Depthwise convolution information to reshape the input tensor.
-     *
-     * @return a Status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const DepthwiseConvolutionReshapeInfo &info);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_input;
-    ICLTensor       *_output;
-
-    void configure_dot_product(const DepthwiseConvolutionReshapeInfo &info);
-    void configure_generic(const DepthwiseConvolutionReshapeInfo &info);
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLDEPTHWISECONVOLUTIONLAYERRESHAPEWEIGHTSKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLDequantizationLayerKernel.h b/arm_compute/core/CL/kernels/CLDequantizationLayerKernel.h
deleted file mode 100644
index bb154f1a5b..0000000000
--- a/arm_compute/core/CL/kernels/CLDequantizationLayerKernel.h
+++ /dev/null
@@ -1,79 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLDEQUANTIZATIONLAYERKERNEL_H
-#define ARM_COMPUTE_CLDEQUANTIZATIONLAYERKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the dequantization layer kernel. */
-class CLDequantizationLayerKernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLDequantizationLayerKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLDequantizationLayerKernel(const CLDequantizationLayerKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLDequantizationLayerKernel &operator=(const CLDequantizationLayerKernel &) = delete;
-    /** Default Move Constructor. */
-    CLDequantizationLayerKernel(CLDequantizationLayerKernel &&) = default;
-    /** Default move assignment operator */
-    CLDequantizationLayerKernel &operator=(CLDequantizationLayerKernel &&) = default;
-    /** Default destructor */
-    ~CLDequantizationLayerKernel() = default;
-    /** Set the input, output, min and max.
-     *
-     * @param[in]  input  Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/QSYMM8/QSYMM16.
-     * @param[out] output Destination tensor. Data types supported: F16/F32.
-     */
-    void configure(const ICLTensor *input, ICLTensor *output);
-    /** Set the input, output, min and max.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/QSYMM8/QSYMM16.
-     * @param[out] output          Destination tensor. Data types supported: F16/F32.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLDequantizationLayerKernel
-     *
-     * @param[in] input  Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/QSYMM8/QSYMM16.
-     * @param[in] output Output tensor info. Data types supported: F16/F32.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_input;
-    ICLTensor       *_output;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLDEQUANTIZATIONLAYERKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLDerivativeKernel.h b/arm_compute/core/CL/kernels/CLDerivativeKernel.h
deleted file mode 100644
index cd8ae90c2d..0000000000
--- a/arm_compute/core/CL/kernels/CLDerivativeKernel.h
+++ /dev/null
@@ -1,83 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLDERIVATIVEKERNEL_H
-#define ARM_COMPUTE_CLDERIVATIVEKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the derivative kernel. */
-class CLDerivativeKernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLDerivativeKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLDerivativeKernel(const CLDerivativeKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLDerivativeKernel &operator=(const CLDerivativeKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLDerivativeKernel(CLDerivativeKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLDerivativeKernel &operator=(CLDerivativeKernel &&) = default;
-    /** Default destructor */
-    ~CLDerivativeKernel() = default;
-    /** Initialise the kernel's sources, destination and border
-     *
-     * @note At least one of output_x or output_y must be set
-     *
-     * @param[in]  input            Source tensor. Data types supported: U8.
-     * @param[out] output_x         (Optional) Destination tensor for the X gradient, Data types supported: S16.
-     * @param[out] output_y         (Optional) Destination tensor for the Y gradient, Data types supported: S16.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined);
-    /** Initialise the kernel's sources, destination and border
-     *
-     * @note At least one of output_x or output_y must be set
-     *
-     * @param[in]  compile_context  The compile context to be used.
-     * @param[in]  input            Source tensor. Data types supported: U8.
-     * @param[out] output_x         (Optional) Destination tensor for the X gradient, Data types supported: S16.
-     * @param[out] output_y         (Optional) Destination tensor for the Y gradient, Data types supported: S16.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-    BorderSize border_size() const override;
-
-private:
-    const ICLTensor *_input;            /**< Input tensor */
-    ICLTensor       *_output_x;         /**< Output tensor - Derivate along the X direction */
-    ICLTensor       *_output_y;         /**< Output tensor - Derivate along the Y direction */
-    bool             _run_derivative_x; /**< Do we need to run Derivative X ? */
-    bool             _run_derivative_y; /**< Do we need to run Derivative Y ? */
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLDERIVATIVEKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLDilateKernel.h b/arm_compute/core/CL/kernels/CLDilateKernel.h
deleted file mode 100644
index 45f5fe0764..0000000000
--- a/arm_compute/core/CL/kernels/CLDilateKernel.h
+++ /dev/null
@@ -1,59 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLDILATEKERNEL_H
-#define ARM_COMPUTE_CLDILATEKERNEL_H
-
-#include "arm_compute/core/CL/ICLSimple2DKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the dilate kernel.
- *
- */
-class CLDilateKernel : public ICLSimple2DKernel
-{
-public:
-    /**Initialise the kernel's input and output.
-     *
-     * @param[in]  input            An input tensor. Data types supported: U8
-     * @param[out] output           The output tensor. Data types supported: U8.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const ICLTensor *input, ICLTensor *output, bool border_undefined);
-    /**Initialise the kernel's input and output.
-     *
-     * @param[in]  compile_context  The compile context to be used.
-     * @param[in]  input            An input tensor. Data types supported: U8
-     * @param[out] output           The output tensor. Data types supported: U8.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, bool border_undefined);
-
-    // Inherited methods overridden:
-    BorderSize border_size() const override;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLDILATEKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLDirectConvolutionLayerKernel.h b/arm_compute/core/CL/kernels/CLDirectConvolutionLayerKernel.h
deleted file mode 100644
index 489d7c27c5..0000000000
--- a/arm_compute/core/CL/kernels/CLDirectConvolutionLayerKernel.h
+++ /dev/null
@@ -1,125 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLDIRECTCONVOLUTIONLAYERKERNEL_H
-#define ARM_COMPUTE_CLDIRECTCONVOLUTIONLAYERKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the  direct convolution kernel.
- */
-class CLDirectConvolutionLayerKernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLDirectConvolutionLayerKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLDirectConvolutionLayerKernel(const CLDirectConvolutionLayerKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLDirectConvolutionLayerKernel &operator=(const CLDirectConvolutionLayerKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLDirectConvolutionLayerKernel(CLDirectConvolutionLayerKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLDirectConvolutionLayerKernel &operator=(CLDirectConvolutionLayerKernel &&) = default;
-    /** Default destructor */
-    ~CLDirectConvolutionLayerKernel() = default;
-    /** Set the input, weights, biases and output tensors.
-     *
-     * @note: DirectConvolution only works in the following configurations:
-     *        1x1 convolution with stride_x = 1/2/3, stride_y = 1/2/3
-     *        3x3 convolution with stride_x = 1/2, stride_y = 1/2
-     *        5x5 convolution with stride_x = 1/2, stride_y = 1/2
-     *        9x9 convolution with stride_x = 1/2, stride_y = 1/2, data_layout=NHWC
-     *
-     * @param[in]  input     The input tensor to convolve. 3 lower dimensions represent a single input [width, height, IFM],
-     *                       while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32.
-     * @param[in]  weights   Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
-     *                       The 3rd dimension must be the same as the input's volume 3rd dimension.
-     *                       Data type supported:Same as @p input.
-     * @param[in]  biases    Biases tensor. Biases are 1D tensor with dimension [OFM].
-     *                       Data type supported: Should match @p input data type, except for input of QASYMM8 and QASYMM8_SIGNED type where biases should be of S32 type
-     * @param[out] output    Output tensor.
-     *                       The 3rd dimensions must be equal to the 4th dimension of the @p kernels tensor. Data types supported: Same as @p input.
-     * @param[in]  conv_info Contains padding and stride information described in @ref PadStrideInfo.
-     */
-    void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info);
-    /** Set the input, weights, biases and output tensors.
-     *
-     * @note: DirectConvolution only works in the following configurations:
-     *        1x1 convolution with stride_x = 1/2/3, stride_y = 1/2/3
-     *        3x3 convolution with stride_x = 1/2, stride_y = 1/2
-     *        5x5 convolution with stride_x = 1/2, stride_y = 1/2
-     *        9x9 convolution with stride_x = 1/2, stride_y = 1/2, data_layout=NHWC
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           The input tensor to convolve. 3 lower dimensions represent a single input [width, height, IFM],
-     *                             while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32.
-     * @param[in]  weights         Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
-     *                             The 3rd dimension must be the same as the input's volume 3rd dimension.
-     *                             Data type supported:Same as @p input.
-     * @param[in]  biases          Biases tensor. Biases are 1D tensor with dimension [OFM].
-     *                             Data type supported: Should match @p input data type, except for input of QASYMM8 and QASYMM8_SIGNED type where biases should be of S32 type
-     * @param[out] output          Output tensor.
-     *                             The 3rd dimensions must be equal to the 4th dimension of the @p kernels tensor. Data types supported: Same as @p input.
-     * @param[in]  conv_info       Contains padding and stride information described in @ref PadStrideInfo.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLDirectConvolutionLayerKernel
-     *
-     * @param[in] input     The input tensor to convolve. 3 lower dimensions represent a single input [width, height, IFM],
-     *                      while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32.
-     * @param[in] weights   Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
-     *                      The 3rd dimension must be the same as the input's volume 3rd dimension.
-     *                      Data type supported:Same as @p input.
-     * @param[in] biases    Biases tensor. Biases are 1D tensor with dimension [OFM].
-     *                      Data type supported: Should match @p input data type, except for input of QASYMM8 and QASYMM8_SIGNED type where biases should be of S32 type.
-     * @param[in] output    Output tensor.
-     *                      The 3rd dimensions must be equal to the 4th dimension of the @p kernels tensor. Data types supported: Same as @p input.
-     * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
-     * @param[in] target    Target GPU architecture.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, const GPUTarget target);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-    BorderSize border_size() const override;
-
-public:
-    const ICLTensor *_input;
-    const ICLTensor *_biases;
-    const ICLTensor *_weights;
-    ICLTensor       *_output;
-    DataLayout       _data_layout;
-    BorderSize       _border_size;
-    int              _conv_stride_x;
-    int              _conv_stride_y;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLDIRECTCONVOLUTIONLAYERKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLElementWiseUnaryLayerKernel.h b/arm_compute/core/CL/kernels/CLElementWiseUnaryLayerKernel.h
deleted file mode 100644
index e190bdebbe..0000000000
--- a/arm_compute/core/CL/kernels/CLElementWiseUnaryLayerKernel.h
+++ /dev/null
@@ -1,68 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLELEMENTWISEUNARYLAYERKERNEL_H
-#define ARM_COMPUTE_CLELEMENTWISEUNARYLAYERKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/core/CL/ICLSimpleKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the elementwise unary operator */
-class CLElementWiseUnaryLayerKernel : public ICLSimpleKernel
-{
-public:
-    /** Initialise the kernel's inputs, output.
-     *
-     * @param[in]  input  First tensor input. Data types supported: F16/F32.
-     * @param[out] output Output tensor. Data types supported: Same as @p input.
-     * @param[in]  op     Element wise unary operation to perform.
-     */
-    void configure(const ICLTensor *input, ICLTensor *output, const ElementWiseUnary &op);
-    /** Initialise the kernel's inputs, output.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           First tensor input. Data types supported: F16/F32.
-     * @param[out] output          Output tensor. Data types supported: Same as @p input.
-     * @param[in]  op              Element wise unary operation to perform.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ElementWiseUnary &op);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLElementWiseUnaryLayerKernel
-     *
-     * @param[in] input  First tensor input info. Data types supported: F16/F32.
-     * @param[in] output Output tensor info. Data types supported: Same as @p input.
-     * @param[in] op     Element wise unary operation to perform.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ElementWiseUnary &op);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLELEMENTWISEUNARYLAYERKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLElementwiseOperationKernel.h b/arm_compute/core/CL/kernels/CLElementwiseOperationKernel.h
deleted file mode 100644
index 4d3d4bc834..0000000000
--- a/arm_compute/core/CL/kernels/CLElementwiseOperationKernel.h
+++ /dev/null
@@ -1,224 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLELEMENTWISEOPERATIONKERNEL_H
-#define ARM_COMPUTE_CLELEMENTWISEOPERATIONKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for an element-wise operation kernel
- *
- * Element-wise operation is computed by:
- * @f[ output(x,y) = OP(input1(x,y), input2(x,y))@f]
- *
- */
-class CLElementwiseOperationKernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLElementwiseOperationKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLElementwiseOperationKernel(const CLElementwiseOperationKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLElementwiseOperationKernel &operator=(const CLElementwiseOperationKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLElementwiseOperationKernel(CLElementwiseOperationKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLElementwiseOperationKernel &operator=(CLElementwiseOperationKernel &&) = default;
-    /** Default destructor */
-    ~CLElementwiseOperationKernel() = default;
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-    BorderSize border_size() const override;
-
-protected:
-    /** The name of the operation */
-    virtual std::string name() = 0;
-
-    /** Initialise the kernel's output.
-     *
-     * @param[in] input1 First tensor input. Data types supported: U8/S8/QASYMM8/QASYMM8_SIGNED/U16/S16/F16/U32/S32/F32.
-     * @param[in] input2 Second tensor input. Data types supported: Same as @p input1.
-     * @param[in] output Output tensor. Data types supported: Same as @p input1.
-     *
-     * @return a pair of Status and Window
-     */
-    virtual std::pair<Status, Window> validate_and_configure_window(ITensorInfo &input1, ITensorInfo &input2, ITensorInfo &output) = 0;
-
-    /** Validate the argument passed to the kernel
-     *
-     * @param[in] input1 First tensor input. Data types supported: U8/S8/QASYMM8/QASYMM8_SIGNED/U16/S16/F16/U32/S32/F32.
-     * @param[in] input2 Second tensor input. Data types supported: Same as @p input1.
-     * @param[in] output Output tensor. Data types supported: Same as @p input1.
-     */
-    virtual Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output) = 0;
-
-    /** Generate the build options for the specific kernel
-     *
-     * @reutrn a CLBuildOptions struct
-     */
-    virtual CLBuildOptions generate_build_options(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output) = 0;
-
-    /** Generate the identifier for tuning
-     *
-     * @reutrn a string
-     */
-    virtual std::string generate_id_for_tuning(const std::string &kernel_name, const ITensorInfo &input1, const ITensorInfo &output) = 0;
-
-    /** Commmon configure function for element-wise operators with no additional options (e.g., Div, Min, Max, SquaredDiff)
-     *
-     */
-    void configure_common(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output);
-    /** Commmon configure function for element-wise operators with no additional options (e.g., Div, Min, Max, SquaredDiff)
-     *
-     */
-    void configure_common(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output);
-
-    ActivationLayerInfo _act_info;
-
-private:
-    const ICLTensor *_input1; /**< Source tensor 1 */
-    const ICLTensor *_input2; /**< Source tensor 2 */
-    ICLTensor       *_output; /**< Destination tensor */
-};
-
-/** Addition operation */
-class CLSaturatedArithmeticOperationKernel : public CLElementwiseOperationKernel
-{
-public:
-    CLSaturatedArithmeticOperationKernel()
-        : CLElementwiseOperationKernel(), _policy(), _op()
-    {
-    }
-
-    /** Static function to check if given info will lead to a valid configuration of @ref CLSaturatedArithmeticOperationKernel
-     *
-     * @param[in] op       Arithmetic operation to be executed.
-     * @param[in] input1   First tensor input. Data types supported: U8/S8/QASYMM8/QASYMM8_SIGNED/U16/S16/QSYMM16/F16/U32/S32/F32.
-     * @param[in] input2   Second tensor input. Data types supported: Same as @p input1.
-     * @param[in] output   Output tensor. Data types supported: Same as @p input1.
-     * @param[in] policy   Policy to use to handle overflow.
-     * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
-     */
-    void configure(ArithmeticOperation op, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const ConvertPolicy &policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
-    /** Static function to check if given info will lead to a valid configuration of @ref CLSaturatedArithmeticOperationKernel
-     *
-     * @param[in] compile_context The compile context to be used.
-     * @param[in] op              Arithmetic operation to be executed.
-     * @param[in] input1          First tensor input. Data types supported: U8/S8/QASYMM8/QASYMM8_SIGNED/U16/S16/QSYMM16/F16/U32/S32/F32.
-     * @param[in] input2          Second tensor input. Data types supported: Same as @p input1.
-     * @param[in] output          Output tensor. Data types supported: Same as @p input1.
-     * @param[in] policy          Policy to use to handle overflow.
-     * @param[in] act_info        (Optional) Activation layer information in case of a fused activation.
-     */
-    void configure(const CLCompileContext &compile_context, ArithmeticOperation op, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const ConvertPolicy &policy,
-                   const ActivationLayerInfo &act_info = ActivationLayerInfo());
-
-    /** Static function to check if given info will lead to a valid configuration of @ref CLSaturatedArithmeticOperationKernel
-     *
-     * @param[in] op       Arithmetic operation to be executed.
-     * @param[in] input1   First tensor input info. Data types supported: U8/S8/QASYMM8/QASYMM8_SIGNED/U16/S16/QSYMM16/F16/U32/S32/F32.
-     * @param[in] input2   Second tensor input info. Data types supported: Same as @p input1.
-     * @param[in] output   Output tensor info. Data types supported: Same as @p input1.
-     * @param[in] policy   Policy to use to handle overflow.
-     * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
-     *
-     * @return a Status
-     */
-    static Status validate(ArithmeticOperation op, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ConvertPolicy &policy,
-                           const ActivationLayerInfo &act_info = ActivationLayerInfo());
-
-protected:
-    // Inherited methods overridden:
-    std::string name() override;
-    std::pair<Status, Window> validate_and_configure_window(ITensorInfo &input1, ITensorInfo &input2, ITensorInfo &output) override;
-    Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output) override;
-    CLBuildOptions generate_build_options(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output) override;
-    std::string generate_id_for_tuning(const std::string &kernel_name, const ITensorInfo &input1, const ITensorInfo &output) override;
-
-private:
-    ConvertPolicy       _policy;
-    ArithmeticOperation _op;
-};
-
-class CLArithmeticOperationKernel : public CLElementwiseOperationKernel
-{
-public:
-    CLArithmeticOperationKernel()
-        : CLElementwiseOperationKernel(), _op()
-    {
-    }
-
-    /** Static function to check if given info will lead to a valid configuration of @ref CLArithmeticOperationKernel
-     *
-     * @param[in] op       Arithmetic operation to be executed.
-     * @param[in] input1   First tensor input. Data types supported: U8/S8/QASYMM8/QASYMM8_SIGNED/U16/S16/QSYMM16/F16/U32/S32/F32.
-     * @param[in] input2   Second tensor input. Data types supported: Same as @p input1.
-     * @param[in] output   Output tensor. Data types supported: Same as @p input1.
-     * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
-     */
-    void configure(ArithmeticOperation op, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
-    /** Static function to check if given info will lead to a valid configuration of @ref CLArithmeticOperationKernel
-     *
-     * @param[in] compile_context The compile context to be used.
-     * @param[in] op              Arithmetic operation to be executed.
-     * @param[in] input1          First tensor input. Data types supported: U8/S8/QASYMM8/QASYMM8_SIGNED/U16/S16/QSYMM16/F16/U32/S32/F32.
-     * @param[in] input2          Second tensor input. Data types supported: Same as @p input1.
-     * @param[in] output          Output tensor. Data types supported: Same as @p input1.
-     * @param[in] act_info        (Optional) Activation layer information in case of a fused activation.
-     */
-    void configure(const CLCompileContext &compile_context, ArithmeticOperation op, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output,
-                   const ActivationLayerInfo &act_info = ActivationLayerInfo());
-
-    /** Static function to check if given info will lead to a valid configuration of @ref CLArithmeticOperationKernel
-     *
-     * @param[in] op       Arithmetic operation to be executed.
-     * @param[in] input1   First tensor input info. Data types supported: U8/S8/QASYMM8/QASYMM8_SIGNED/U16/S16/QSYMM16/F16/U32/S32/F32.
-     * @param[in] input2   Second tensor input info. Data types supported: Same as @p input1.
-     * @param[in] output   Output tensor info. Data types supported: Same as @p input1.
-     * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
-     *
-     * @return a Status
-     */
-    static Status validate(ArithmeticOperation op, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
-
-protected:
-    // Inherited methods overridden:
-    std::string name() override;
-    std::pair<Status, Window> validate_and_configure_window(ITensorInfo &input1, ITensorInfo &input2, ITensorInfo &output) override;
-    Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output) override;
-    CLBuildOptions generate_build_options(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output) override;
-    std::string generate_id_for_tuning(const std::string &kernel_name, const ITensorInfo &input1, const ITensorInfo &output) override;
-
-private:
-    ArithmeticOperation _op;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLELEMENTWISEOPERATIONKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLErodeKernel.h b/arm_compute/core/CL/kernels/CLErodeKernel.h
deleted file mode 100644
index cbc748194c..0000000000
--- a/arm_compute/core/CL/kernels/CLErodeKernel.h
+++ /dev/null
@@ -1,59 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLERODEKERNEL_H
-#define ARM_COMPUTE_CLERODEKERNEL_H
-
-#include "arm_compute/core/CL/ICLSimple2DKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the erode kernel.
- *
- */
-class CLErodeKernel : public ICLSimple2DKernel
-{
-public:
-    /**Initialise the kernel's input and output.
-     *
-     * @param[in]  input            An input tensor. Data types supported: U8
-     * @param[out] output           The output tensor. Data types supported: U8.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const ICLTensor *input, ICLTensor *output, bool border_undefined);
-    /**Initialise the kernel's input and output.
-     *
-     * @param[in]  compile_context  The compile context to be used.
-     * @param[in]  input            An input tensor. Data types supported: U8
-     * @param[out] output           The output tensor. Data types supported: U8.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, bool border_undefined);
-
-    // Inherited methods overridden:
-    BorderSize border_size() const override;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLERODEKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLFFTDigitReverseKernel.h b/arm_compute/core/CL/kernels/CLFFTDigitReverseKernel.h
deleted file mode 100644
index a8da1246bb..0000000000
--- a/arm_compute/core/CL/kernels/CLFFTDigitReverseKernel.h
+++ /dev/null
@@ -1,89 +0,0 @@
-/*
- * Copyright (c) 2019-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLFFTDIGITREVERSEKERNEL_H
-#define ARM_COMPUTE_CLFFTDIGITREVERSEKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-#include "arm_compute/core/KernelDescriptors.h"
-
-namespace arm_compute
-{
-// Forward declarations
-class ICLTensor;
-
-/** Interface for the digit reverse operation kernel. */
-class CLFFTDigitReverseKernel : public ICLKernel
-{
-public:
-    /** Constructor */
-    CLFFTDigitReverseKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLFFTDigitReverseKernel(const CLFFTDigitReverseKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLFFTDigitReverseKernel &operator=(const CLFFTDigitReverseKernel &) = delete;
-    /** Default Move Constructor. */
-    CLFFTDigitReverseKernel(CLFFTDigitReverseKernel &&) = default;
-    /** Default move assignment operator */
-    CLFFTDigitReverseKernel &operator=(CLFFTDigitReverseKernel &&) = default;
-    /** Default destructor */
-    ~CLFFTDigitReverseKernel() = default;
-    /** Set the input and output tensors.
-     *
-     * @param[in]  input  Source tensor. Data types supported: F32.
-     * @param[out] output Destination tensor. Data type supported: same as @p input
-     * @param[in]  idx    Digit reverse index tensor. Data type supported: U32
-     * @param[in]  config Kernel configuration.
-     */
-    void configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *idx, const FFTDigitReverseKernelInfo &config);
-    /** Set the input and output tensors.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Source tensor. Data types supported: F32.
-     * @param[out] output          Destination tensor. Data type supported: same as @p input
-     * @param[in]  idx             Digit reverse index tensor. Data type supported: U32
-     * @param[in]  config          Kernel configuration.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *idx, const FFTDigitReverseKernelInfo &config);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLFFTDigitReverseKernel
-     *
-     * @param[in] input  Source tensor info. Data types supported: F32.
-     * @param[in] output Destination tensor info. Data type supported: same as @p input
-     * @param[in] idx    Digit reverse index tensor info. Data type supported: U32
-     * @param[in] config Kernel configuration.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *idx, const FFTDigitReverseKernelInfo &config);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_input;
-    ICLTensor       *_output;
-    const ICLTensor *_idx;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLFFTDIGITREVERSEKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLFFTRadixStageKernel.h b/arm_compute/core/CL/kernels/CLFFTRadixStageKernel.h
deleted file mode 100644
index e3f53462d9..0000000000
--- a/arm_compute/core/CL/kernels/CLFFTRadixStageKernel.h
+++ /dev/null
@@ -1,97 +0,0 @@
-/*
- * Copyright (c) 2019-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLFFTRADIXSTAGEKERNEL_H
-#define ARM_COMPUTE_CLFFTRADIXSTAGEKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-#include "arm_compute/core/KernelDescriptors.h"
-
-#include <set>
-
-namespace arm_compute
-{
-// Forward declarations
-class ICLTensor;
-
-/** Interface for the FFT radix stage kernel. */
-class CLFFTRadixStageKernel : public ICLKernel
-{
-public:
-    /** Constructor */
-    CLFFTRadixStageKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLFFTRadixStageKernel(const CLFFTRadixStageKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLFFTRadixStageKernel &operator=(const CLFFTRadixStageKernel &) = delete;
-    /** Default Move Constructor. */
-    CLFFTRadixStageKernel(CLFFTRadixStageKernel &&) = default;
-    /** Default move assignment operator */
-    CLFFTRadixStageKernel &operator=(CLFFTRadixStageKernel &&) = default;
-    /** Default destructor */
-    ~CLFFTRadixStageKernel() = default;
-    /** Set the input and output tensors.
-     *
-     * @note If the output tensor is nullptr, the FFT will be performed in-place
-     *
-     * @param[in,out] input  Source tensor. Data types supported: F32.
-     * @param[out]    output Destination tensor. Can be nullptr. Data type supported: same as @p input
-     * @param[in]     config FFT descriptor metadata.
-     */
-    void configure(ICLTensor *input, ICLTensor *output, const FFTRadixStageKernelInfo &config);
-    /** Set the input and output tensors.
-     *
-     * @note If the output tensor is nullptr, the FFT will be performed in-place
-     *
-     * @param[in]     compile_context The compile context to be used.
-     * @param[in,out] input           Source tensor. Data types supported: F32.
-     * @param[out]    output          Destination tensor. Can be nullptr. Data type supported: same as @p input
-     * @param[in]     config          FFT descriptor metadata.
-     */
-    void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const FFTRadixStageKernelInfo &config);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLFFTRadixStageKernel
-     *
-     * @param[in] input  Source tensor info. Data types supported: F32.
-     * @param[in] output Destination tensor info. Can be nullptr. Data type supported: same as @p input
-     * @param[in] config FFT descriptor metadata.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const FFTRadixStageKernelInfo &config);
-    /** Returns the radix that are support by the FFT kernel
-     *
-     * @return A set of supported radix
-     */
-    static std::set<unsigned int> supported_radix();
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    ICLTensor *_input;
-    ICLTensor *_output;
-    bool       _run_in_place;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLFFTRADIXSTAGEKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLFFTScaleKernel.h b/arm_compute/core/CL/kernels/CLFFTScaleKernel.h
deleted file mode 100644
index d0d2b7613c..0000000000
--- a/arm_compute/core/CL/kernels/CLFFTScaleKernel.h
+++ /dev/null
@@ -1,86 +0,0 @@
-/*
- * Copyright (c) 2019-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLFFTSCALEKERNEL_H
-#define ARM_COMPUTE_CLFFTSCALEKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-#include "arm_compute/core/KernelDescriptors.h"
-
-namespace arm_compute
-{
-// Forward declarations
-class ICLTensor;
-
-/** Interface for the inverse fft scale kernel. */
-class CLFFTScaleKernel : public ICLKernel
-{
-public:
-    /** Constructor */
-    CLFFTScaleKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLFFTScaleKernel(const CLFFTScaleKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLFFTScaleKernel &operator=(const CLFFTScaleKernel &) = delete;
-    /** Default Move Constructor. */
-    CLFFTScaleKernel(CLFFTScaleKernel &&) = default;
-    /** Default move assignment operator */
-    CLFFTScaleKernel &operator=(CLFFTScaleKernel &&) = default;
-    /** Default destructor */
-    ~CLFFTScaleKernel() = default;
-    /** Set the input and output tensors.
-     *
-     * @param[in,out] input  Source tensor. Data types supported: F32.
-     * @param[out]    output Destination tensor. Data type supported: same as @p input
-     * @param[in]     config Kernel configuration
-     */
-    void configure(ICLTensor *input, ICLTensor *output, const FFTScaleKernelInfo &config);
-    /** Set the input and output tensors.
-     *
-     * @param[in]     compile_context The compile context to be used.
-     * @param[in,out] input           Source tensor. Data types supported: F32.
-     * @param[out]    output          Destination tensor. Data type supported: same as @p input
-     * @param[in]     config          Kernel configuration
-     */
-    void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const FFTScaleKernelInfo &config);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLFFTScaleKernel
-     *
-     * @param[in] input  Source tensor info. Data types supported: F32.
-     * @param[in] output Destination tensor info. Data type supported: same as @p input
-     * @param[in] config Kernel configuration
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const FFTScaleKernelInfo &config);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    ICLTensor *_input;
-    ICLTensor *_output;
-    bool       _run_in_place;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLFFTSCALEKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLFastCornersKernel.h b/arm_compute/core/CL/kernels/CLFastCornersKernel.h
deleted file mode 100644
index 1a0d4e36a5..0000000000
--- a/arm_compute/core/CL/kernels/CLFastCornersKernel.h
+++ /dev/null
@@ -1,133 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLFASTCORNERSKERNEL_H
-#define ARM_COMPUTE_CLFASTCORNERSKERNEL_H
-
-#include "arm_compute/core/CL/ICLArray.h"
-#include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/core/Types.h"
-
-#include <cstdint>
-
-namespace cl
-{
-class Buffer;
-}
-
-namespace arm_compute
-{
-class ICLTensor;
-using ICLImage = ICLTensor;
-
-/** CL kernel to perform fast corners */
-class CLFastCornersKernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLFastCornersKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLFastCornersKernel(const CLFastCornersKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLFastCornersKernel &operator=(const CLFastCornersKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLFastCornersKernel(CLFastCornersKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLFastCornersKernel &operator=(CLFastCornersKernel &&) = default;
-    /** Default destructor */
-    ~CLFastCornersKernel() = default;
-
-    /** Initialise the kernel.
-     *
-     * @param[in]  input               Source image. Data types supported: U8.
-     * @param[out] output              Output image. Data types supported: U8.
-     * @param[in]  threshold           Threshold on difference between intensity of the central pixel and pixels on Bresenham's circle of radius 3.
-     * @param[in]  non_max_suppression True if non-maxima suppresion is applied, false otherwise.
-     * @param[in]  border_mode         Strategy to use for borders.
-     */
-    void configure(const ICLImage *input, ICLImage *output, float threshold, bool non_max_suppression, BorderMode border_mode);
-    /** Initialise the kernel.
-     *
-     * @param[in]  compile_context     The compile context to be used.
-     * @param[in]  input               Source image. Data types supported: U8.
-     * @param[out] output              Output image. Data types supported: U8.
-     * @param[in]  threshold           Threshold on difference between intensity of the central pixel and pixels on Bresenham's circle of radius 3.
-     * @param[in]  non_max_suppression True if non-maxima suppresion is applied, false otherwise.
-     * @param[in]  border_mode         Strategy to use for borders.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLImage *input, ICLImage *output, float threshold, bool non_max_suppression, BorderMode border_mode);
-
-    // Inherited methods overridden
-    void run(const Window &window, cl::CommandQueue &queue) override;
-    BorderSize border_size() const override;
-
-private:
-    const ICLImage *_input;
-    ICLImage       *_output;
-};
-
-/** CL kernel to copy keypoints information to ICLKeyPointArray and counts the number of key points */
-class CLCopyToArrayKernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLCopyToArrayKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLCopyToArrayKernel(const CLCopyToArrayKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLCopyToArrayKernel &operator=(const CLCopyToArrayKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLCopyToArrayKernel(CLCopyToArrayKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLCopyToArrayKernel &operator=(CLCopyToArrayKernel &&) = default;
-    /** Default destructor */
-    ~CLCopyToArrayKernel() = default;
-
-    /** Initialise the kernel.
-     *
-     * @param[in]  input         Source image. Data types supported: U8.
-     * @param[in]  update_number Flag to indicate whether we need to update the number of corners
-     * @param[out] corners       Array of keypoints to store the results.
-     * @param[out] num_buffers   Number of keypoints to store the results.
-     */
-    void configure(const ICLImage *input, bool update_number, ICLKeyPointArray *corners, cl::Buffer *num_buffers);
-    /** Initialise the kernel.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Source image. Data types supported: U8.
-     * @param[in]  update_number   Flag to indicate whether we need to update the number of corners
-     * @param[out] corners         Array of keypoints to store the results.
-     * @param[out] num_buffers     Number of keypoints to store the results.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLImage *input, bool update_number, ICLKeyPointArray *corners, cl::Buffer *num_buffers);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLImage   *_input;      /**< source image */
-    ICLKeyPointArray *_corners;    /**< destination array */
-    cl::Buffer       *_num_buffer; /**< CL memory to record number of key points in the array */
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLFASTCORNERSKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLFillBorderKernel.h b/arm_compute/core/CL/kernels/CLFillBorderKernel.h
deleted file mode 100644
index d00ea55a83..0000000000
--- a/arm_compute/core/CL/kernels/CLFillBorderKernel.h
+++ /dev/null
@@ -1,86 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLFILLBORDERKERNEL_H
-#define ARM_COMPUTE_CLFILLBORDERKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/core/PixelValue.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for filling the border of a kernel */
-class CLFillBorderKernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLFillBorderKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLFillBorderKernel(const CLFillBorderKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLFillBorderKernel &operator=(const CLFillBorderKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLFillBorderKernel(CLFillBorderKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLFillBorderKernel &operator=(CLFillBorderKernel &&) = default;
-    /** Default destructor */
-    ~CLFillBorderKernel() = default;
-
-    /** Initialise the kernel's input, output and border mode.
-     *
-     * @param[in,out] tensor                Tensor to process Data types supported: U8/QASYMM8/S8/QASYMM8_SIGNED/U16/S16/U32/S32/F16/F32.
-     * @param[in]     border_size           Size of the border to fill in elements.
-     * @param[in]     border_mode           Border mode to use for the convolution.
-     * @param[in]     constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
-     */
-    void configure(ICLTensor *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value = PixelValue());
-    /** Initialise the kernel's input, output and border mode.
-     *
-     * @param[in]     compile_context       The compile context to be used.
-     * @param[in,out] tensor                Tensor to process Data types supported: U8/QASYMM8/S8/QASYMM8_SIGNED/U16/S16/U32/S32/F16/F32.
-     * @param[in]     border_size           Size of the border to fill in elements.
-     * @param[in]     border_mode           Border mode to use for the convolution.
-     * @param[in]     constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
-     */
-    void configure(const CLCompileContext &compile_context, ICLTensor *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value = PixelValue());
-
-    /** Function to set the constant value on fill border kernel depending on type.
-     *
-     * @param[in] idx                   Index of the kernel argument to set.
-     * @param[in] constant_border_value Constant value to use for borders if border_mode is set to CONSTANT.
-     */
-    template <class T>
-    void set_constant_border(unsigned int idx, const PixelValue &constant_border_value);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-    bool is_parallelisable() const override;
-
-private:
-    ICLTensor *_tensor;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLFILLBORDERKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLFlattenLayerKernel.h b/arm_compute/core/CL/kernels/CLFlattenLayerKernel.h
deleted file mode 100644
index ab009e1aa8..0000000000
--- a/arm_compute/core/CL/kernels/CLFlattenLayerKernel.h
+++ /dev/null
@@ -1,83 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLFLATTENLAYERKERNEL_H
-#define ARM_COMPUTE_CLFLATTENLAYERKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL interface for the flatten kernel.*/
-class CLFlattenLayerKernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLFlattenLayerKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLFlattenLayerKernel(const CLFlattenLayerKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLFlattenLayerKernel &operator=(const CLFlattenLayerKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLFlattenLayerKernel(CLFlattenLayerKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLFlattenLayerKernel &operator=(CLFlattenLayerKernel &&) = default;
-    /** Set the input and output of the kernel.
-     *
-     * @param[in]  input  First input tensor to flatten with at least 3 dimensions.
-     *                    The dimensions above the third will be interpreted as batches. Data types supported: All.
-     * @param[out] output Output tensor with shape [w*h*d, input_batches] where:
-     *                    w = width input tensor, h = height input tensor and d = depth input tensor. Data type supported: same as @p input
-     */
-    void configure(const ICLTensor *input, ICLTensor *output);
-    /** Set the input and output of the kernel.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           First input tensor to flatten with at least 3 dimensions.
-     *                             The dimensions above the third will be interpreted as batches. Data types supported: All.
-     * @param[out] output          Output tensor with shape [w*h*d, input_batches] where:
-     *                    w = width input tensor, h = height input tensor and d = depth input tensor. Data type supported: same as @p input
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLFlattenLayerKernel
-     *
-     * @param[in]  input  First input tensor to flatten with at least 3 dimensions.
-     *                    The dimensions above the third will be interpreted as batches. Data types supported: All.
-     * @param[out] output Output tensor with shape [w*h*d, input_batches] where:
-     *                    w = width input tensor, h = height input tensor and d = depth input tensor. Data type supported: same as @p input
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-public:
-    const ICLTensor *_input;
-    ICLTensor       *_output;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLFLATTENLAYERKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLFloorKernel.h b/arm_compute/core/CL/kernels/CLFloorKernel.h
deleted file mode 100644
index 4d1ed789db..0000000000
--- a/arm_compute/core/CL/kernels/CLFloorKernel.h
+++ /dev/null
@@ -1,81 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLFLOORKERNEL_H
-#define ARM_COMPUTE_CLFLOORKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel to perform a floor operation */
-class CLFloorKernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLFloorKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLFloorKernel(const CLFloorKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLFloorKernel &operator=(const CLFloorKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLFloorKernel(CLFloorKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLFloorKernel &operator=(CLFloorKernel &&) = default;
-    /** Default destructor */
-    ~CLFloorKernel() = default;
-    /** Set the source, destination of the kernel
-     *
-     * @param[in]  input  Source tensor. Data type supported: F16/F32.
-     * @param[out] output Destination tensor. Same as @p input
-     */
-    void configure(const ICLTensor *input, ICLTensor *output);
-
-    /** Set the source, destination of the kernel
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Source tensor. Data type supported: F16/F32.
-     * @param[out] output          Destination tensor. Same as @p input
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output);
-
-    /** Static function to check if given info will lead to a valid configuration of @ref CLFloorKernel
-     *
-     * @param[in] input  Source tensor info. Data type supported: F16/F32.
-     * @param[in] output Destination tensor info. Same as @p input
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_input;
-    ICLTensor       *_output;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLFLOORKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLFuseBatchNormalizationKernel.h b/arm_compute/core/CL/kernels/CLFuseBatchNormalizationKernel.h
deleted file mode 100644
index 2fe6b223ca..0000000000
--- a/arm_compute/core/CL/kernels/CLFuseBatchNormalizationKernel.h
+++ /dev/null
@@ -1,126 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLFUSEBATCHNORMALIZATIONKERNEL_H
-#define ARM_COMPUTE_CLFUSEBATCHNORMALIZATIONKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-// Forward declarations
-class ICLTensor;
-
-/** OpenCL kernel to fuse the batch normalization node to a preceding convolution node */
-class CLFuseBatchNormalizationKernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLFuseBatchNormalizationKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLFuseBatchNormalizationKernel(const CLFuseBatchNormalizationKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLFuseBatchNormalizationKernel &operator=(const CLFuseBatchNormalizationKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLFuseBatchNormalizationKernel(CLFuseBatchNormalizationKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLFuseBatchNormalizationKernel &operator=(CLFuseBatchNormalizationKernel &&) = default;
-    /** Default destructor */
-    ~CLFuseBatchNormalizationKernel() = default;
-    /** Set the source, destination of the kernel
-     *
-     * @param[in]  input_weights Input weights tensor for convolution or depthwise convolution layer. Data type supported: F16/F32. Data layout supported: NCHW, NHWC
-     * @param[in]  bn_mean       Batch normalization layer mean tensor. Same as @p input_weights
-     * @param[in]  bn_var        Batch normalization layer variance tensor. Same as @p input_weights
-     * @param[out] fused_weights Output fused weights tensor. It can be a nullptr in case of in-place computation. Same as @p input_weights
-     * @param[out] fused_bias    Output fused bias tensor. It can be a nullptr in case of in-place computation and input_bias != nullptr. Same as @p input_weights
-     * @param[in]  input_bias    (Optional) Input bias tensor for convolution or depthwise convolution layer. It can be a nullptr in case the bias tensor is not required. Same as @p input_weights
-     * @param[in]  bn_beta       (Optional) Batch normalization layer beta tensor. It can be a nullptr in case the beta tensor is not required. Same as @p input_weights
-     *                           @note if nullptr, bn_beta is set to 0.0
-     * @param[in]  bn_gamma      (Optional) Batch normalization layer gamma tensor. It can be a nullptr in case the gamma tensor is not required. Same as @p input_weights
-     *                           @note if nullptr, bn_gamma is set to 1.0
-     * @param[in]  epsilon       (Optional) Batch normalization layer epsilon parameter. Defaults to 0.001f.
-     * @param[in]  fbn_type      (Optional) Fused batch normalization type. Defaults to CONVOLUTION.
-     */
-    void configure(const ICLTensor *input_weights, const ICLTensor *bn_mean, const ICLTensor *bn_var, ICLTensor *fused_weights, ICLTensor *fused_bias,
-                   const ICLTensor *input_bias = nullptr, const ICLTensor *bn_beta = nullptr, const ICLTensor *bn_gamma = nullptr,
-                   float epsilon = 0.001f, FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION);
-    /** Set the source, destination of the kernel
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input_weights   Input weights tensor for convolution or depthwise convolution layer. Data type supported: F16/F32. Data layout supported: NCHW, NHWC
-     * @param[in]  bn_mean         Batch normalization layer mean tensor. Same as @p input_weights
-     * @param[in]  bn_var          Batch normalization layer variance tensor. Same as @p input_weights
-     * @param[out] fused_weights   Output fused weights tensor. It can be a nullptr in case of in-place computation. Same as @p input_weights
-     * @param[out] fused_bias      Output fused bias tensor. It can be a nullptr in case of in-place computation and input_bias != nullptr. Same as @p input_weights
-     * @param[in]  input_bias      (Optional) Input bias tensor for convolution or depthwise convolution layer. It can be a nullptr in case the bias tensor is not required. Same as @p input_weights
-     * @param[in]  bn_beta         (Optional) Batch normalization layer beta tensor. It can be a nullptr in case the beta tensor is not required. Same as @p input_weights
-     *                             @note if nullptr, bn_beta is set to 0.0
-     * @param[in]  bn_gamma        (Optional) Batch normalization layer gamma tensor. It can be a nullptr in case the gamma tensor is not required. Same as @p input_weights
-     *                             @note if nullptr, bn_gamma is set to 1.0
-     * @param[in]  epsilon         (Optional) Batch normalization layer epsilon parameter. Defaults to 0.001f.
-     * @param[in]  fbn_type        (Optional) Fused batch normalization type. Defaults to CONVOLUTION.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input_weights, const ICLTensor *bn_mean, const ICLTensor *bn_var, ICLTensor *fused_weights, ICLTensor *fused_bias,
-                   const ICLTensor *input_bias = nullptr, const ICLTensor *bn_beta = nullptr, const ICLTensor *bn_gamma = nullptr,
-                   float epsilon = 0.001f, FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLFuseBatchNormalizationKernel
-     *
-     * @param[in] input_weights Input weights tensor info for convolution or depthwise convolution layer. Data type supported: F16/F32. Data layout supported: NCHW, NHWC
-     * @param[in] bn_mean       Batch normalization layer mean tensor info. Same as @p input_weights
-     * @param[in] bn_var        Batch normalization layer variance tensor info. Same as @p input_weights
-     * @param[in] fused_weights Output fused weights tensor info. It can be a nullptr in case of in-place computation. Same as @p input_weights
-     * @param[in] fused_bias    Output fused bias tensor info. It can be a nullptr in case of in-place computation and input_bias != nullptr. Same as @p input_weights
-     * @param[in] input_bias    (Optional) Input bias tensor info for convolution or depthwise convolution layer. It can be a nullptr in case the bias tensor is not required. Same as @p input_weights
-     * @param[in] bn_beta       (Optional) Batch normalization layer beta tensor info. It can be a nullptr in case the beta tensor is not required. Same as @p input_weights
-     *                          @note if nullptr, bn_beta is set to 0.0
-     * @param[in] bn_gamma      (Optional) Batch normalization layer gamma tensor info. It can be a nullptr in case the gamma tensor is not required. Same as @p input_weights
-     *                          @note if nullptr, bn_gamma is set to 1.0
-     * @param[in] epsilon       (Optional) Batch normalization layer epsilon parameter. Defaults to 0.001f.
-     * @param[in] fbn_type      (Optional) Fused batch normalization type. Defaults to CONVOLUTION.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var,
-                           const ITensorInfo *fused_weights, const ITensorInfo *fused_bias,
-                           const ITensorInfo *input_bias = nullptr, const ITensorInfo *bn_beta = nullptr, const ITensorInfo *bn_gamma = nullptr,
-                           float epsilon = 0.001f, FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_input_weights;
-    const ICLTensor *_input_bias;
-    const ICLTensor *_bn_mean;
-    const ICLTensor *_bn_var;
-    const ICLTensor *_bn_gamma;
-    const ICLTensor *_bn_beta;
-    ICLTensor       *_fused_weights;
-    ICLTensor       *_fused_bias;
-    float            _epsilon;
-    bool             _run_in_place_weights;
-    bool             _run_in_place_bias;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLFUSEBATCHNORMALIZATIONKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h b/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h
deleted file mode 100644
index 15fd20842e..0000000000
--- a/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h
+++ /dev/null
@@ -1,108 +0,0 @@
-/*
- * Copyright (c) 2019-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYNATIVEKERNEL_H
-#define ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYNATIVEKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel to multiply matrices with QASYMM8/QASYMM8_SIGNED data type */
-class CLGEMMLowpMatrixMultiplyNativeKernel : public ICLKernel
-{
-public:
-    /** Default Constructor */
-    CLGEMMLowpMatrixMultiplyNativeKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLGEMMLowpMatrixMultiplyNativeKernel(const CLGEMMLowpMatrixMultiplyNativeKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLGEMMLowpMatrixMultiplyNativeKernel &operator=(const CLGEMMLowpMatrixMultiplyNativeKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLGEMMLowpMatrixMultiplyNativeKernel(CLGEMMLowpMatrixMultiplyNativeKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLGEMMLowpMatrixMultiplyNativeKernel &operator=(CLGEMMLowpMatrixMultiplyNativeKernel &&) = default;
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  input0    Input tensor containing the LHS matrix. Data type supported: QASYMM8/QASYMM8_SIGNED
-     * @param[in]  input1    Input tensor containing the RHS matrix. Data type supported: same as @p input0
-     * @param[out] output    Output tensor to store the result of matrix multiplication. Data type supported: S32
-     * @param[in]  lhs_info  LHS matrix information used to retrieve the number of rows to be processed by each thread
-     *                       lhs_info.m0: 2,3,4,5,6,7,8
-     *                       lhs_info.k0: 2,3,4,8,16
-     * @param[in]  rhs_info  RHS matrix information used to retrieve the number of columns to be processed by each thread
-     *                       rhs_info.n0: 2,3,4,8,16
-     *                       rhs_info.k0: same as lhs_info.k0
-     * @param[in]  gemm_info GEMM information used to retrieve the original dimensions of the input matrices
-     */
-    void configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info);
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input0          Input tensor containing the LHS matrix. Data type supported: QASYMM8/QASYMM8_SIGNED
-     * @param[in]  input1          Input tensor containing the RHS matrix. Data type supported: same as @p input0
-     * @param[out] output          Output tensor to store the result of matrix multiplication. Data type supported: S32
-     * @param[in]  lhs_info        LHS matrix information used to retrieve the number of rows to be processed by each thread
-     *                             lhs_info.m0: 2,3,4,5,6,7,8
-     *                             lhs_info.k0: 2,3,4,8,16
-     * @param[in]  rhs_info        RHS matrix information used to retrieve the number of columns to be processed by each thread
-     *                             rhs_info.n0: 2,3,4,8,16
-     *                             rhs_info.k0: same as lhs_info.k0
-     * @param[in]  gemm_info       GEMM information used to retrieve the original dimensions of the input matrices
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info,
-                   const GEMMReshapeInfo &gemm_info);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpMatrixMultiplyNativeKernel
-     *
-     * @param[in] input0    Input tensor info for the LHS matrix. Data type supported: QASYMM8/QASYMM8_SIGNED
-     * @param[in] input1    Input tensor info for the RHS matrix. Data type supported: same as @p input0
-     * @param[in] output    Output tensor info. Data type supported: S32
-     * @param[in] lhs_info  LHS matrix information used to retrieve the number of rows to be processed by each thread
-     *                      lhs_info.m0: 2,3,4,5,6,7,8
-     *                      lhs_info.k0: 2,3,4,8,16
-     * @param[in] rhs_info  RHS matrix information used to retrieve the number of columns to be processed by each thread
-     *                      rhs_info.n0: 2,3,4,8,16
-     *                      rhs_info.k0: same as lhs_info.k0
-     * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info,
-                           const GEMMReshapeInfo &gemm_info);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_input0;
-    const ICLTensor *_input1;
-    ICLTensor       *_output;
-    bool             _slide_matrix_b;
-    bool             _reinterpret_input_as_3d;
-    bool             _reinterpret_output_as_3d;
-    bool             _use_dummy_work_items;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYNATIVEKERNEL_H*/
diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.h b/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.h
deleted file mode 100644
index 43526b7c41..0000000000
--- a/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.h
+++ /dev/null
@@ -1,123 +0,0 @@
-/*
- * Copyright (c) 2019-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYRESHAPEDKERNEL_H
-#define ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYRESHAPEDKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel to multiply matrices when both the input matrices LHS (input0) and RHS (input1) have been reshaped
- *
- * @note The input matrices @p input0 and @p input1 must be reshaped through @ref CLGEMMReshapeLHSMatrixKernel and  @ref CLGEMMReshapeRHSMatrixKernel
- */
-class CLGEMMLowpMatrixMultiplyReshapedKernel : public ICLKernel
-{
-public:
-    /** Default Constructor */
-    CLGEMMLowpMatrixMultiplyReshapedKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLGEMMLowpMatrixMultiplyReshapedKernel(const CLGEMMLowpMatrixMultiplyReshapedKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLGEMMLowpMatrixMultiplyReshapedKernel &operator=(const CLGEMMLowpMatrixMultiplyReshapedKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLGEMMLowpMatrixMultiplyReshapedKernel(CLGEMMLowpMatrixMultiplyReshapedKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLGEMMLowpMatrixMultiplyReshapedKernel &operator=(CLGEMMLowpMatrixMultiplyReshapedKernel &&) = default;
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  input0    Input tensor containing the LHS reshaped matrix. Data type supported: QASYMM8/QASYMM8_SIGNED. The number of dimensions for the LHS matrix must be less or equal than 4.
-     * @param[in]  input1    Input tensor containing the RHS reshaped matrix. Data type supported: same as @p input0. The number of dimensions for the RHS matrix must be less or equal than 3.
-     * @param[out] output    Output tensor to store the result of matrix multiplication. Data type supported: S32
-     * @param[in]  lhs_info  LHS matrix information used for reshaping the input0 tensor.  Only the following values are supported:
-     *                       lhs_info.m0: 2,3,4,5,6,7,8
-     *                       lhs_info.k0: 2,3,4,8,16
-     *                       lhs_info.transpose: false
-     * @param[in]  rhs_info  RHS matrix information used for reshaping the input1 tensor.  Only the following values are supported:
-     *                       rhs_info.n0: 2,3,4,8,16
-     *                       rhs_info.k0: same as lhs_info.k0
-     *                       rhs_info.transpose: true
-     * @param[in]  gemm_info GEMM information used to retrieve the original dimensions of the input matrices
-     *
-     * @note lhs_info.k0 must be equal to rhs_info.k0
-     */
-    void configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info);
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input0          Input tensor containing the LHS reshaped matrix. Data type supported: QASYMM8/QASYMM8_SIGNED. The number of dimensions for the LHS matrix must be less or equal than 4.
-     * @param[in]  input1          Input tensor containing the RHS reshaped matrix. Data type supported: same as @p input0. The number of dimensions for the RHS matrix must be less or equal than 3.
-     * @param[out] output          Output tensor to store the result of matrix multiplication. Data type supported: S32
-     * @param[in]  lhs_info        LHS matrix information used for reshaping the input0 tensor.  Only the following values are supported:
-     *                             lhs_info.m0: 2,3,4,5,6,7,8
-     *                             lhs_info.k0: 2,3,4,8,16
-     *                             lhs_info.transpose: false
-     * @param[in]  rhs_info        RHS matrix information used for reshaping the input1 tensor.  Only the following values are supported:
-     *                             rhs_info.n0: 2,3,4,8,16
-     *                             rhs_info.k0: same as lhs_info.k0
-     *                             rhs_info.transpose: true
-     * @param[in]  gemm_info       GEMM information used to retrieve the original dimensions of the input matrices
-     *
-     * @note lhs_info.k0 must be equal to rhs_info.k0
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info,
-                   const GEMMReshapeInfo &gemm_info);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpMatrixMultiplyReshapedKernel
-     *
-     * @param[in] input0    Input tensor info containing the LHS reshaped matrix. Data type supported: QASYMM8/QASYMM8_SIGNED. The number of dimensions for the LHS matrix must be less or equal than 4.
-     * @param[in] input1    Input tensor info containing the RHS reshaped matrix. Data type supported: same as @p input0. The number of dimensions for the RHS matrix must be less or equal than 3.
-     * @param[in] output    Output tensor info. Data type supported: S32
-     * @param[in] lhs_info  LHS matrix information used for reshaping the input0 tensor.  Only the following values are supported:
-     *                      lhs_info.m0: 2,3,4,5,6,7,8
-     *                      lhs_info.k0: 2,3,4,8,16
-     *                      lhs_info.transpose: false
-     * @param[in] rhs_info  RHS matrix information used for reshaping the input1 tensor.  Only the following values are supported:
-     *                      rhs_info.n0: 2,3,4,8,16
-     *                      rhs_info.k0: 2,3,4,8,16
-     *                      rhs_info.transpose: true
-     * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices
-     *
-     * @note lhs_info.k0 must be equal to rhs_info.k0
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info,
-                           const GEMMReshapeInfo &gemm_info);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_input0;
-    const ICLTensor *_input1;
-    ICLTensor       *_output;
-    bool             _slide_matrix_b;
-    bool             _reinterpret_output_as_3d;
-    unsigned int     _k;
-    bool             _use_dummy_work_items;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYRESHAPEDKERNEL_H*/
diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h b/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h
deleted file mode 100644
index 1aba6c0398..0000000000
--- a/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h
+++ /dev/null
@@ -1,155 +0,0 @@
-/*
- * Copyright (c) 2019-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYRESHAPEDONLYRHSKERNEL_H
-#define ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYRESHAPEDONLYRHSKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/core/KernelDescriptors.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel to multiply matrices with QASYMM8 data type when only the input matrix RHS (input1) has been reshaped
- *
- * @note The input matrix input1 must be reshaped through @ref CLGEMMReshapeRHSMatrixKernel
- * @note For fused output stage, only GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT type is supported
- */
-class CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel : public ICLKernel
-{
-public:
-    /** Default Constructor */
-    CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel(const CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel &operator=(const CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel(CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel &operator=(CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel &&) = default;
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  input0             Input tensor containing the LHS matrix. Data type supported: QASYMM8/QASYMM8_SIGNED
-     * @param[in]  input1             Input tensor containing the RHS reshaped matrix. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL
-     * @param[out] output             Output tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/S32.
-     * @param[in]  gemm_info          GEMM information used to retrieve the original dimensions of the input matrices, output stage information and RHS/LHS info.
-     *                                Only the following values are supported for LHS info:
-     *                                lhs_info.m0: 2,3,4,5,6,7,8
-     *                                lhs_info.k0: 2,3,4,8,16
-     *                                Only the following values are supported for RHS info:
-     *                                rhs_info.n0: 2,3,4,8,16
-     *                                rhs_info.k0: same as lhs_info.k0
-     *                                rhs_info.transpose: true
-     * @param[in]  vector_sum_col     (Optional) Input row-vector of sums of all the entries in each column of matrix B.
-     *                                Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: S32
-     * @param[in]  vector_sum_row     (Optional) Input row-vector of sums of all the entries in each row of matrix A.
-     *                                Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: S32
-     * @param[in]  bias               (Optional) Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
-     *                                Biases are 1D tensor with dimensions [OFM]. Data type supported: S32.
-     * @param[in]  output_multipliers (Optional) Output multipliers tensor. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM).
-     *                                Supported data types: S32.
-     * @param[in]  output_shifts      (Optional) Output shifts tensor. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM).
-     *                                Supported data types: S32.
-     */
-    void configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, const GEMMKernelInfo &gemm_info, const ICLTensor *vector_sum_col = nullptr,
-                   const ICLTensor *vector_sum_row = nullptr, const ICLTensor *bias = nullptr, const ICLTensor *output_multipliers = nullptr, const ICLTensor *output_shifts = nullptr);
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  compile_context    The compile context to be used.
-     * @param[in]  input0             Input tensor containing the LHS matrix. Data type supported: QASYMM8/QASYMM8_SIGNED
-     * @param[in]  input1             Input tensor containing the RHS reshaped matrix. Data type supported: same as @p input0
-     * @param[out] output             Output tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/S32.
-     * @param[in]  gemm_info          GEMM information used to retrieve the original dimensions of the input matrices, output stage information and RHS/LHS info.
-     *                                Only the following values are supported for LHS info:
-     *                                lhs_info.m0: 2,3,4,5,6,7,8
-     *                                lhs_info.k0: 2,3,4,8,16
-     *                                Only the following values are supported for RHS info:
-     *                                rhs_info.n0: 2,3,4,8,16
-     *                                rhs_info.k0: same as lhs_info.k0
-     *                                rhs_info.transpose: true
-     * @param[in]  vector_sum_col     (Optional) Input row-vector of sums of all the entries in each column of matrix B.
-     *                                Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: S32
-     * @param[in]  vector_sum_row     (Optional) Input row-vector of sums of all the entries in each row of matrix A.
-     *                                Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: S32
-     * @param[in]  bias               (Optional) Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
-     *                                Biases are 1D tensor with dimensions [OFM]. Data type supported: S32.
-     * @param[in]  output_multipliers (Optional) Output multipliers tensor. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM).
-     *                                Supported data types: S32.
-     * @param[in]  output_shifts      (Optional) Output shifts tensor. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM).
-     *                                Supported data types: S32.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, const GEMMKernelInfo &gemm_info, const ICLTensor *vector_sum_col = nullptr,
-                   const ICLTensor *vector_sum_row = nullptr, const ICLTensor *bias = nullptr, const ICLTensor *output_multipliers = nullptr, const ICLTensor *output_shifts = nullptr);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel
-     *
-     * @param[in] input0             Input tensor info for the LHS matrix. Data type supported: QASYMM8/QASYMM8_SIGNED
-     * @param[in] input1             Input tensor info for the RHS reshaped matrix. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL
-     * @param[in] output             Output tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/S32.
-     * @param[in] gemm_info          GEMM information used to retrieve the original dimensions of the input matrices, output stage information and RHS/LHS info.
-     *                               Only the following values are supported for LHS info:
-     *                               lhs_info.m0: 2,3,4,5,6,7,8
-     *                               lhs_info.k0: 2,3,4,8,16
-     *                               Only the following values are supported for RHS info:
-     *                               rhs_info.n0: 2,3,4,8,16
-     *                               rhs_info.k0: same as lhs_info.k0
-     *                               rhs_info.transpose: true
-     * @param[in] vector_sum_col     (Optional) Input row-vector info of sums of all the entries in each column of matrix B.
-     *                               Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: S32
-     * @param[in] vector_sum_row     (Optional) Input row-vector info of sums of all the entries in each row of matrix A.
-     *                               Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: S32
-     * @param[in] bias               (Optional) Biases tensor info. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
-     *                               Biases are 1D tensor with dimensions [OFM]. Data type supported: S32.
-     * @param[in] output_multipliers (Optional) Output multipliers tensor info. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM).
-     *                               Supported data types: S32.
-     * @param[in] output_shifts      (Optional) Output shifts tensor info. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM).
-     *                               Supported data types: S32.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, const GEMMKernelInfo &gemm_info, const ITensorInfo *vector_sum_col = nullptr,
-                           const ITensorInfo *vector_sum_row = nullptr, const ITensorInfo *bias = nullptr, const ITensorInfo *output_multipliers = nullptr,
-                           const ITensorInfo *output_shifts = nullptr);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_input0;
-    const ICLTensor *_input1;
-    ICLTensor       *_output;
-    const ICLTensor *_vector_sum_col;
-    const ICLTensor *_vector_sum_row;
-    const ICLTensor *_bias;
-    const ICLTensor *_output_multipliers;
-    const ICLTensor *_output_shifts;
-    bool             _slide_matrix_b;
-    bool             _reinterpret_input_as_3d;
-    bool             _reinterpret_output_as_3d;
-    bool             _use_dummy_work_items;
-    bool             _is_quantized_per_channel;
-    bool             _fuse_output_stage;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYRESHAPEDONLYRHSKERNEL_H */
-\ No newline at end of file
diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h b/arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h
deleted file mode 100644
index bc982c6120..0000000000
--- a/arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h
+++ /dev/null
@@ -1,116 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGEMMLOWPOFFSETCONTRIBUTIONKERNEL_H
-#define ARM_COMPUTE_CLGEMMLOWPOFFSETCONTRIBUTIONKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel used to add the offset contribution after the matrix multiplication. The computation is performed in-place
- *
- * This kernel takes a final int32 accumulator value (the output of the matrix multiplication),
- * and adds to it the offset contribution of matrix A and matrix B in-place.
- *
- * The final result is:
- *
- * mm_result[i][k] = mm_result[i][k] +
- *                   (vector_sum_col[k] * a_offset) +
- *                   (vector_sum_row[i] * b_offset) +
- *                   (a_offset * b_offset * k)
- *
- */
-class CLGEMMLowpOffsetContributionKernel : public ICLKernel
-{
-public:
-    /** Constructor */
-    CLGEMMLowpOffsetContributionKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers)*/
-    CLGEMMLowpOffsetContributionKernel(const CLGEMMLowpOffsetContributionKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers)*/
-    CLGEMMLowpOffsetContributionKernel &operator=(const CLGEMMLowpOffsetContributionKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLGEMMLowpOffsetContributionKernel(CLGEMMLowpOffsetContributionKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLGEMMLowpOffsetContributionKernel &operator=(CLGEMMLowpOffsetContributionKernel &&) = default;
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in, out] mm_result      Input tensor containing the result of the matrix multiplication. Data type supported: S32
-     * @param[in]      vector_sum_col Input row-vector of sums of all the entries in each column of matrix B.
-     *                                Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result
-     * @param[in]      vector_sum_row Input row-vector of sums of all the entries in each row of matrix A.
-     *                                Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result
-     * @param[in]      bias           Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
-     *                                Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
-     * @param[in]      k              Number of matrix A columns or Matrix B rows
-     * @param[in]      a_offset       Offset to be added to each element of the matrix A.
-     * @param[in]      b_offset       Offset to be added to each element of the matrix B.
-     */
-    void configure(ICLTensor *mm_result, const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, const ICLTensor *bias, int32_t k, int32_t a_offset, int32_t b_offset);
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]      compile_context The compile context to be used.
-     * @param[in, out] mm_result       Input tensor containing the result of the matrix multiplication. Data type supported: S32
-     * @param[in]      vector_sum_col  Input row-vector of sums of all the entries in each column of matrix B.
-     *                                 Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result
-     * @param[in]      vector_sum_row  Input row-vector of sums of all the entries in each row of matrix A.
-     *                                 Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result
-     * @param[in]      bias            Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
-     *                                 Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
-     * @param[in]      k               Number of matrix A columns or Matrix B rows
-     * @param[in]      a_offset        Offset to be added to each element of the matrix A.
-     * @param[in]      b_offset        Offset to be added to each element of the matrix B.
-     */
-    void configure(const CLCompileContext &compile_context, ICLTensor *mm_result, const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, const ICLTensor *bias, int32_t k, int32_t a_offset,
-                   int32_t b_offset);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpOffsetContributionKernel
-     *
-     * @param[in] mm_result      Input tensor containing the result of @ref CLGEMMLowpOffsetContributionKernel. Data type supported: S32
-     * @param[in] vector_sum_col Input row-vector of sums of all the entries in each column of matrix B.
-     *                           Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result
-     * @param[in] vector_sum_row Input row-vector of sums of all the entries in each row of matrix A.
-     *                           Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result
-     * @param[in] bias           Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
-     *                           Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
-     * @param[in] a_offset       Offset to be added to each element of the matrix A.
-     * @param[in] b_offset       Offset to be added to each element of the matrix B.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, int32_t a_offset, int32_t b_offset);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_vector_sum_col;
-    const ICLTensor *_vector_sum_row;
-    ICLTensor       *_mm_result;
-    const ICLTensor *_bias;
-};
-} // namespace arm_compute
-
-#endif /* ARM_COMPUTE_CLGEMMLOWPOFFSETCONTRIBUTIONKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h b/arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h
deleted file mode 100644
index 583b388d45..0000000000
--- a/arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h
+++ /dev/null
@@ -1,135 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGEMMLOWPOFFSETCONTRIBUTIONOUTPUTSTAGEKERNEL_H
-#define ARM_COMPUTE_CLGEMMLOWPOFFSETCONTRIBUTIONOUTPUTSTAGEKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel used to add the offset contribution after the matrix multiplication and perform the output stage.
- *
- * This kernel takes a final int32 accumulator value (the output of the matrix multiplication), adds to it the offset contribution
- * of matrix A and matrix B and performs the output stage defined by the output_stage argument
- *
- * @note For quantized computations the output data type for auto-initialization must be passed as part of the @ref GEMMLowpOutputStageInfo.
- */
-class CLGEMMLowpOffsetContributionOutputStageKernel : public ICLKernel
-{
-public:
-    /** Constructor */
-    CLGEMMLowpOffsetContributionOutputStageKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers)*/
-    CLGEMMLowpOffsetContributionOutputStageKernel(const CLGEMMLowpOffsetContributionOutputStageKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers)*/
-    CLGEMMLowpOffsetContributionOutputStageKernel &operator=(const CLGEMMLowpOffsetContributionOutputStageKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLGEMMLowpOffsetContributionOutputStageKernel(CLGEMMLowpOffsetContributionOutputStageKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLGEMMLowpOffsetContributionOutputStageKernel &operator=(CLGEMMLowpOffsetContributionOutputStageKernel &&) = default;
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  mm_result          Input tensor containing the result of the matrix multiplication. Data type supported: S32
-     * @param[in]  vector_sum_col     Input row-vector of sums of all the entries in each column of matrix B.
-     *                                Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result
-     * @param[in]  vector_sum_row     Input row-vector of sums of all the entries in each row of matrix A.
-     *                                Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result
-     * @param[in]  bias               Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
-     *                                Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
-     * @param[out] output             Output tensor. Data type supported: QASYMM8/QASYMM8_SIGNED.
-     * @param[in]  k                  Number of matrix A columns or Matrix B rows
-     * @param[in]  a_offset           Offset to be added to each element of the matrix A.
-     * @param[in]  b_offset           Offset to be added to each element of the matrix B.
-     * @param[in]  output_stage       GEMMLowp output stage info
-     * @param[in]  output_multipliers Output multipliers tensor. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM).
-     *                                Supported data types: S32
-     * @param[in]  output_shifts      Output shifts tensor. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM).
-     *                                Supported data types: S32
-     */
-    void configure(const ICLTensor *mm_result, const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, const ICLTensor *bias, ICLTensor *output, int32_t k, int32_t a_offset, int32_t b_offset,
-                   const GEMMLowpOutputStageInfo &output_stage, const ICLTensor *output_multipliers, const ICLTensor *output_shifts);
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  compile_context    The compile context to be used.
-     * @param[in]  mm_result          Input tensor containing the result of the matrix multiplication. Data type supported: S32
-     * @param[in]  vector_sum_col     Input row-vector of sums of all the entries in each column of matrix B.
-     *                                Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result
-     * @param[in]  vector_sum_row     Input row-vector of sums of all the entries in each row of matrix A.
-     *                                Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result
-     * @param[in]  bias               Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
-     *                                Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
-     * @param[out] output             Output tensor. Data type supported: QASYMM8/QASYMM8_SIGNED.
-     * @param[in]  k                  Number of matrix A columns or Matrix B rows
-     * @param[in]  a_offset           Offset to be added to each element of the matrix A.
-     * @param[in]  b_offset           Offset to be added to each element of the matrix B.
-     * @param[in]  output_stage       GEMMLowp output stage info
-     * @param[in]  output_multipliers Output multipliers tensor. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM).
-     *                                Supported data types: S32
-     * @param[in]  output_shifts      Output shifts tensor. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM).
-     *                                Supported data types: S32
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *mm_result, const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, const ICLTensor *bias, ICLTensor *output, int32_t k,
-                   int32_t a_offset, int32_t b_offset,
-                   const GEMMLowpOutputStageInfo &output_stage, const ICLTensor *output_multipliers, const ICLTensor *output_shifts);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpOffsetContributionKernel
-     *
-     * @param[in] mm_result          Input tensor containing the result of @ref CLGEMMLowpOffsetContributionKernel. Data type supported: S32
-     * @param[in] vector_sum_col     Input row-vector of sums of all the entries in each column of matrix B.
-     *                               Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result
-     * @param[in] vector_sum_row     Input row-vector of sums of all the entries in each row of matrix A.
-     *                               Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result
-     * @param[in] bias               Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
-     *                               Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
-     * @param[in] output             Output tensor. Data type supported: QASYMM8/QASYMM8_SIGNED.
-     * @param[in] a_offset           Offset to be added to each element of the matrix A.
-     * @param[in] b_offset           Offset to be added to each element of the matrix B.
-     * @param[in] output_stage       GEMMLowp output stage info
-     * @param[in] output_multipliers Output multipliers tensor info. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM).
-     *                               Supported data types: S32
-     * @param[in] output_shifts      Output shifts tensor info. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM).
-     *                               Supported data types: S32
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, const ITensorInfo *output, int32_t a_offset,
-                           int32_t b_offset, const GEMMLowpOutputStageInfo &output_stage, const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_mm_result;
-    const ICLTensor *_vector_sum_col;
-    const ICLTensor *_vector_sum_row;
-    const ICLTensor *_bias;
-    ICLTensor       *_output;
-    const ICLTensor *_output_multipliers;
-    const ICLTensor *_output_shifts;
-    bool             _is_quantized_per_channel;
-};
-} // namespace arm_compute
-
-#endif /* ARM_COMPUTE_CLGEMMLOWPOFFSETCONTRIBUTIONOUTPUTSTAGEKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.h b/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.h
deleted file mode 100644
index 1e9fde8376..0000000000
--- a/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.h
+++ /dev/null
@@ -1,101 +0,0 @@
-/*
- * Copyright (c) 2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32SCALEBYFLOATKERNEL_H
-#define ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32SCALEBYFLOATKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-// Forward declarations
-class ICLTensor;
-
-/** OpenCL kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8/QASYMM8_SIGNED
- *
- * This kernel takes a final int32 accumulator value (the output of the matrix multiplication), and processes it to obtain the final QASYMM8/QASYMM8_SIGNED value.
- * The following computations will be performed by the kernel:
- *
- *  -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier
- *  -# Add bias to final result if bias tensor is not a nullptr
- *  -# Requantize
- *  -# Add offset to each result
- *  -# Clamp the value between the specified min and max bounds
- *  -# Clamp the resulting int32 values to
- *      - to the [0..255] range and cast to QASYMM8.
- *      - to the [-128..127] range and cast to QASYMM8_SIGNED.
- */
-class CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel : public ICLKernel
-{
-public:
-    /** Constructor */
-    CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers)*/
-    CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel(const CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers)*/
-    CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel &operator=(const CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel(CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel &operator=(CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel &&) = default;
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  input  Input tensor. Data type supported: S32
-     * @param[in]  bias   Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
-     *                    Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
-     * @param[out] output Output tensor. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED
-     * @param[in]  info   Output stage info. Used to pass the quantized output data type
-     */
-    void configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const GEMMLowpOutputStageInfo *info);
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Input tensor. Data type supported: S32
-     * @param[in]  bias            Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
-     *                             Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
-     * @param[out] output          Output tensor. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED
-     * @param[in]  info            Output stage info. Used to pass the quantized output data type
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const GEMMLowpOutputStageInfo *info);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel
-     *
-     * @param[in] input  Input tensor. Data type supported: S32
-     * @param[in] bias   Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
-     *                   Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
-     * @param[in] output Output tensor. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED
-     * @param[in] info   Output stage info. Used to pass the quantized output data type
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo *info);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_input;
-    const ICLTensor *_bias;
-    ICLTensor       *_output;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32SCALEBYFLOATKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.h b/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.h
deleted file mode 100644
index 766ef9a820..0000000000
--- a/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.h
+++ /dev/null
@@ -1,103 +0,0 @@
-/*
- * Copyright (c) 2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32SCALEKERNEL_H
-#define ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32SCALEKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8/QASYMM8_SIGNED
- *
- * This kernel takes a final int32 accumulator value (the output of the matrix multiplication), and processes it to obtain the final QASYMM8/QASYMM8_SIGNED value.
- * The following computations will be performed by the kernel:
- *
- *  -# Add offset terms to final result
- *  -# Multiply each entry of result by result_mult_int
- *  -# Add bias to final result if bias tensor is not a nullptr
- *  -# Shift the int32 accumulator by result_shift
- *  -# Clamp the value between the specified min and max bounds
- *  -# Clamp the resulting int32 values:
- *  -#  -to the [0..255] range and cast to QASYMM8.
- *  -#  -to the [-128..127] range and cast to QASYMM8/SIGNED.
- *
- */
-class CLGEMMLowpQuantizeDownInt32ScaleKernel : public ICLKernel
-{
-public:
-    /** Constructor */
-    CLGEMMLowpQuantizeDownInt32ScaleKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers)*/
-    CLGEMMLowpQuantizeDownInt32ScaleKernel(const CLGEMMLowpQuantizeDownInt32ScaleKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers)*/
-    CLGEMMLowpQuantizeDownInt32ScaleKernel &operator=(const CLGEMMLowpQuantizeDownInt32ScaleKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLGEMMLowpQuantizeDownInt32ScaleKernel(CLGEMMLowpQuantizeDownInt32ScaleKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLGEMMLowpQuantizeDownInt32ScaleKernel &operator=(CLGEMMLowpQuantizeDownInt32ScaleKernel &&) = default;
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  input        Input tensor. Data type supported: S32
-     * @param[in]  bias         Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
-     *                          Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
-     * @param[out] output       Output tensor. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED
-     * @param[in]  output_stage GEMMLowp output stage metadata.
-     */
-    void configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const GEMMLowpOutputStageInfo *output_stage);
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Input tensor. Data type supported: S32
-     * @param[in]  bias            Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
-     *                             Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
-     * @param[out] output          Output tensor. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED
-     * @param[in]  output_stage    GEMMLowp output stage metadata.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const GEMMLowpOutputStageInfo *output_stage);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpQuantizeDownInt32ScaleKernel
-     *
-     * @param[in] input        Input tensor. Data type supported: S32
-     * @param[in] bias         Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
-     *                         Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
-     * @param[in] output       Output tensor. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED
-     * @param[in] output_stage GEMMLowp output stage metadata.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo *output_stage);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor               *_input;
-    const ICLTensor               *_bias;
-    ICLTensor                     *_output;
-    const GEMMLowpOutputStageInfo *_output_stage;
-};
-} // namespace arm_compute
-
-#endif /* ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32SCALEKERNEL_H */
-\ No newline at end of file
diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h b/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h
deleted file mode 100644
index 6f58150037..0000000000
--- a/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h
+++ /dev/null
@@ -1,108 +0,0 @@
-/*
- * Copyright (c) 2019-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32TOINT16SCALEBYFIXEDPOINTKERNEL_H
-#define ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32TOINT16SCALEBYFIXEDPOINTKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** CL kernel used to quantize down the int32 accumulator values of GEMMLowp to QSYMM16
- *
- * This kernel takes a final int32 accumulator value (the output of the matrix multiplication), and processes it to obtain the final QSYMM16 value.
- * The following computations will be performed by the kernel:
- *
- *  -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier
- *  -# Add bias to final result if bias tensor is not a nullptr
- *  -# Round to nearest division by a power-of-two using result_shift
- *  -# Clamp the value between the specified min and max bounds
- *  -# Clamp the resulting int32 values to the [-32768, 32767] range and cast to QSYMM16.
- *
- */
-class CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel : public ICLKernel
-{
-public:
-    /** Constructor */
-    CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers)*/
-    CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel(const CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers)*/
-    CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel &operator=(const CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel(CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel &operator=(CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel &&) = default;
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  input                        Input tensor. Data type supported: S32
-     * @param[in]  bias                         Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
-     *                                          Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
-     * @param[out] output                       Output tensor. Data type supported: Data type supported: QSYMM16
-     * @param[in]  result_fixedpoint_multiplier Fixed point value to be multiplied to each element of the input matrix when once the result_offset has been add
-     * @param[in]  result_shift                 Integer value used to round to nearest division by a power-of-two the result after the fixed point multiplication
-     * @param[in]  min                          (Optional) Min value used to saturate down the output result before converting back to QSYMM16. Defaults to 0.
-     * @param[in]  max                          (Optional) Max value used to saturate up the output result before converting back to QSYMM16.
-     *                                          Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to 0.
-     */
-    void configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_fixedpoint_multiplier, int result_shift, int min = 0, int max = 0);
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  compile_context              The compile context to be used.
-     * @param[in]  input                        Input tensor. Data type supported: S32
-     * @param[in]  bias                         Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
-     *                                          Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
-     * @param[out] output                       Output tensor. Data type supported: Data type supported: QSYMM16
-     * @param[in]  result_fixedpoint_multiplier Fixed point value to be multiplied to each element of the input matrix when once the result_offset has been add
-     * @param[in]  result_shift                 Integer value used to round to nearest division by a power-of-two the result after the fixed point multiplication
-     * @param[in]  min                          (Optional) Min value used to saturate down the output result before converting back to QSYMM16. Defaults to 0.
-     * @param[in]  max                          (Optional) Max value used to saturate up the output result before converting back to QSYMM16.
-     *                                          Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to 0.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_fixedpoint_multiplier, int result_shift, int min = 0, int max = 0);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel
-     *
-     * @param[in] input  Input tensor info. Data type supported: S32
-     * @param[in] bias   Biases tensor info. Only shared biases supported and it can be a nullptr if the biases addition is not required.
-     *                   Biases are 1D tensor info with dimensions [OFM]. Data type supported: Same as @p input.
-     * @param[in] output Output tensor info. Data type supported: Data type supported: QSYMM16
-     * @param[in] min    (Optional) Min value used to saturate down the output result before converting back to QSYMM16. Defaults to 0.
-     * @param[in] max    (Optional) Max value used to saturate up the output result before converting back to QSYMM16,
-     *                            Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to 0.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min = 0, int max = 0);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_input;
-    const ICLTensor *_bias;
-    ICLTensor       *_output;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32TOINT16SCALEBYFIXEDPOINTKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h b/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h
deleted file mode 100644
index 0c237be34c..0000000000
--- a/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h
+++ /dev/null
@@ -1,112 +0,0 @@
-/*
- * Copyright (c) 2019-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32TOINT8SCALEBYFIXEDPOINTKERNEL_H
-#define ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32TOINT8SCALEBYFIXEDPOINTKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8_SIGNED
- *
- * This kernel takes a final int32 accumulator value (the output of the matrix multiplication), and processes it to obtain the final QASYMM8_SIGNED value.
- * The following computations will be performed by the kernel:
- *
- *  -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier
- *  -# Add bias to final result if bias tensor is not a nullptr
- *  -# Round to nearest division by a power-of-two using result_shift
- *  -# Add offset to each result
- *  -# Clamp the value between the specified min and max bounds
- *  -# Clamp the resulting int32 values to the [-128..127] range and cast to QASYMM8_SIGNED.
- */
-class CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel : public ICLKernel
-{
-public:
-    /** Constructor */
-    CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers)*/
-    CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel(const CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers)*/
-    CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel &operator=(const CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel(CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel &operator=(CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel &&) = default;
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  input                        Input tensor. Data type supported: S32
-     * @param[in]  bias                         Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
-     *                                          Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
-     * @param[out] output                       Output tensor. Data type supported: Data type supported: QASYMM8_SIGNED
-     * @param[in]  result_fixedpoint_multiplier Fixed point value to be multiplied to each element of the input matrix when once the result_offset has been add
-     * @param[in]  result_shift                 Integer value used to round to nearest division by a power-of-two the result after the fixed point multiplication
-     * @param[in]  result_offset_after_shift    Offset to be applied to result before converting it back to QASYMM8_SIGNED
-     * @param[in]  min                          (Optional) Min value used to saturate down the output result before converting back to QASYMM8_SIGNED. Defaults to 0
-     * @param[in]  max                          (Optional) Max value used to saturate up the output result before converting back to QASYMM8_SIGNED. Defaults to 0
-     *                                          Along with @p min, this value can be used to implement "rectified linear unit" activation functions
-     */
-    void configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift,
-                   int min = 0, int max = 0);
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  compile_context              The compile context to be used.
-     * @param[in]  input                        Input tensor. Data type supported: S32
-     * @param[in]  bias                         Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
-     *                                          Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
-     * @param[out] output                       Output tensor. Data type supported: Data type supported: QASYMM8_SIGNED
-     * @param[in]  result_fixedpoint_multiplier Fixed point value to be multiplied to each element of the input matrix when once the result_offset has been add
-     * @param[in]  result_shift                 Integer value used to round to nearest division by a power-of-two the result after the fixed point multiplication
-     * @param[in]  result_offset_after_shift    Offset to be applied to result before converting it back to QASYMM8_SIGNED
-     * @param[in]  min                          (Optional) Min value used to saturate down the output result before converting back to QASYMM8_SIGNED. Defaults to 0
-     * @param[in]  max                          (Optional) Max value used to saturate up the output result before converting back to QASYMM8_SIGNED. Defaults to 0
-     *                                          Along with @p min, this value can be used to implement "rectified linear unit" activation functions
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift,
-                   int min = 0, int max = 0);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel
-     *
-     * @param[in] input  Input tensor. Data type supported: S32
-     * @param[in] bias   Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
-     *                   Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
-     * @param[in] output Output tensor. Data type supported: Data type supported: QASYMM8_SIGNED
-     * @param[in] min    (Optional) Min value used to saturate down the output result before converting back to QASYMM8_SIGNED
-     * @param[in] max    (Optional) Max value used to saturate up the output result before converting back to QASYMM8_SIGNED,
-     *                            Along with @p min, this value can be used to implement "rectified linear unit" activation functions
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min = 0, int max = 0);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_input;
-    const ICLTensor *_bias;
-    ICLTensor       *_output;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32TOINT8SCALEBYFIXEDPOINTKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h b/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h
deleted file mode 100644
index cb3e12e34d..0000000000
--- a/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h
+++ /dev/null
@@ -1,112 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32TOUINT8SCALEBYFIXEDPOINTKERNEL_H
-#define ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32TOUINT8SCALEBYFIXEDPOINTKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8
- *
- * This kernel takes a final int32 accumulator value (the output of the matrix multiplication), and processes it to obtain the final QASYMM8 value.
- * The following computations will be performed by the kernel:
- *
- *  -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier
- *  -# Add bias to final result if bias tensor is not a nullptr
- *  -# Round to nearest division by a power-of-two using result_shift
- *  -# Add offset to each result
- *  -# Clamp the value between the specified min and max bounds
- *  -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8.
- */
-class CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel : public ICLKernel
-{
-public:
-    /** Constructor */
-    CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers)*/
-    CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel(const CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers)*/
-    CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel &operator=(const CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel(CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel &operator=(CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel &&) = default;
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  input                        Input tensor. Data type supported: S32
-     * @param[in]  bias                         Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
-     *                                          Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
-     * @param[out] output                       Output tensor. Data type supported: Data type supported: QASYMM8
-     * @param[in]  result_fixedpoint_multiplier Fixed point value to be multiplied to each element of the input matrix when once the result_offset has been add
-     * @param[in]  result_shift                 Integer value used to round to nearest division by a power-of-two the result after the fixed point multiplication
-     * @param[in]  result_offset_after_shift    Offset to be applied to result before converting it back to QASYMM8
-     * @param[in]  min                          (Optional) Min value used to saturate down the output result before converting back to QASYMM8
-     * @param[in]  max                          (Optional) Max value used to saturate up the output result before converting back to QASYMM8,
-     *                                          Along with @p min, this value can be used to implement "rectified linear unit" activation functions
-     */
-    void configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift,
-                   int min = 0, int max = 0);
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  compile_context              The compile context to be used.
-     * @param[in]  input                        Input tensor. Data type supported: S32
-     * @param[in]  bias                         Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
-     *                                          Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
-     * @param[out] output                       Output tensor. Data type supported: Data type supported: QASYMM8
-     * @param[in]  result_fixedpoint_multiplier Fixed point value to be multiplied to each element of the input matrix when once the result_offset has been add
-     * @param[in]  result_shift                 Integer value used to round to nearest division by a power-of-two the result after the fixed point multiplication
-     * @param[in]  result_offset_after_shift    Offset to be applied to result before converting it back to QASYMM8
-     * @param[in]  min                          (Optional) Min value used to saturate down the output result before converting back to QASYMM8
-     * @param[in]  max                          (Optional) Max value used to saturate up the output result before converting back to QASYMM8,
-     *                                          Along with @p min, this value can be used to implement "rectified linear unit" activation functions
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift,
-                   int min = 0, int max = 0);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel
-     *
-     * @param[in] input  Input tensor. Data type supported: S32
-     * @param[in] bias   Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
-     *                   Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
-     * @param[in] output Output tensor. Data type supported: Data type supported: QASYMM8
-     * @param[in] min    (Optional) Min value used to saturate down the output result before converting back to QASYMM8
-     * @param[in] max    (Optional) Max value used to saturate up the output result before converting back to QASYMM8,
-     *                            Along with @p min, this value can be used to implement "rectified linear unit" activation functions
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min = 0, int max = 0);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_input;
-    const ICLTensor *_bias;
-    ICLTensor       *_output;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32TOUINT8SCALEBYFIXEDPOINTKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpReductionKernel.h b/arm_compute/core/CL/kernels/CLGEMMLowpReductionKernel.h
deleted file mode 100644
index 857b1c7952..0000000000
--- a/arm_compute/core/CL/kernels/CLGEMMLowpReductionKernel.h
+++ /dev/null
@@ -1,176 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGEMMLOWREDUCTIONKERNEL_H
-#define ARM_COMPUTE_CLGEMMLOWREDUCTIONKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-struct GEMMLowpReductionKernelInfo;
-
-/** Common interface for all OpenCL reduction kernels */
-class ICLGEMMLowpReductionKernel : public ICLKernel
-{
-public:
-    /** Constructor */
-    ICLGEMMLowpReductionKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers)*/
-    ICLGEMMLowpReductionKernel(const ICLGEMMLowpReductionKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers)*/
-    ICLGEMMLowpReductionKernel &operator=(const ICLGEMMLowpReductionKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    ICLGEMMLowpReductionKernel(ICLGEMMLowpReductionKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    ICLGEMMLowpReductionKernel &operator=(ICLGEMMLowpReductionKernel &&) = default;
-
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  input  Input tensor. Data type supported: S8
-     * @param[out] output Output row-vector of sums of all the entries in each row/col of input tensor. Data type supported: S32
-     * @param[in]  info   Kernel metadata:
-     *                    - k            Number of matrix columns/rows depending on the type of reduction.
-     *                    - is_reshaped  True if the matrix has been reshaped.
-     *                    - scalar       Scalar value to multiply each reduced column/row by.
-     *                    - mul_byscalar True if each reduced column/row must be multiplied by a scalar value.
-     */
-    virtual void configure(const ICLTensor *input, ICLTensor *output, const GEMMLowpReductionKernelInfo &info) = 0;
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Input tensor. Data type supported: S8
-     * @param[out] output          Output row-vector of sums of all the entries in each row/col of input tensor. Data type supported: S32
-     * @param[in]  info            Kernel metadata:
-     *                             - k            Number of matrix columns/rows depending on the type of reduction.
-     *                             - is_reshaped  True if the matrix has been reshaped.
-     *                             - scalar       Scalar value to multiply each reduced column/row by.
-     *                             - mul_byscalar True if each reduced column/row must be multiplied by a scalar value.
-     */
-    virtual void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const GEMMLowpReductionKernelInfo &info) = 0;
-
-protected:
-    const ICLTensor *_input;
-    ICLTensor       *_output;
-};
-
-/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each row of Matrix A.
- *
- * @note This stage is needed to handle the offset of matrix product
- *       https://github.com/google/gemmlowp/blob/master/doc/low-precision.md
- */
-class CLGEMMLowpMatrixAReductionKernel : public ICLGEMMLowpReductionKernel
-{
-public:
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  mtx_a          Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED
-     * @param[out] vector_sum_row Output row-vector of sums of all the entries in each row of mtx_a. Data type supported: S32
-     * @param[in]  info           Kernel metadata:
-     *                            - k            Number of matrix columns/rows depending on the type of reduction.
-     *                            - is_reshaped  True if the matrix has been reshaped.
-     *                            - scalar       Scalar value to multiply each reduced column/row by.
-     *                            - mul_byscalar True if each reduced column/row must be multiplied by a scalar value.
-     */
-    void configure(const ICLTensor *mtx_a, ICLTensor *vector_sum_row, const GEMMLowpReductionKernelInfo &info) override;
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  mtx_a           Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED
-     * @param[out] vector_sum_row  Output row-vector of sums of all the entries in each row of mtx_a. Data type supported: S32
-     * @param[in]  info            Kernel metadata:
-     *                             - k            Number of matrix columns/rows depending on the type of reduction.
-     *                             - is_reshaped  True if the matrix has been reshaped.
-     *                             - scalar       Scalar value to multiply each reduced column/row by.
-     *                             - mul_byscalar True if each reduced column/row must be multiplied by a scalar value.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *mtx_a, ICLTensor *vector_sum_row, const GEMMLowpReductionKernelInfo &info) override;
-    /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpMatrixAReductionKernel
-     *
-     * @param[in] mtx_a          Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED
-     * @param[in] vector_sum_row Output row-vector of sums of all the entries in each row of mtx_a. Data type supported: S32
-     * @param[in] info           Kernel metadata:
-     *                           - k            Number of matrix columns/rows depending on the type of reduction.
-     *                           - is_reshaped  True if the matrix has been reshaped.
-     *                           - scalar       Scalar value to multiply each reduced column/row by.
-     *                           - mul_byscalar True if each reduced column/row must be multiplied by a scalar value.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *mtx_a, const ITensorInfo *vector_sum_row, const GEMMLowpReductionKernelInfo &info);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-};
-
-/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each column of Matrix B.
- *
- * @note This stage is needed to handle the offset of matrix product
- *       https://github.com/google/gemmlowp/blob/master/doc/low-precision.md
- */
-class CLGEMMLowpMatrixBReductionKernel : public ICLGEMMLowpReductionKernel
-{
-public:
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  mtx_b          Input tensor. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED
-     * @param[out] vector_sum_col Output row-vector of sums of all the entries in each column of mtx_b. Data type supported: S32
-     * @param[in]  info           Kernel metadata:
-     *                            - k            Number of matrix columns/rows depending on the type of reduction.
-     *                            - is_reshaped  True if the matrix has been reshaped.
-     *                            - scalar       Scalar value to multiply each reduced column/row by.
-     *                            - mul_byscalar True if each reduced column/row must be multiplied by a scalar value.
-     */
-    void configure(const ICLTensor *mtx_b, ICLTensor *vector_sum_col, const GEMMLowpReductionKernelInfo &info) override;
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  mtx_b           Input tensor. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED
-     * @param[out] vector_sum_col  Output row-vector of sums of all the entries in each column of mtx_b. Data type supported: S32
-     * @param[in]  info            Kernel metadata:
-     *                             - k            Number of matrix columns/rows depending on the type of reduction.
-     *                             - is_reshaped  True if the matrix has been reshaped.
-     *                             - scalar       Scalar value to multiply each reduced column/row by.
-     *                             - mul_byscalar True if each reduced column/row must be multiplied by a scalar value.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *mtx_b, ICLTensor *vector_sum_col, const GEMMLowpReductionKernelInfo &info) override;
-    /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpMatrixBReductionKernel
-     *
-     * @param[in] mtx_b          Input tensor. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED
-     * @param[in] vector_sum_col Output row-vector of sums of all the entries in each column of mtx_b. Data type supported: S32
-     * @param[in] info           Kernel metadata:
-     *                           - k            Number of matrix columns/rows depending on the type of reduction.
-     *                           - is_reshaped  True if the matrix has been reshaped.
-     *                           - scalar       Scalar value to multiply each reduced column/row by.
-     *                           - mul_byscalar True if each reduced column/row must be multiplied by a scalar value.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *mtx_b, const ITensorInfo *vector_sum_col, const GEMMLowpReductionKernelInfo &info);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLGEMMLOWREDUCTIONKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h b/arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h
deleted file mode 100644
index df2f6f4ad1..0000000000
--- a/arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h
+++ /dev/null
@@ -1,78 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGEMMMATRIXACCUMULATEBIASESKERNEL_H
-#define ARM_COMPUTE_CLGEMMMATRIXACCUMULATEBIASESKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-/** Interface to add a bias to each row of the input tensor
- *
- */
-class CLGEMMMatrixAccumulateBiasesKernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLGEMMMatrixAccumulateBiasesKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLGEMMMatrixAccumulateBiasesKernel(const CLGEMMMatrixAccumulateBiasesKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLGEMMMatrixAccumulateBiasesKernel &operator=(const CLGEMMMatrixAccumulateBiasesKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLGEMMMatrixAccumulateBiasesKernel(CLGEMMMatrixAccumulateBiasesKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLGEMMMatrixAccumulateBiasesKernel &operator=(CLGEMMMatrixAccumulateBiasesKernel &&) = default;
-    /** Set the accumulate buffer and the biases of the kernel.
-     *
-     * @param[in, out] accum  The accumulate tensor to convert. Data types supported: F16/F32
-     * @param[in]      biases The shared biases tensor to append. It must be 1D tensor. Data types supported: Same as @p input
-     */
-    void configure(ICLTensor *accum, const ICLTensor *biases);
-    /** Set the accumulate buffer and the biases of the kernel.
-     *
-     * @param[in]      compile_context The compile context to be used.
-     * @param[in, out] accum           The accumulate tensor to convert. Data types supported: F16/F32
-     * @param[in]      biases          The shared biases tensor to append. It must be 1D tensor. Data types supported: Same as @p input
-     */
-    void configure(const CLCompileContext &compile_context, ICLTensor *accum, const ICLTensor *biases);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMMatrixAccumulateBiasesKernel
-     *
-     * @param[in] accum      The accumulate tensor to convert. Data types supported: F16/F32
-     * @param[in] biases     The shared biases tensor to append. It must be 1D tensor. Data types supported: Same as @p input
-     * @param[in] gpu_target GPU target
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *accum, const ITensorInfo *biases, GPUTarget gpu_target);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    ICLTensor       *_accum;
-    const ICLTensor *_biases;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLGEMMMATRIXACCUMULATEBIASESKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h b/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h
deleted file mode 100644
index 6085b34bcb..0000000000
--- a/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h
+++ /dev/null
@@ -1,122 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGEMMMATRIXMULTIPLYKERNEL_H
-#define ARM_COMPUTE_CLGEMMMATRIXMULTIPLYKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel to multiply two input matrices "A" and "B" and add a martix "C" if provided. All elements of the output matrix will be multiplied by alpha. In case matrix C is passed, it will be added to the previous result.
- *  For the matrix C, the broadcast addition is supported if the flag "broadcast_bias" is set in the GEMMReshapeInfo object
- *
- * @note If the input tensors @p input0 and @p input1 have been reshaped respectively with @ref CLGEMMReshapeLHSMatrixKernel" and @ref CLGEMMReshapeRHSMatrixKernel,
- *       the flag @p is_interleaved_transposed must be set to true
- *
- * @attention @p input1 tensor must have at least 2 dimensions (matrix)
- *
- */
-class CLGEMMMatrixMultiplyKernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLGEMMMatrixMultiplyKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLGEMMMatrixMultiplyKernel(const CLGEMMMatrixMultiplyKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLGEMMMatrixMultiplyKernel &operator=(const CLGEMMMatrixMultiplyKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLGEMMMatrixMultiplyKernel(CLGEMMMatrixMultiplyKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLGEMMMatrixMultiplyKernel &operator=(CLGEMMMatrixMultiplyKernel &&) = default;
-    /** Initialise the kernel's input, output and alpha
-     *
-     * @param[in]  input0                    Input tensor containing the Matrix A. Data types supported: F16/F32
-     * @param[in]  input1                    Input tensor containing the Matrix B. Data type supported: same as @p input0
-     * @param[in]  input2                    Input tensor containing the Matrix C (bias). Can be nullptr. Data type supported: same as @p input0
-     * @param[out] output                    Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0
-     * @param[in]  alpha                     Weight of the matrix product
-     * @param[in]  beta                      (Optional) Weight of vector C. Default value is 0. Only beta = 1 is currently supported.
-     * @param[in]  is_interleaved_transposed (Optional) True if input0 and input1 have been reshaped respectively using @ref CLGEMMReshapeLHSMatrixKernel and @ref CLGEMMReshapeRHSMatrixKernel
-     * @param[in]  reshape_info              (Optional) GEMM reshape info. If is_interleaved_transposed = true, this object must contain the information to understand how the matrix A and matrix B have been reshaped
-     * @param[in]  fp_mixed_precision        (Optional) Use wider accumulators (32 bit instead of 16 for FP16) to improve accuracy
-     * @param[in]  activation_info           (Optional) Activation to apply after the matrix multiplication
-     *
-     */
-    void configure(const ICLTensor *input0, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float alpha, float beta = 0.f,
-                   bool is_interleaved_transposed = true, const GEMMReshapeInfo &reshape_info = GEMMReshapeInfo(), bool fp_mixed_precision = false, const ActivationLayerInfo &activation_info = ActivationLayerInfo());
-    /** Initialise the kernel's input, output and alpha
-     *
-     * @param[in]  compile_context           The compile context to be used.
-     * @param[in]  input0                    Input tensor containing the Matrix A. Data types supported: F16/F32
-     * @param[in]  input1                    Input tensor containing the Matrix B. Data type supported: same as @p input0
-     * @param[in]  input2                    Input tensor containing the Matrix C (bias). Can be nullptr. Data type supported: same as @p input0
-     * @param[out] output                    Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0
-     * @param[in]  alpha                     Weight of the matrix product
-     * @param[in]  beta                      (Optional) Weight of vector C. Default value is 0. Only beta = 1 is currently supported.
-     * @param[in]  is_interleaved_transposed (Optional) True if input0 and input1 have been reshaped respectively using @ref CLGEMMReshapeLHSMatrixKernel and @ref CLGEMMReshapeRHSMatrixKernel
-     * @param[in]  reshape_info              (Optional) GEMM reshape info. If is_interleaved_transposed = true, this object must contain the information to understand how the matrix A and matrix B have been reshaped
-     * @param[in]  fp_mixed_precision        (Optional) Use wider accumulators (32 bit instead of 16 for FP16) to improve accuracy
-     * @param[in]  activation_info           (Optional) Activation to apply after the matrix multiplication
-     *
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input0, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float alpha, float beta = 0.f,
-                   bool is_interleaved_transposed = true, const GEMMReshapeInfo &reshape_info = GEMMReshapeInfo(), bool fp_mixed_precision = false, const ActivationLayerInfo &activation_info = ActivationLayerInfo());
-    /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMMatrixMultiplyKernel
-     *
-     * @param[in] input0                    Input tensor containing the Matrix A info. Data types supported: F16/F32
-     * @param[in] input1                    Input tensor containing the Matrix B info. Data type supported: same as @p input0
-     * @param[in] input2                    Input tensor containing the Matrix C (bias) info. Can be nullptr. Data type supported: same as @p input0
-     * @param[in] output                    Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0
-     * @param[in] alpha                     Weight of the matrix product
-     * @param[in] beta                      Weight of vector C. Default value is 0. Only beta = 1 is currently supported.
-     * @param[in] is_interleaved_transposed True if input0 and input1 have been reshaped respectively using @ref CLGEMMReshapeLHSMatrixKernel and @ref CLGEMMReshapeRHSMatrixKernel
-     * @param[in] reshape_info              GEMM reshape info. If is_interleaved_transposed = true, this object must contain the information to understand how the matrix A and matrix B have been reshaped
-     * @param[in] gpu_target                GPU Target
-     * @param[in] fp_mixed_precision        (Optional) Use wider accumulators (32 bit instead of 16 for FP16) to improve accuracy
-     * @param[in] activation_info           (Optional) Activation to apply after the matrix multiplication
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float alpha, float beta,
-                           bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info, GPUTarget gpu_target, bool fp_mixed_precision = false, const ActivationLayerInfo &activation_info = ActivationLayerInfo());
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-public:
-    const ICLTensor *_input0;
-    const ICLTensor *_input1;
-    const ICLTensor *_input2;
-    ICLTensor       *_output;
-    bool             _slide_matrix_b;
-    bool             _reinterpret_input_as_3d;
-    bool             _reinterpret_output_as_3d;
-    bool             _add_bias;
-    bool             _broadcast_bias;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLGEMMMATRIXMULTIPLYKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyNativeKernel.h b/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyNativeKernel.h
deleted file mode 100644
index c711a3d1f9..0000000000
--- a/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyNativeKernel.h
+++ /dev/null
@@ -1,127 +0,0 @@
-/*
- * Copyright (c) 2019-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGEMMMATRIXMULTIPLYNATIVEKERNEL_H
-#define ARM_COMPUTE_CLGEMMMATRIXMULTIPLYNATIVEKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-#include "arm_compute/core/KernelDescriptors.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel to multiply matrices when neither of the input matrices have been reshaped */
-class CLGEMMMatrixMultiplyNativeKernel : public ICLKernel
-{
-public:
-    /** Default Constructor */
-    CLGEMMMatrixMultiplyNativeKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLGEMMMatrixMultiplyNativeKernel(const CLGEMMMatrixMultiplyNativeKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLGEMMMatrixMultiplyNativeKernel &operator=(const CLGEMMMatrixMultiplyNativeKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLGEMMMatrixMultiplyNativeKernel(CLGEMMMatrixMultiplyNativeKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLGEMMMatrixMultiplyNativeKernel &operator=(CLGEMMMatrixMultiplyNativeKernel &&) = default;
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  input0    Input tensor for the LHS matrix. Data type supported: F32. The number of dimensions for the LHS matrix must be less or equal than 4.
-     * @param[in]  input1    Input tensor for the RHS matrix. Data type supported: same as @p input0. The number of dimensions for the RHS matrix must be less or equal than 3.
-     * @param[in]  input2    Input tensor containing the bias matrix. Data type supported: same as @p input0.
-     * @param[out] output    Output tensor info. Data type supported: same as @p input0
-     * @param[in]  alpha     Weight of the matrix product
-     * @param[in]  beta      Weight of the matrix bias
-     * @param[in]  lhs_info  LHS matrix information used to retrieve the number of rows and accumulations to be processed by each thread. Only the following values are supported:
-     *                       lhs_info.m0: 1,2,3,4,5,6,7,8
-     *                       lhs_info.k0: 2,3,4,8,16
-     * @param[in]  rhs_info  RHS matrix information used to retrieve the number of columns and accumulations to be processed by each thread. Only the following values are supported:
-     *                       rhs_info.n0: 2,3,4,8,16
-     *                       rhs_info.k0: same of lhs_info.k0
-     * @param[in]  gemm_info GEMM information used to retrieve the original dimensions of the input matrices
-     */
-    void configure(const ICLTensor *input0, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info,
-                   const GEMMRHSMatrixInfo &rhs_info,
-                   const GEMMKernelInfo    &gemm_info);
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input0          Input tensor for the LHS matrix. Data type supported: F32. The number of dimensions for the LHS matrix must be less or equal than 4.
-     * @param[in]  input1          Input tensor for the RHS matrix. Data type supported: same as @p input0. The number of dimensions for the RHS matrix must be less or equal than 3.
-     * @param[in]  input2          Input tensor containing the bias matrix. Data type supported: same as @p input0.
-     * @param[out] output          Output tensor info. Data type supported: same as @p input0
-     * @param[in]  alpha           Weight of the matrix product
-     * @param[in]  beta            Weight of the matrix bias
-     * @param[in]  lhs_info        LHS matrix information used to retrieve the number of rows and accumulations to be processed by each thread. Only the following values are supported:
-     *                             lhs_info.m0: 1,2,3,4,5,6,7,8
-     *                             lhs_info.k0: 2,3,4,8,16
-     * @param[in]  rhs_info        RHS matrix information used to retrieve the number of columns and accumulations to be processed by each thread. Only the following values are supported:
-     *                             rhs_info.n0: 2,3,4,8,16
-     *                             rhs_info.k0: same of lhs_info.k0
-     * @param[in]  gemm_info       GEMM information used to retrieve the original dimensions of the input matrices
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input0, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float alpha, float beta,
-                   const GEMMLHSMatrixInfo &lhs_info,
-                   const GEMMRHSMatrixInfo &rhs_info,
-                   const GEMMKernelInfo    &gemm_info);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMMatrixMultiplyNativeKernel
-     *
-     * @param[in] input0    Input tensor info for the LHS matrix. Data type supported: F32. The number of dimensions for the LHS matrix must be less or equal than 4.
-     * @param[in] input1    Input tensor info for the RHS matrix. Data type supported: same as @p input0. The number of dimensions for the RHS matrix must be less or equal than 3.
-     * @param[in] input2    Input tensor info containing the bias matrix. Data type supported: same as @p input0.
-     * @param[in] output    Output tensor info. Data type supported: same as @p input0
-     * @param[in] alpha     Weight of the matrix product
-     * @param[in] beta      Weight of the matrix bias
-     * @param[in] lhs_info  LHS matrix information used to retrieve the number of rows and accumulations to be processed by each thread. Only the following values are supported:
-     *                      lhs_info.m0: 1,2,3,4,5,6,7,8
-     *                      lhs_info.k0: 2,3,4,8,16
-     * @param[in] rhs_info  RHS matrix information used to retrieve the number of columns and accumulations to be processed by each thread. Only the following values are supported:
-     *                      rhs_info.n0: 2,3,4,8,16
-     *                      rhs_info.k0: same of lhs_info.k0
-     * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info,
-                           const GEMMRHSMatrixInfo &rhs_info,
-                           const GEMMKernelInfo    &gemm_info);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_input0;
-    const ICLTensor *_input1;
-    const ICLTensor *_input2;
-    ICLTensor       *_output;
-    bool             _slide_matrix_b;
-    bool             _reinterpret_input_as_3d;
-    bool             _reinterpret_output_as_3d;
-    bool             _use_dummy_work_items;
-    bool             _add_bias;
-    bool             _broadcast_bias;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLGEMMMATRIXMULTIPLYNATIVEKERNEL_H*/
diff --git a/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.h b/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.h
deleted file mode 100644
index ee8e57fa8c..0000000000
--- a/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.h
+++ /dev/null
@@ -1,150 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGEMMMATRIXMULTIPLYRESHAPEDKERNEL_H
-#define ARM_COMPUTE_CLGEMMMATRIXMULTIPLYRESHAPEDKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-#include "arm_compute/core/KernelDescriptors.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel to multiply matrices when both the input matrices LHS (input0) and RHS (input1) have been reshaped
- *
- * @note The input matrices @p input0 and @p input1 must be reshaped through @ref CLGEMMReshapeLHSMatrixKernel and  @ref CLGEMMReshapeRHSMatrixKernel
- */
-class CLGEMMMatrixMultiplyReshapedKernel : public ICLKernel
-{
-public:
-    /** Default Constructor */
-    CLGEMMMatrixMultiplyReshapedKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLGEMMMatrixMultiplyReshapedKernel(const CLGEMMMatrixMultiplyReshapedKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLGEMMMatrixMultiplyReshapedKernel &operator=(const CLGEMMMatrixMultiplyReshapedKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLGEMMMatrixMultiplyReshapedKernel(CLGEMMMatrixMultiplyReshapedKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLGEMMMatrixMultiplyReshapedKernel &operator=(CLGEMMMatrixMultiplyReshapedKernel &&) = default;
-    /** Initialise the kernel's input and output.
-     *
-     * @note The F16 computation also supports mixed precision through the gemm_info.fp_mixed_precision flag.
-     *       Mixed precision combines different floating precisions during the computation, in particular, F32 for the accumulations and F16 for the
-     *       multiplications. i.e. float c = (half)a * (half)b
-     *
-     * @param[in]  input0    Input tensor containing the LHS reshaped matrix. Data type supported: F16/F32. The number of dimensions for the LHS matrix must be less or equal than 4
-     * @param[in]  input1    Input tensor containing the RHS reshaped matrix. Data type supported: same as @p input0. The number of dimensions for the RHS matrix must be less or equal than 3
-     * @param[in]  input2    Input tensor containing the bias matrix. Data type supported: same as @p input0.
-     * @param[out] output    Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0
-     * @param[in]  alpha     Weight of the matrix product
-     * @param[in]  beta      Weight of the matrix bias
-     * @param[in]  lhs_info  LHS matrix information used for reshaping the input0 tensor.  Only the following values are supported:
-     *                       lhs_info.m0: 2,3,4,5,6,7,8
-     *                       lhs_info.k0: 2,3,4,8,16
-     *                       lhs_info.transpose: false
-     * @param[in]  rhs_info  RHS matrix information used for reshaping the input1 tensor.  Only the following values are supported:
-     *                       rhs_info.n0: 2,3,4,8,16
-     *                       rhs_info.k0: 2,3,4,8,16
-     *                       rhs_info.transpose: true
-     * @param[in]  gemm_info GEMM information used to retrieve the original dimensions of the input matrices
-     *
-     * @note lhs_info.k0 must be equal to rhs_info.k0
-     */
-    void configure(const ICLTensor *input0, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info,
-                   const GEMMRHSMatrixInfo &rhs_info,
-                   const GEMMKernelInfo    &gemm_info);
-    /** Initialise the kernel's input and output.
-     *
-     * @note The F16 computation also supports mixed precision through the gemm_info.fp_mixed_precision flag.
-     *       Mixed precision combines different floating precisions during the computation, in particular, F32 for the accumulations and F16 for the
-     *       multiplications. i.e. float c = (half)a * (half)b
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input0          Input tensor containing the LHS reshaped matrix. Data type supported: F16/F32. The number of dimensions for the LHS matrix must be less or equal than 4
-     * @param[in]  input1          Input tensor containing the RHS reshaped matrix. Data type supported: same as @p input0. The number of dimensions for the RHS matrix must be less or equal than 3
-     * @param[in]  input2          Input tensor containing the bias matrix. Data type supported: same as @p input0.
-     * @param[out] output          Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0
-     * @param[in]  alpha           Weight of the matrix product
-     * @param[in]  beta            Weight of the matrix bias
-     * @param[in]  lhs_info        LHS matrix information used for reshaping the input0 tensor.  Only the following values are supported:
-     *                             lhs_info.m0: 2,3,4,5,6,7,8
-     *                             lhs_info.k0: 2,3,4,8,16
-     *                             lhs_info.transpose: false
-     * @param[in]  rhs_info        RHS matrix information used for reshaping the input1 tensor.  Only the following values are supported:
-     *                             rhs_info.n0: 2,3,4,8,16
-     *                             rhs_info.k0: 2,3,4,8,16
-     *                             rhs_info.transpose: true
-     * @param[in]  gemm_info       GEMM information used to retrieve the original dimensions of the input matrices
-     *
-     * @note lhs_info.k0 must be equal to rhs_info.k0
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input0, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float alpha, float beta,
-                   const GEMMLHSMatrixInfo &lhs_info,
-                   const GEMMRHSMatrixInfo &rhs_info,
-                   const GEMMKernelInfo    &gemm_info);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMMatrixMultiplyReshapedKernel
-     *
-     * @param[in] input0    Input tensor containing the LHS reshaped matrix. Data type supported: F16/F32. The number of dimensions for the LHS matrix must be less or equal than 4
-     * @param[in] input1    Input tensor containing the RHS reshaped matrix. Data type supported: same as @p input0. The number of dimensions for the RHS matrix must be less or equal than 3
-     * @param[in] input2    Input tensor info containing the bias matrix. Data type supported: same as @p input0.
-     * @param[in] output    Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0
-     * @param[in] alpha     Weight of the matrix product
-     * @param[in] beta      Weight of the matrix bias
-     * @param[in] lhs_info  LHS matrix information used for reshaping the input0 tensor.  Only the following values are supported:
-     *                      lhs_info.m0: 2,3,4,5,6,7,8
-     *                      lhs_info.k0: 2,3,4,8,16
-     *                      lhs_info.transpose: false
-     * @param[in] rhs_info  RHS matrix information used for reshaping the input1 tensor.  Only the following values are supported:
-     *                      rhs_info.n0: 2,3,4,8,16
-     *                      rhs_info.k0: 2,3,4,8,16
-     *                      rhs_info.transpose: true
-     * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices
-     *
-     * @note lhs_info.k0 must be equal to rhs_info.k0
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info,
-                           const GEMMRHSMatrixInfo &rhs_info,
-                           const GEMMKernelInfo    &gemm_info);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_input0;
-    const ICLTensor *_input1;
-    const ICLTensor *_input2;
-    ICLTensor       *_output;
-    bool             _slide_matrix_b;
-    bool             _reinterpret_output_as_3d;
-    unsigned int     _k;
-    bool             _use_dummy_work_items;
-    bool             _add_bias;
-    bool             _broadcast_bias;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLGEMMMATRIXMULTIPLYRESHAPEDKERNEL_H*/
-\ No newline at end of file
diff --git a/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.h b/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.h
deleted file mode 100644
index f7d314a039..0000000000
--- a/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.h
+++ /dev/null
@@ -1,130 +0,0 @@
-/*
- * Copyright (c) 2019-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGEMMMATRIXMULTIPLYRESHAPEDONLYRHSKERNEL_H
-#define ARM_COMPUTE_CLGEMMMATRIXMULTIPLYRESHAPEDONLYRHSKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-#include "arm_compute/core/KernelDescriptors.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel to multiply matrices when only the input matrix RHS (input1) has been reshaped
- *
- * @note The input matrix input1 must be reshaped through @ref CLGEMMReshapeRHSMatrixKernel
- */
-class CLGEMMMatrixMultiplyReshapedOnlyRHSKernel : public ICLKernel
-{
-public:
-    /** Default Constructor */
-    CLGEMMMatrixMultiplyReshapedOnlyRHSKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLGEMMMatrixMultiplyReshapedOnlyRHSKernel(const CLGEMMMatrixMultiplyReshapedOnlyRHSKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLGEMMMatrixMultiplyReshapedOnlyRHSKernel &operator=(const CLGEMMMatrixMultiplyReshapedOnlyRHSKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLGEMMMatrixMultiplyReshapedOnlyRHSKernel(CLGEMMMatrixMultiplyReshapedOnlyRHSKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLGEMMMatrixMultiplyReshapedOnlyRHSKernel &operator=(CLGEMMMatrixMultiplyReshapedOnlyRHSKernel &&) = default;
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  input0    Input tensor containing the LHS matrix. Data type supported: F16/F32. The number of dimensions for the LHS matrix must be less or equal than 4.
-     * @param[in]  input1    Input tensor containing the RHS reshaped matrix. Data type supported: same as @p input0. The number of dimensions for the RHS matrix must be less or equal than 3.
-     * @param[in]  input2    Input tensor containing the bias matrix. Data type supported: same as @p input0.
-     * @param[out] output    Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0
-     * @param[in]  alpha     Weight of the matrix product
-     * @param[in]  beta      Weight of the matrix bias
-     * @param[in]  lhs_info  LHS matrix information used to retrieve the number of rows to be processed by each thread. Only the following values are supported:
-     *                       lhs_info.m0: 1,2,3,4,5,6,7,8
-     * @param[in]  rhs_info  RHS matrix information used for reshaping the input1 tensor.  Only the following values are supported:
-     *                       rhs_info.k0: 2,3,4,8,16
-     *                       rhs_info.n0: 2,3,4,8,16
-     *                       rhs_info.transpose: true,false
-     * @param[in]  gemm_info GEMM information used to retrieve the original dimensions of the input matrices
-     */
-    void configure(const ICLTensor *input0, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info,
-                   const GEMMRHSMatrixInfo &rhs_info,
-                   const GEMMKernelInfo    &gemm_info);
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input0          Input tensor containing the LHS matrix. Data type supported: F16/F32. The number of dimensions for the LHS matrix must be less or equal than 4.
-     * @param[in]  input1          Input tensor containing the RHS reshaped matrix. Data type supported: same as @p input0. The number of dimensions for the RHS matrix must be less or equal than 3.
-     * @param[in]  input2          Input tensor containing the bias matrix. Data type supported: same as @p input0.
-     * @param[out] output          Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0
-     * @param[in]  alpha           Weight of the matrix product
-     * @param[in]  beta            Weight of the matrix bias
-     * @param[in]  lhs_info        LHS matrix information used to retrieve the number of rows to be processed by each thread. Only the following values are supported:
-     *                             lhs_info.m0: 1,2,3,4,5,6,7,8
-     * @param[in]  rhs_info        RHS matrix information used for reshaping the input1 tensor.  Only the following values are supported:
-     *                             rhs_info.k0: 2,3,4,8,16
-     *                             rhs_info.n0: 2,3,4,8,16
-     *                             rhs_info.transpose: true,false
-     * @param[in]  gemm_info       GEMM information used to retrieve the original dimensions of the input matrices
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input0, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float alpha, float beta,
-                   const GEMMLHSMatrixInfo &lhs_info,
-                   const GEMMRHSMatrixInfo &rhs_info,
-                   const GEMMKernelInfo    &gemm_info);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMMatrixMultiplyReshapedOnlyRHSKernel
-     *
-     * @param[in] input0    Input tensor info for the LHS matrix. Data type supported: F16/F32. The number of dimensions for the LHS matrix must be less or equal than 4.
-     * @param[in] input1    Input tensor info for the RHS reshaped matrix. Data type supported: same as @p input0. The number of dimensions for the RHS matrix must be less or equal than 3.
-     * @param[in] input2    Input tensor info containing the bias matrix. Data type supported: same as @p input0.
-     * @param[in] output    Output tensor info. Data type supported: same as @p input0
-     * @param[in] alpha     Weight of the matrix product
-     * @param[in] beta      Weight of the matrix bias
-     * @param[in] lhs_info  LHS matrix information used to retrieve the number of rows to be processed by each thread. Only the following values are supported:
-     *                      lhs_info.m0: 1,2,3,4,5,6,7,8
-     * @param[in] rhs_info  RHS matrix information used for reshaping the input1 tensor.  Only the following values are supported:
-     *                      rhs_info.k0: 2,3,4,8,16
-     *                      rhs_info.n0: 2,3,4,8,16
-     *                      rhs_info.transpose: true,false
-     * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info,
-                           const GEMMRHSMatrixInfo &rhs_info,
-                           const GEMMKernelInfo    &gemm_info);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_input0;
-    const ICLTensor *_input1;
-    const ICLTensor *_input2;
-    ICLTensor       *_output;
-    bool             _slide_matrix_b;
-    bool             _reinterpret_input_as_3d;
-    bool             _reinterpret_output_as_3d;
-    bool             _use_dummy_work_items;
-    bool             _add_bias;
-    bool             _broadcast_bias;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLGEMMMATRIXMULTIPLYRESHAPEDONLYRHSKERNEL_H*/
diff --git a/arm_compute/core/CL/kernels/CLGEMMMatrixVectorMultiplyKernel.h b/arm_compute/core/CL/kernels/CLGEMMMatrixVectorMultiplyKernel.h
deleted file mode 100644
index 6d70b4b0c2..0000000000
--- a/arm_compute/core/CL/kernels/CLGEMMMatrixVectorMultiplyKernel.h
+++ /dev/null
@@ -1,84 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGEMMMATRIXVECTORMULTIPLYKERNEL_H
-#define ARM_COMPUTE_CLGEMMMATRIXVECTORMULTIPLYKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the GEMM matrix vector multiply kernel. **/
-class CLGEMMMatrixVectorMultiplyKernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLGEMMMatrixVectorMultiplyKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLGEMMMatrixVectorMultiplyKernel(const CLGEMMMatrixVectorMultiplyKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLGEMMMatrixVectorMultiplyKernel &operator=(const CLGEMMMatrixVectorMultiplyKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLGEMMMatrixVectorMultiplyKernel(CLGEMMMatrixVectorMultiplyKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLGEMMMatrixVectorMultiplyKernel &operator=(CLGEMMMatrixVectorMultiplyKernel &&) = default;
-    /** Set the input and output of the kernel.
-     *
-     * @param[in]  input0 The reshaped input tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32
-     * @param[in]  input1 The 2D reshaped weights tensor. Data type supported: Same as @p input.
-     * @param[out] output The output 2D tensor. Data types supported: Same as @p input, S32 for QASYMM8/QASYMM8_SIGNED.
-     */
-    void configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output);
-    /** Set the input and output of the kernel.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input0          The reshaped input tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32
-     * @param[in]  input1          The 2D reshaped weights tensor. Data type supported: Same as @p input.
-     * @param[out] output          The output 2D tensor. Data types supported: Same as @p input, S32 for QASYMM8/QASYMM8_SIGNED.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMMatrixVectorMultiplyKernel
-     *
-     * @param[in] input0 The reshaped input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32
-     * @param[in] input1 The 2D reshaped weights tensor info. Data type supported: Same as @p input.
-     * @param[in] output The output 2D tensor info. Data types supported: Same as @p input, S32 for QASYMM8/QASYMM8_SIGNED.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-    BorderSize border_size() const override;
-
-private:
-    const ICLTensor *_input0;
-    const ICLTensor *_input1;
-    ICLTensor       *_output;
-    int              _num_rows_read_per_iteration;
-    BorderSize       _border_size;
-};
-} // arm_compute
-#endif /*ARM_COMPUTE_CLGEMMMATRIXVECTORMULTIPLYKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.h b/arm_compute/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.h
deleted file mode 100644
index fe77fcb428..0000000000
--- a/arm_compute/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.h
+++ /dev/null
@@ -1,105 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGEMMRESHAPELHSMATRIXKERNEL_H
-#define ARM_COMPUTE_CLGEMMRESHAPELHSMATRIXKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel to reshape the LHS matrix when performing the matrix multiplication.
- *  In particular, this function splits the input matrix in blocks of size M0xK0 (defined through GEMMLHSInfo) and
- *  stores each one in the output matrix unrolling the values
- */
-class CLGEMMReshapeLHSMatrixKernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLGEMMReshapeLHSMatrixKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLGEMMReshapeLHSMatrixKernel(const CLGEMMReshapeLHSMatrixKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLGEMMReshapeLHSMatrixKernel &operator=(const CLGEMMReshapeLHSMatrixKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLGEMMReshapeLHSMatrixKernel(CLGEMMReshapeLHSMatrixKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLGEMMReshapeLHSMatrixKernel &operator=(CLGEMMReshapeLHSMatrixKernel &&) = default;
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  input                   Input tensor. Data types supported: All
-     * @param[out] output                  Output tensor. Data type supported: same as @p input
-     * @param[in]  lhs_info                LHS matrix information to be used for reshaping. This object contains all the necessary
-     *                                     information to reshape the input tensor. Only the following values are supported:
-     *                                     lhs_info.m0: 2,3,4,5,6,7,8
-     *                                     lhs_info.k0: 2,3,4,8,16
-     *                                     lhs_info.v0: greater than 0
-     *                                     lhs_info.transpose: true, false
-     *                                     lhs_info.interleave: true, false
-     * @param[in]  reinterpret_input_as_3d (Optional) True if the input has to be reinterpreted as 3D tensor
-     */
-    void configure(const ICLTensor *input, ICLTensor *output, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_input_as_3d = false);
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  compile_context         The compile context to be used.
-     * @param[in]  input                   Input tensor. Data types supported: All
-     * @param[out] output                  Output tensor. Data type supported: same as @p input
-     * @param[in]  lhs_info                LHS matrix information to be used for reshaping. This object contains all the necessary
-     *                                     information to reshape the input tensor. Only the following values are supported:
-     *                                     lhs_info.m0: 2,3,4,5,6,7,8
-     *                                     lhs_info.k0: 2,3,4,8,16
-     *                                     lhs_info.v0: greater than 0
-     *                                     lhs_info.transpose: true, false
-     *                                     lhs_info.interleave: true, false
-     * @param[in]  reinterpret_input_as_3d (Optional) True if the input has to be reinterpreted as 3D tensor
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_input_as_3d = false);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMReshapeLHSMatrixKernel
-     *
-     * @param[in] input                   Input tensor info. Data types supported: All
-     * @param[in] output                  Output tensor info which stores the interleaved matrix. Data type supported: same as @p input.
-     * @param[in] lhs_info                LHS matrix information to be used for reshaping. This object contains all the necessary
-     *                                    information to reshape the input tensor. Only the following values are supported:
-     *                                    lhs_info.m0: 2,3,4,5,6,7,8
-     *                                    lhs_info.k0: 2,3,4,8,16
-     *                                    lhs_info.v0: greater than 0
-     *                                    lhs_info.transpose: true, false
-     *                                    lhs_info.interleave: true, false
-     * @param[in] reinterpret_input_as_3d True if the input has to be reinterpreted as 3D tensor
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_input_as_3d);
-
-    // Inherited methods overridden
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_input;
-    ICLTensor       *_output;
-    bool             _reinterpret_input_as_3d;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLGEMMRESHAPELHSMATRIXKERNEL_H */
-\ No newline at end of file
diff --git a/arm_compute/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h b/arm_compute/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h
deleted file mode 100644
index 0e6352bdbb..0000000000
--- a/arm_compute/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h
+++ /dev/null
@@ -1,100 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGEMMRESHAPERHSMATRIXKERNEL_H
-#define ARM_COMPUTE_CLGEMMRESHAPERHSMATRIXKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel to reshape the RHS matrix when performing the matrix multiplication
- *  In particular, this kernel splits the input matrix in blocks of size K0xN0 and stores each one in
- *  the output matrix unrolling the values */
-class CLGEMMReshapeRHSMatrixKernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLGEMMReshapeRHSMatrixKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLGEMMReshapeRHSMatrixKernel(const CLGEMMReshapeRHSMatrixKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLGEMMReshapeRHSMatrixKernel &operator=(const CLGEMMReshapeRHSMatrixKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLGEMMReshapeRHSMatrixKernel(CLGEMMReshapeRHSMatrixKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLGEMMReshapeRHSMatrixKernel &operator=(CLGEMMReshapeRHSMatrixKernel &&) = default;
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  input    Input tensor. Data types supported: All
-     * @param[out] output   Output tensor. Data type supported: same as @p input
-     * @param[in]  rhs_info RHS matrix information to be used for reshaping. This object contains all the necessary
-     *                      information to reshape the input tensor. Only the following values are supported:
-     *                      rhs_info.n0: 2,3,4,8,16
-     *                      rhs_info.k0: 1,2,3,4,8,16 (k0 = 1 only if rhs_info.transpose = false)
-     *                      rhs_info.h0: greater than 0
-     *                      rhs_info.transpose: true, false
-     *                      rhs_info.interleave: true, false
-     */
-    void configure(const ICLTensor *input, ICLTensor *output, const GEMMRHSMatrixInfo &rhs_info);
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Input tensor. Data types supported: All
-     * @param[out] output          Output tensor. Data type supported: same as @p input
-     * @param[in]  rhs_info        RHS matrix information to be used for reshaping. This object contains all the necessary
-     *                             information to reshape the input tensor. Only the following values are supported:
-     *                             rhs_info.n0: 2,3,4,8,16
-     *                             rhs_info.k0: 1,2,3,4,8,16 (k0 = 1 only if rhs_info.transpose = false)
-     *                             rhs_info.h0: greater than 0
-     *                             rhs_info.transpose: true, false
-     *                             rhs_info.interleave: true, false
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const GEMMRHSMatrixInfo &rhs_info);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMReshapeRHSMatrixKernel
-     *
-     * @param[in] input    Input tensor info. Data types supported: All
-     * @param[in] output   Output tensor info which stores the interleaved matrix. Data type supported: same as @p input.
-     * @param[in] rhs_info RHS matrix information to be used for reshaping. This object contains all the necessary
-     *                     information to reshape the input tensor. Only the following values are supported:
-     *                     rhs_info.n0: 2,3,4,8,16
-     *                     rhs_info.k0: 1,2,3,4,8,16 (k0 = 1 only if rhs_info.transpose = false)
-     *                     rhs_info.h0: greater than 0
-     *                     rhs_info.transpose: true, false
-     *                     rhs_info.interleave: true, false
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const GEMMRHSMatrixInfo &rhs_info);
-
-    // Inherited methods overridden
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_input;
-    ICLTensor       *_output;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLGEMMRESHAPERHSMATRIXKERNEL_H */
-\ No newline at end of file
diff --git a/arm_compute/core/CL/kernels/CLGatherKernel.h b/arm_compute/core/CL/kernels/CLGatherKernel.h
deleted file mode 100644
index b7539536e9..0000000000
--- a/arm_compute/core/CL/kernels/CLGatherKernel.h
+++ /dev/null
@@ -1,89 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGATHERKERNEL_H
-#define ARM_COMPUTE_CLGATHERKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the kernel to perform tensor reshaping */
-class CLGatherKernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLGatherKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLGatherKernel(const CLGatherKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLGatherKernel &operator=(const CLGatherKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLGatherKernel(CLGatherKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLGatherKernel &operator=(CLGatherKernel &&) = default;
-    /** Default destructor */
-    ~CLGatherKernel() = default;
-    /** Initialise the kernel's inputs and outputs
-     *
-     * @param[in]  input   Source tensor. Supported tensor rank: up to 4. Data type supported: All.
-     * @param[in]  indices Indices tensor. Supported tensor rank: up to 1. Must be one of the following types: U32/S32. Each value must be in range [0, input.shape[@p axis])
-     * @param[out] output  Destination tensor. Data type supported: Same as @p input
-     * @param[in]  axis    (Optional) The axis in @p input to gather @p indices from. Negative values wrap around. Defaults to 0
-     */
-    void configure(const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, int axis = 0);
-    /** Initialise the kernel's inputs and outputs
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Source tensor. Supported tensor rank: up to 4. Data type supported: All.
-     * @param[in]  indices         Indices tensor. Supported tensor rank: up to 1. Must be one of the following types: U32/S32. Each value must be in range [0, input.shape[@p axis])
-     * @param[out] output          Destination tensor. Data type supported: Same as @p input
-     * @param[in]  axis            (Optional) The axis in @p input to gather @p indices from. Negative values wrap around. Defaults to 0
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, int axis = 0);
-
-    /** Static function to check if given info will lead to a valid configuration of @ref CLGatherKernel
-     *
-     * @param[in] input   Source tensor info. Supported tensor rank: up to 4. Data type supported: All.
-     * @param[in] indices Indices tensor info. Supported tensor rank: up to 4. Must be one of the following types: U32/S32. Each value must be in range [0, input.shape[@p axis])
-     * @param[in] output  Destination tensor info. Data type supported: Same as @p input
-     * @param[in] axis    (Optional) The axis in @p input to gather @p indices from. Negative values wrap around. Defaults to 0
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, int axis = 0);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_input;   /**< Source tensor */
-    const ICLTensor *_indices; /**< Indices tensor */
-    ICLTensor       *_output;  /**< Destination tensor */
-    int              _axis;    /**< Axis index */
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLGATHERKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLGaussian3x3Kernel.h b/arm_compute/core/CL/kernels/CLGaussian3x3Kernel.h
deleted file mode 100644
index 6a9d3eaa4d..0000000000
--- a/arm_compute/core/CL/kernels/CLGaussian3x3Kernel.h
+++ /dev/null
@@ -1,59 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGAUSSIAN3X3KERNEL_H
-#define ARM_COMPUTE_CLGAUSSIAN3X3KERNEL_H
-
-#include "arm_compute/core/CL/ICLSimple2DKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the Gaussian 3x3 filter kernel.
- *
- */
-class CLGaussian3x3Kernel : public ICLSimple2DKernel
-{
-public:
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  input            An input tensor. Data types supported: U8
-     * @param[out] output           The output tensor. Data types supported: U8.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const ICLTensor *input, ICLTensor *output, bool border_undefined);
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  compile_context  The compile context to be used.
-     * @param[in]  input            An input tensor. Data types supported: U8
-     * @param[out] output           The output tensor. Data types supported: U8.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, bool border_undefined);
-
-    // Inherited methods overridden:
-    BorderSize border_size() const override;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLGAUSSIAN3X3KERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLGaussian5x5Kernel.h b/arm_compute/core/CL/kernels/CLGaussian5x5Kernel.h
deleted file mode 100644
index d8730e0c92..0000000000
--- a/arm_compute/core/CL/kernels/CLGaussian5x5Kernel.h
+++ /dev/null
@@ -1,83 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGAUSSIAN5X5KERNEL_H
-#define ARM_COMPUTE_CLGAUSSIAN5X5KERNEL_H
-
-#include "arm_compute/core/CL/kernels/CLConvolutionKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the kernel to run the horizontal pass of 5x5 Gaussian filter on a tensor. */
-class CLGaussian5x5HorKernel : public CLSeparableConvolution5x5HorKernel
-{
-public:
-    /** Initialise the kernel's source, destination and border.
-     *
-     * @param[in]  input            Source tensor. Data types supported: U8.
-     * @param[out] output           Destination tensor. Data types supported: S16.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const ICLTensor *input, ICLTensor *output, bool border_undefined);
-    /** Initialise the kernel's source, destination and border.
-     *
-     * @param[in]  compile_context  The compile context to be used.
-     * @param[in]  input            Source tensor. Data types supported: U8.
-     * @param[out] output           Destination tensor. Data types supported: S16.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, bool border_undefined);
-
-private:
-    //Make the configure method of the parent class private
-    using CLSeparableConvolution5x5HorKernel::configure;
-};
-
-/** Interface for the kernel to run the vertical pass of 5x5 Gaussian filter on a tensor. */
-class CLGaussian5x5VertKernel : public CLSeparableConvolution5x5VertKernel
-{
-public:
-    /** Initialise the kernel's source, destination and border.
-     *
-     * @param[in]  input            Input tensor(output of horizontal pass). Data types supported: S16.
-     * @param[out] output           Destination tensor. Data types supported: U8.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const ICLTensor *input, ICLTensor *output, bool border_undefined);
-    /** Initialise the kernel's source, destination and border.
-     *
-     * @param[in]  compile_context  The compile context to be used.
-     * @param[in]  input            Input tensor(output of horizontal pass). Data types supported: S16.
-     * @param[out] output           Destination tensor. Data types supported: U8.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, bool border_undefined);
-
-private:
-    //Make the configure method of the parent class private
-    using CLSeparableConvolution5x5VertKernel::configure;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLGAUSSIAN5X5KERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLGaussianPyramidKernel.h b/arm_compute/core/CL/kernels/CLGaussianPyramidKernel.h
deleted file mode 100644
index 34cd062dae..0000000000
--- a/arm_compute/core/CL/kernels/CLGaussianPyramidKernel.h
+++ /dev/null
@@ -1,111 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGAUSSIANPYRAMIDKERNEL_H
-#define ARM_COMPUTE_CLGAUSSIANPYRAMIDKERNEL_H
-
-#include "arm_compute/core/CL/ICLSimpleKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel to perform a Gaussian filter and half scaling across width (horizontal pass) */
-class CLGaussianPyramidHorKernel : public ICLSimpleKernel
-{
-public:
-    /** Default constructor */
-    CLGaussianPyramidHorKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLGaussianPyramidHorKernel(const CLGaussianPyramidHorKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLGaussianPyramidHorKernel &operator=(const CLGaussianPyramidHorKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLGaussianPyramidHorKernel(CLGaussianPyramidHorKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLGaussianPyramidHorKernel &operator=(CLGaussianPyramidHorKernel &&) = default;
-    /** Default destructor */
-    ~CLGaussianPyramidHorKernel() = default;
-
-    /** Initialise the kernel's source, destination and border mode.
-     *
-     * @param[in]  input  Source tensor. Data types supported: U8.
-     * @param[out] output Destination tensor. Output should have half the input width. Data types supported: U16.
-     */
-    void configure(const ICLTensor *input, ICLTensor *output);
-    /** Initialise the kernel's source, destination and border mode.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Source tensor. Data types supported: U8.
-     * @param[out] output          Destination tensor. Output should have half the input width. Data types supported: U16.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-    BorderSize border_size() const override;
-
-private:
-    int _l2_load_offset;
-};
-
-/** OpenCL kernel to perform a Gaussian filter and half scaling across height (vertical pass) */
-class CLGaussianPyramidVertKernel : public ICLSimpleKernel
-{
-public:
-    /** Default constructor */
-    CLGaussianPyramidVertKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLGaussianPyramidVertKernel(const CLGaussianPyramidVertKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLGaussianPyramidVertKernel &operator=(const CLGaussianPyramidVertKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLGaussianPyramidVertKernel(CLGaussianPyramidVertKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLGaussianPyramidVertKernel &operator=(CLGaussianPyramidVertKernel &&) = default;
-    /** Default destructor */
-    ~CLGaussianPyramidVertKernel() = default;
-
-    /** Initialise the kernel's source, destination and border mode.
-     *
-     * @param[in]  input  Source tensor. Data types supported: U16.
-     * @param[out] output Destination tensor. Output should have half the input height. Data types supported: U8.
-     */
-    void configure(const ICLTensor *input, ICLTensor *output);
-    /** Initialise the kernel's source, destination and border mode.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Source tensor. Data types supported: U16.
-     * @param[out] output          Destination tensor. Output should have half the input height. Data types supported: U8.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-    BorderSize border_size() const override;
-
-private:
-    int _t2_load_offset;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLGAUSSIANPYRAMIDKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLGenerateProposalsLayerKernel.h b/arm_compute/core/CL/kernels/CLGenerateProposalsLayerKernel.h
deleted file mode 100644
index 46dc16d6d5..0000000000
--- a/arm_compute/core/CL/kernels/CLGenerateProposalsLayerKernel.h
+++ /dev/null
@@ -1,85 +0,0 @@
-/*
- * Copyright (c) 2019-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGENERATEPROPOSALSLAYERKERNEL_H
-#define ARM_COMPUTE_CLGENERATEPROPOSALSLAYERKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for Compute All Anchors kernel */
-class CLComputeAllAnchorsKernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLComputeAllAnchorsKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLComputeAllAnchorsKernel(const CLComputeAllAnchorsKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLComputeAllAnchorsKernel &operator=(const CLComputeAllAnchorsKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLComputeAllAnchorsKernel(CLComputeAllAnchorsKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLComputeAllAnchorsKernel &operator=(CLComputeAllAnchorsKernel &&) = default;
-    /** Default destructor */
-    ~CLComputeAllAnchorsKernel() = default;
-
-    /** Set the input and output tensors.
-     *
-     * @param[in]  anchors     Source tensor. Original set of anchors of size (4, A), where A is the number of anchors. Data types supported: QSYMM16/F16/F32
-     * @param[out] all_anchors Destination tensor. Destination anchors of size (4, H*W*A) where H and W are the height and width of the feature map and A is the number of anchors. Data types supported: Same as @p input
-     * @param[in]  info        Contains Compute Anchors operation information described in @ref ComputeAnchorsInfo
-     *
-     */
-    void configure(const ICLTensor *anchors, ICLTensor *all_anchors, const ComputeAnchorsInfo &info);
-    /** Set the input and output tensors.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  anchors         Source tensor. Original set of anchors of size (4, A), where A is the number of anchors. Data types supported: QSYMM16/F16/F32
-     * @param[out] all_anchors     Destination tensor. Destination anchors of size (4, H*W*A) where H and W are the height and width of the feature map and A is the number of anchors. Data types supported: Same as @p input
-     * @param[in]  info            Contains Compute Anchors operation information described in @ref ComputeAnchorsInfo
-     *
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *anchors, ICLTensor *all_anchors, const ComputeAnchorsInfo &info);
-
-    /** Static function to check if given info will lead to a valid configuration of @ref CLComputeAllAnchorsKernel
-     *
-     * @param[in] anchors     Source tensor info. Original set of anchors of size (4, A), where A is the number of anchors. Data types supported: QSYMM16/F16/F32
-     * @param[in] all_anchors Destination tensor info. Destination anchors of size (4, H*W*A) where H and W are the height and width of the feature map and A is the number of anchors. Data types supported: Same as @p input
-     * @param[in] info        Contains Compute Anchors operation information described in @ref ComputeAnchorsInfo
-     *
-     * @return a Status
-     */
-    static Status validate(const ITensorInfo *anchors, const ITensorInfo *all_anchors, const ComputeAnchorsInfo &info);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_anchors;
-    ICLTensor       *_all_anchors;
-};
-} // arm_compute
-#endif // ARM_COMPUTE_CLGENERATEPROSPOSALSLAYERKERNEL_H
diff --git a/arm_compute/core/CL/kernels/CLHOGDescriptorKernel.h b/arm_compute/core/CL/kernels/CLHOGDescriptorKernel.h
deleted file mode 100644
index 046950551d..0000000000
--- a/arm_compute/core/CL/kernels/CLHOGDescriptorKernel.h
+++ /dev/null
@@ -1,122 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLHOGDESCRIPTORKERNEL_H
-#define ARM_COMPUTE_CLHOGDESCRIPTORKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/core/IHOG.h"
-#include "arm_compute/core/Size2D.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** OpenCL kernel to perform HOG Orientation Binning */
-class CLHOGOrientationBinningKernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLHOGOrientationBinningKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLHOGOrientationBinningKernel(const CLHOGOrientationBinningKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLHOGOrientationBinningKernel &operator=(const CLHOGOrientationBinningKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLHOGOrientationBinningKernel(CLHOGOrientationBinningKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLHOGOrientationBinningKernel &operator=(CLHOGOrientationBinningKernel &&) = default;
-    /** Default destructor */
-    ~CLHOGOrientationBinningKernel() = default;
-
-    /**  Initialise the kernel's inputs, output and HOG's metadata
-     *
-     * @param[in]  input_magnitude Input tensor which stores the magnitude of the gradient for each pixel. Data type supported: S16.
-     * @param[in]  input_phase     Input tensor which stores the phase of the gradient for each pixel. Data type supported: U8
-     * @param[out] output          Output tensor which stores the local HOG for each cell. DataType supported: F32. Number of channels supported: equal to the number of histogram bins per cell
-     * @param[in]  hog_info        HOG's metadata
-     */
-    void configure(const ICLTensor *input_magnitude, const ICLTensor *input_phase, ICLTensor *output, const HOGInfo *hog_info);
-    /**  Initialise the kernel's inputs, output and HOG's metadata
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input_magnitude Input tensor which stores the magnitude of the gradient for each pixel. Data type supported: S16.
-     * @param[in]  input_phase     Input tensor which stores the phase of the gradient for each pixel. Data type supported: U8
-     * @param[out] output          Output tensor which stores the local HOG for each cell. DataType supported: F32. Number of channels supported: equal to the number of histogram bins per cell
-     * @param[in]  hog_info        HOG's metadata
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input_magnitude, const ICLTensor *input_phase, ICLTensor *output, const HOGInfo *hog_info);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_input_magnitude;
-    const ICLTensor *_input_phase;
-    ICLTensor       *_output;
-    Size2D           _cell_size;
-};
-
-/** OpenCL kernel to perform HOG block normalization */
-class CLHOGBlockNormalizationKernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLHOGBlockNormalizationKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLHOGBlockNormalizationKernel(const CLHOGBlockNormalizationKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLHOGBlockNormalizationKernel &operator=(const CLHOGBlockNormalizationKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLHOGBlockNormalizationKernel(CLHOGBlockNormalizationKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLHOGBlockNormalizationKernel &operator=(CLHOGBlockNormalizationKernel &&) = default;
-    /** Default destructor */
-    ~CLHOGBlockNormalizationKernel() = default;
-
-    /** Initialise the kernel's input, output and HOG's metadata
-     *
-     * @param[in]  input    Input tensor which stores the local HOG for each cell. Data type supported: F32. Number of channels supported: equal to the number of histogram bins per cell
-     * @param[out] output   Output tensor which stores the normalised blocks. Data type supported: F32. Number of channels supported: equal to the number of histogram bins per block
-     * @param[in]  hog_info HOG's metadata
-     */
-    void configure(const ICLTensor *input, ICLTensor *output, const HOGInfo *hog_info);
-    /** Initialise the kernel's input, output and HOG's metadata
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Input tensor which stores the local HOG for each cell. Data type supported: F32. Number of channels supported: equal to the number of histogram bins per cell
-     * @param[out] output          Output tensor which stores the normalised blocks. Data type supported: F32. Number of channels supported: equal to the number of histogram bins per block
-     * @param[in]  hog_info        HOG's metadata
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const HOGInfo *hog_info);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_input;
-    ICLTensor       *_output;
-    Size2D           _num_cells_per_block_stride;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLHOGDESCRIPTORKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLHOGDetectorKernel.h b/arm_compute/core/CL/kernels/CLHOGDetectorKernel.h
deleted file mode 100644
index 681c212cc5..0000000000
--- a/arm_compute/core/CL/kernels/CLHOGDetectorKernel.h
+++ /dev/null
@@ -1,96 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLHOGDETECTORKERNEL_H
-#define ARM_COMPUTE_CLHOGDETECTORKERNEL_H
-
-#include "arm_compute/core/CL/ICLArray.h"
-#include "arm_compute/core/CL/ICLHOG.h"
-#include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/core/CL/OpenCL.h"
-
-namespace cl
-{
-class Buffer;
-}
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel to perform HOG detector kernel using linear SVM */
-class CLHOGDetectorKernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLHOGDetectorKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLHOGDetectorKernel(const CLHOGDetectorKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLHOGDetectorKernel &operator=(const CLHOGDetectorKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLHOGDetectorKernel(CLHOGDetectorKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLHOGDetectorKernel &operator=(CLHOGDetectorKernel &&) = default;
-    /** Default destructor */
-    ~CLHOGDetectorKernel() = default;
-
-    /** Initialise the kernel's input, HOG data-object, detection window, the stride of the detection window, the threshold and index of the object to detect
-     *
-     * @param[in]  input                   Input tensor which stores the HOG descriptor obtained with @ref CLHOGOrientationBinningKernel. Data type supported: F32. Number of channels supported: equal to the number of histogram bins per block
-     * @param[in]  hog                     HOG data object used by @ref CLHOGOrientationBinningKernel and  @ref CLHOGBlockNormalizationKernel
-     * @param[out] detection_windows       Array of @ref DetectionWindow. This array stores all the detected objects
-     * @param[in]  num_detection_windows   Number of detected objects
-     * @param[in]  detection_window_stride Distance in pixels between 2 consecutive detection windows in x and y directions.
-     *                                     It must be multiple of the hog->info()->block_stride()
-     * @param[in]  threshold               (Optional) Threshold for the distance between features and SVM classifying plane
-     * @param[in]  idx_class               (Optional) Index of the class used for evaluating which class the detection window belongs to
-     */
-    void configure(const ICLTensor *input, const ICLHOG *hog, ICLDetectionWindowArray *detection_windows, cl::Buffer *num_detection_windows, const Size2D &detection_window_stride, float threshold = 0.0f,
-                   uint16_t idx_class = 0);
-    /** Initialise the kernel's input, HOG data-object, detection window, the stride of the detection window, the threshold and index of the object to detect
-     *
-     * @param[in]  compile_context         The compile context to be used.
-     * @param[in]  input                   Input tensor which stores the HOG descriptor obtained with @ref CLHOGOrientationBinningKernel. Data type supported: F32. Number of channels supported: equal to the number of histogram bins per block
-     * @param[in]  hog                     HOG data object used by @ref CLHOGOrientationBinningKernel and  @ref CLHOGBlockNormalizationKernel
-     * @param[out] detection_windows       Array of @ref DetectionWindow. This array stores all the detected objects
-     * @param[in]  num_detection_windows   Number of detected objects
-     * @param[in]  detection_window_stride Distance in pixels between 2 consecutive detection windows in x and y directions.
-     *                                     It must be multiple of the hog->info()->block_stride()
-     * @param[in]  threshold               (Optional) Threshold for the distance between features and SVM classifying plane
-     * @param[in]  idx_class               (Optional) Index of the class used for evaluating which class the detection window belongs to
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLHOG *hog, ICLDetectionWindowArray *detection_windows, cl::Buffer *num_detection_windows,
-                   const Size2D &detection_window_stride, float threshold = 0.0f,
-                   uint16_t idx_class = 0);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue);
-
-private:
-    const ICLTensor         *_input;
-    ICLDetectionWindowArray *_detection_windows;
-    cl::Buffer              *_num_detection_windows;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLHOGDETECTORKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLHarrisCornersKernel.h b/arm_compute/core/CL/kernels/CLHarrisCornersKernel.h
deleted file mode 100644
index a13119b82c..0000000000
--- a/arm_compute/core/CL/kernels/CLHarrisCornersKernel.h
+++ /dev/null
@@ -1,100 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLHARRISCORNERSKERNEL_H
-#define ARM_COMPUTE_CLHARRISCORNERSKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-class ICLTensor;
-using ICLImage = ICLTensor;
-
-/** Interface for the harris score kernel.
- *
- * @note The implementation supports 3, 5, and 7 for the block_size.
- */
-class CLHarrisScoreKernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLHarrisScoreKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLHarrisScoreKernel(const CLHarrisScoreKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLHarrisScoreKernel &operator=(const CLHarrisScoreKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLHarrisScoreKernel(CLHarrisScoreKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLHarrisScoreKernel &operator=(CLHarrisScoreKernel &&) = default;
-    /** Default destructor */
-    ~CLHarrisScoreKernel() = default;
-
-    /** Setup the kernel parameters
-     *
-     * @param[in]  input1           Source image (gradient X). Data types supported S16, S32. (Must be the same as input2)
-     * @param[in]  input2           Source image (gradient Y). Data types supported S16, S32. (Must be the same as input1)
-     * @param[out] output           Destination image (harris score). Data types supported F32
-     * @param[in]  block_size       The block window size used to compute the Harris Corner score.  Supports: 3, 5 and 7
-     * @param[in]  norm_factor      Normalization factor to use accordingly with the gradient size (Must be different from 0)
-     * @param[in]  strength_thresh  Minimum threshold with which to eliminate Harris Corner scores (computed using the normalized Sobel kernel).
-     * @param[in]  sensitivity      Sensitivity threshold k from the Harris-Stephens equation.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const ICLImage *input1, const ICLImage *input2, ICLImage *output,
-                   int32_t block_size, float norm_factor, float strength_thresh, float sensitivity,
-                   bool border_undefined);
-    /** Setup the kernel parameters
-     *
-     * @param[in]  compile_context  The compile context to be used.
-     * @param[in]  input1           Source image (gradient X). Data types supported S16, S32. (Must be the same as input2)
-     * @param[in]  input2           Source image (gradient Y). Data types supported S16, S32. (Must be the same as input1)
-     * @param[out] output           Destination image (harris score). Data types supported F32
-     * @param[in]  block_size       The block window size used to compute the Harris Corner score.  Supports: 3, 5 and 7
-     * @param[in]  norm_factor      Normalization factor to use accordingly with the gradient size (Must be different from 0)
-     * @param[in]  strength_thresh  Minimum threshold with which to eliminate Harris Corner scores (computed using the normalized Sobel kernel).
-     * @param[in]  sensitivity      Sensitivity threshold k from the Harris-Stephens equation.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLImage *input1, const ICLImage *input2, ICLImage *output,
-                   int32_t block_size, float norm_factor, float strength_thresh, float sensitivity,
-                   bool border_undefined);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-    BorderSize border_size() const override;
-
-protected:
-    const ICLImage *_input1;          /**< Source image - Gx component */
-    const ICLImage *_input2;          /**< Source image - Gy component */
-    ICLImage       *_output;          /**< Source image - Harris score */
-    float           _sensitivity;     /**< Sensitivity value */
-    float           _strength_thresh; /**< Threshold value */
-    float           _norm_factor;     /**< Normalization factor */
-    BorderSize      _border_size;     /**< Border size */
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLHARRISCORNERSKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLHeightConcatenateLayerKernel.h b/arm_compute/core/CL/kernels/CLHeightConcatenateLayerKernel.h
deleted file mode 100644
index 524e5ea997..0000000000
--- a/arm_compute/core/CL/kernels/CLHeightConcatenateLayerKernel.h
+++ /dev/null
@@ -1,90 +0,0 @@
-/*
- * Copyright (c) 2019-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef ARM_COMPUTE_CLHEIGHTCONCATENATELAYERKERNEL_H
-#define ARM_COMPUTE_CLHEIGHTCONCATENATELAYERKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the height concatenate kernel.
- *  The input tensor will be concatenated into the output tensor.
- */
-class CLHeightConcatenateLayerKernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLHeightConcatenateLayerKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLHeightConcatenateLayerKernel(const CLHeightConcatenateLayerKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLHeightConcatenateLayerKernel &operator=(const CLHeightConcatenateLayerKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLHeightConcatenateLayerKernel(CLHeightConcatenateLayerKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLHeightConcatenateLayerKernel &operator=(CLHeightConcatenateLayerKernel &&) = default;
-    /** Default destructor */
-    ~CLHeightConcatenateLayerKernel() = default;
-    /** Initialise the kernel's inputs and output
-     *
-     * @param[in]  input         Input tensor. Data types supported: All.
-     * @param[in]  height_offset The starting offset on the Y axis for the output tensor.
-     * @param[out] output        Output tensor. Data types supported: Same as @p input.
-     *
-     */
-    void configure(const ICLTensor *input, unsigned int height_offset, ICLTensor *output);
-    /** Initialise the kernel's inputs and output
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Input tensor. Data types supported: All.
-     * @param[in]  height_offset   The starting offset on the Y axis for the output tensor.
-     * @param[out] output          Output tensor. Data types supported: Same as @p input.
-     *
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, unsigned int height_offset, ICLTensor *output);
-    /**  Static function to check if given info will lead to a valid configuration of @ref CLHeightConcatenateLayerKernel
-     *
-     * @param[in] input         Input tensor info. Data types supported: All.
-     * @param[in] height_offset The starting offset on the Y axis for the output tensor.
-     * @param[in] output        Output tensor info. Data types supported: Same as @p input.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, unsigned int height_offset, const ITensorInfo *output);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_input;
-    ICLTensor       *_output;
-    unsigned int     _height_offset;
-    unsigned int     _num_elems_processed_per_iteration;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLHEIGHTCONCATENATELAYERKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLHistogramKernel.h b/arm_compute/core/CL/kernels/CLHistogramKernel.h
deleted file mode 100644
index 9cd374711b..0000000000
--- a/arm_compute/core/CL/kernels/CLHistogramKernel.h
+++ /dev/null
@@ -1,111 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLHISTOGRAMKERNEL_H
-#define ARM_COMPUTE_CLHISTOGRAMKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLDistribution1D;
-class ICLTensor;
-using ICLImage = ICLTensor;
-
-/** Interface to run the histogram kernel. This kernel processes the part of image with width can be divided by 16.
- *  If the image width is not a multiple of 16, remaining pixels have to be processed with the @ref CLHistogramBorderKernel
- */
-class CLHistogramKernel : public ICLKernel
-{
-public:
-    /** Constructor */
-    CLHistogramKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLHistogramKernel(const CLHistogramKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLHistogramKernel &operator=(const CLHistogramKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLHistogramKernel(CLHistogramKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLHistogramKernel &operator=(CLHistogramKernel &&) = default;
-    /** Initialise the kernel's input, output and border mode.
-     *
-     * @param[in]  input  Source image. Data types supported: U8.
-     * @param[out] output Destination distribution.
-     */
-    void configure(const ICLImage *input, ICLDistribution1D *output);
-    /** Initialise the kernel's input, output and border mode.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Source image. Data types supported: U8.
-     * @param[out] output          Destination distribution.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLImage *input, ICLDistribution1D *output);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLImage    *_input;
-    ICLDistribution1D *_output;
-};
-
-/** Interface to run the histogram kernel to handle the leftover part of image
- *
- */
-class CLHistogramBorderKernel : public ICLKernel
-{
-public:
-    /** Constructor */
-    CLHistogramBorderKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLHistogramBorderKernel(const CLHistogramBorderKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLHistogramBorderKernel &operator=(const CLHistogramBorderKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLHistogramBorderKernel(CLHistogramBorderKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLHistogramBorderKernel &operator=(CLHistogramBorderKernel &&) = default;
-    /** Initialise the kernel's input, output and border mode.
-     *
-     * @param[in]  input  Source image. Data types supported: U8.
-     * @param[out] output Destination distribution.
-     */
-    void configure(const ICLImage *input, ICLDistribution1D *output);
-    /** Initialise the kernel's input, output and border mode.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Source image. Data types supported: U8.
-     * @param[out] output          Destination distribution.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLImage *input, ICLDistribution1D *output);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLImage    *_input;
-    ICLDistribution1D *_output;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLHISTOGRAMKERNEL_H*/
diff --git a/arm_compute/core/CL/kernels/CLIm2ColKernel.h b/arm_compute/core/CL/kernels/CLIm2ColKernel.h
deleted file mode 100644
index 61f2a3d489..0000000000
--- a/arm_compute/core/CL/kernels/CLIm2ColKernel.h
+++ /dev/null
@@ -1,136 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLIM2COLKERNEL_H
-#define ARM_COMPUTE_CLIM2COLKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/core/Size2D.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the im2col reshape kernel.
- *
- * Rearranges image blocks into columns. It is used to strip out each convolution block to a single column.
- * It is used to transform a convolution to a plain matrix multiplication.
- *
- * For example taking into account the image below and assuming 3x3 image blocks with stride of 1 we have:
- * @f[
- * \left( \begin{array}{cccc}
- * a00 & a01 & a02 & a03 \\
- * a10 & a11 & a12 & a13 \\
- * a20 & a21 & a22 & a23 \\
- * a30 & a31 & a32 & a33 \\
- * \end{array} \right)
- * =
- * \left( \begin{array}{ccccccccc}
- * a00 & a01 & a02 & a10 & a11 & a12 & a20 & a21 & a22 \\
- * a01 & a02 & a03 & a11 & a12 & a13 & a21 & a22 & a23 \\
- * a10 & a11 & a12 & a20 & a21 & a22 & a30 & a31 & a32 \\
- * a11 & a12 & a13 & a21 & a22 & a23 & a31 & a32 & a33 \\
- * \end{array} \right)
- * @f]
- */
-class CLIm2ColKernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLIm2ColKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLIm2ColKernel(const CLIm2ColKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLIm2ColKernel &operator=(const CLIm2ColKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLIm2ColKernel(CLIm2ColKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLIm2ColKernel &operator=(CLIm2ColKernel &&) = default;
-    /** Set the input and output of the kernel.
-     *
-     * @param[in]  input       The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM],
-     *                         while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32
-     * @param[out] output      The output tensor. First 2 lower dimensions represent a transform of each 3D input,
-     *                         while every dimension above represents a batch. Data types supported: Same as @p input
-     * @param[in]  kernel_dims The kernel dimensions (width and height).
-     * @param[in]  conv_info   Contains padding and stride information described in @ref PadStrideInfo.
-     * @param[in]  has_bias    In case biases are provided expands the matrix with 1.
-     *                         This is valid only for non-quantized inputs.
-     * @param[in]  dilation    (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
-     * @param[in]  num_groups  (Optional) Number of groups when performing a grouped convolution.
-     *                         Number of groups other than 1 is only supported for NCHW data layout.
-     *                         Number of groups should be multiple to the number of channels.
-     */
-    void configure(const ICLTensor *input, ICLTensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation = Size2D(1U, 1U),
-                   unsigned int num_groups = 1);
-    /** Set the input and output of the kernel.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM],
-     *                             while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32
-     * @param[out] output          The output tensor. First 2 lower dimensions represent a transform of each 3D input,
-     *                             while every dimension above represents a batch. Data types supported: Same as @p input
-     * @param[in]  kernel_dims     The kernel dimensions (width and height).
-     * @param[in]  conv_info       Contains padding and stride information described in @ref PadStrideInfo.
-     * @param[in]  has_bias        In case biases are provided expands the matrix with 1.
-     * @param[in]  dilation        (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
-     * @param[in]  num_groups      (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is only supported for NCHW data layout
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias,
-                   const Size2D &dilation   = Size2D(1U, 1U),
-                   unsigned int  num_groups = 1);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLIm2ColKernel
-     *
-     * @param[in] input       The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM],
-     *                        while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32
-     * @param[in] output      The output tensor. First 2 lower dimensions represent a transform of each 3D input,
-     *                        while every dimension above represents a batch. Data types supported: Same as @p input
-     * @param[in] kernel_dims The kernel dimensions (width and height).
-     * @param[in] conv_info   Contains padding and stride information described in @ref PadStrideInfo.
-     * @param[in] has_bias    In case biases are provided expands the matrix with 1.
-     *                        This is valid only for non-quantized inputs.
-     * @param[in] dilation    (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
-     * @param[in] num_groups  (Optional) Number of groups when performing a grouped convolution.
-     *                        Number of groups other than 1 is only supported for NCHW data layout.
-     *                        Number of groups should be multiple to the number of channels.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation = Size2D(1U, 1U),
-                           unsigned int num_groups = 1);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-public:
-    const ICLTensor *_input;
-    ICLTensor       *_output;
-    DataLayout       _data_layout;
-    std::pair<unsigned int, unsigned int> _convolved_dims;
-    unsigned int  _num_elems_processed_per_iteration;
-    Size2D        _kernel_dims;
-    PadStrideInfo _conv_info;
-    unsigned int  _num_groups;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLIM2COLKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLInstanceNormalizationLayerKernel.h b/arm_compute/core/CL/kernels/CLInstanceNormalizationLayerKernel.h
deleted file mode 100644
index 014dce1759..0000000000
--- a/arm_compute/core/CL/kernels/CLInstanceNormalizationLayerKernel.h
+++ /dev/null
@@ -1,90 +0,0 @@
-/*
- * Copyright (c) 2019-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYERKERNEL_H
-#define ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYERKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-#include "arm_compute/core/KernelDescriptors.h"
-
-namespace arm_compute
-{
-// Forward declarations
-class ICLTensor;
-
-/** Interface for performing an instance normalization */
-class CLInstanceNormalizationLayerKernel : public ICLKernel
-{
-public:
-    /** Constructor */
-    CLInstanceNormalizationLayerKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLInstanceNormalizationLayerKernel(const CLInstanceNormalizationLayerKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLInstanceNormalizationLayerKernel &operator=(const CLInstanceNormalizationLayerKernel &) = delete;
-    /** Default Move Constructor. */
-    CLInstanceNormalizationLayerKernel(CLInstanceNormalizationLayerKernel &&) = default;
-    /** Default move assignment operator */
-    CLInstanceNormalizationLayerKernel &operator=(CLInstanceNormalizationLayerKernel &&) = default;
-    /** Default destructor */
-    ~CLInstanceNormalizationLayerKernel() = default;
-
-    /** Set the input and output tensors.
-     *
-     * @param[in, out] input  Source tensor. Data types supported: F16/F32. Data layout supported: NCHW, NHWC
-     *                        In case of @p output tensor = nullptr this tensor will store the result of the normalization.
-     * @param[out]     output Destination tensor. Data types and data layouts supported: same as @p input.
-     * @param[in]      info   Kernel meta-data descriptor
-     */
-    void configure(ICLTensor *input, ICLTensor *output, const InstanceNormalizationLayerKernelInfo &info);
-    /** Set the input and output tensors.
-     *
-     * @param[in]      compile_context The compile context to be used.
-     * @param[in, out] input           Source tensor. Data types supported: F16/F32. Data layout supported: NCHW, NHWC
-     *                                 In case of @p output tensor = nullptr this tensor will store the result of the normalization.
-     * @param[out]     output          Destination tensor. Data types and data layouts supported: same as @p input.
-     * @param[in]      info            Kernel meta-data descriptor
-     */
-    void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const InstanceNormalizationLayerKernelInfo &info);
-
-    /** Static function to check if given info will lead to a valid configuration of @ref CLInstanceNormalizationLayer.
-     *
-     * @param[in] input  Source tensor info. Data types supported: F16/F32. Data layout supported: NHWC, NCHW
-     * @param[in] output Destination tensor info. Data types and data layouts supported: same as @p input.
-     * @param[in] info   Kernel meta-data descriptor
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const InstanceNormalizationLayerKernelInfo &info);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    ICLTensor *_input;
-    ICLTensor *_output;
-    bool       _run_in_place;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYERKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLIntegralImageKernel.h b/arm_compute/core/CL/kernels/CLIntegralImageKernel.h
deleted file mode 100644
index 6b6076a917..0000000000
--- a/arm_compute/core/CL/kernels/CLIntegralImageKernel.h
+++ /dev/null
@@ -1,86 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLINTEGRALIMAGEKERNEL_H
-#define ARM_COMPUTE_CLINTEGRALIMAGEKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/core/CL/ICLSimple2DKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface to run the horizontal pass of the integral image kernel. */
-class CLIntegralImageHorKernel : public ICLSimple2DKernel
-{
-public:
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  input  An input tensor. Data types supported: U8
-     * @param[out] output Destination tensor, Data types supported: U32.
-     */
-    void configure(const ICLTensor *input, ICLTensor *output);
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           An input tensor. Data types supported: U8
-     * @param[out] output          Destination tensor, Data types supported: U32.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output);
-};
-
-/** Interface to run the vertical pass of the integral image kernel. */
-class CLIntegralImageVertKernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLIntegralImageVertKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLIntegralImageVertKernel(const CLIntegralImageVertKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLIntegralImageVertKernel &operator=(const CLIntegralImageVertKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLIntegralImageVertKernel(CLIntegralImageVertKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLIntegralImageVertKernel &operator=(CLIntegralImageVertKernel &&) = default;
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in,out] in_out The input/output tensor. Data types supported: U32
-     */
-    void configure(ICLTensor *in_out);
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]     compile_context The compile context to be used.
-     * @param[in,out] in_out          The input/output tensor. Data types supported: U32
-     */
-    void configure(const CLCompileContext &compile_context, ICLTensor *in_out);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    ICLTensor *_in_out;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLINTEGRALIMAGEKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLL2NormalizeLayerKernel.h b/arm_compute/core/CL/kernels/CLL2NormalizeLayerKernel.h
deleted file mode 100644
index 169910b70d..0000000000
--- a/arm_compute/core/CL/kernels/CLL2NormalizeLayerKernel.h
+++ /dev/null
@@ -1,100 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLL2NORMALIZELAYERKERNEL_H
-#define ARM_COMPUTE_CLL2NORMALIZELAYERKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for performing a L2 normalize on a given axis given the square sum of it in this axis */
-class CLL2NormalizeLayerKernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLL2NormalizeLayerKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLL2NormalizeLayerKernel(const CLL2NormalizeLayerKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLL2NormalizeLayerKernel &operator=(const CLL2NormalizeLayerKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLL2NormalizeLayerKernel(CLL2NormalizeLayerKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLL2NormalizeLayerKernel &operator=(CLL2NormalizeLayerKernel &&) = default;
-    /** Default destructor */
-    ~CLL2NormalizeLayerKernel() = default;
-
-    /** Set the input and output tensors.
-     *
-     * @param[in]  input   Source tensor. Data types supported: F16/F32. Data layouts supported: NCHW/NHWC.
-     * @param[in]  sum     Sum values tensor. Data types supported: same as @p input.
-     *                     Sum will have the same number of dimensions as input.
-     * @param[out] output  Destination tensor. Data types and data layouts supported: Same as @p input.
-     *                     Output will have the same number of dimensions as input.
-     * @param[in]  axis    Axis along which to reduce. Negative values wrap around. Maximum supported actual reduction axis : 2
-     * @param[in]  epsilon Lower bound value for the normalization.
-     */
-    void configure(const ICLTensor *input, const ICLTensor *sum, ICLTensor *output, int axis, float epsilon);
-    /** Set the input and output tensors.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Source tensor. Data types supported: F16/F32. Data layouts supported: NCHW/NHWC.
-     * @param[in]  sum             Sum values tensor. Data types supported: same as @p input.
-     *                             Sum will have the same number of dimensions as input.
-     * @param[out] output          Destination tensor. Data types and data layouts supported: Same as @p input.
-     *                             Output will have the same number of dimensions as input.
-     * @param[in]  axis            Axis along which to reduce. Negative values wrap around. Maximum supported actual reduction axis : 2
-     * @param[in]  epsilon         Lower bound value for the normalization.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *sum, ICLTensor *output, int axis, float epsilon);
-
-    /** Static function to check if given info will lead to a valid configuration of @ref CLL2NormalizeLayerKernel.
-     *
-     * @param[in] input   Source tensor info. Data types supported: F16/F32. Data layouts supported: NCHW/NHWC.
-     * @param[in] sum     Sum values tensor info. Data types supported: same as @p input.
-     *                    Sum will have the same number of dimensions as input.
-     * @param[in] output  Destination tensor info. Data types and data layouts supported: Same as @p input.
-     *                    Output will have the same number of dimensions as input.
-     * @param[in] axis    Axis along which to reduce. Negative values wrap around. Maximum supported actual reduction axis : 2
-     * @param[in] epsilon Lower bound value for the normalization.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_input;
-    const ICLTensor *_sum;
-    ICLTensor       *_output;
-    unsigned int     _actual_axis;
-    float            _epsilon;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLL2NORMALIZELAYERKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLLKTrackerKernel.h b/arm_compute/core/CL/kernels/CLLKTrackerKernel.h
deleted file mode 100644
index f94602c381..0000000000
--- a/arm_compute/core/CL/kernels/CLLKTrackerKernel.h
+++ /dev/null
@@ -1,240 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLLKTRACKERKERNEL_H
-#define ARM_COMPUTE_CLLKTRACKERKERNEL_H
-
-#include "arm_compute/core/CL/ICLArray.h"
-#include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/core/Types.h"
-
-#include <cstddef>
-#include <cstdint>
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Internal keypoint structure for Lucas-Kanade Optical Flow */
-struct CLLKInternalKeypoint
-{
-    float x{ 0.f };               /**< x coordinate of the keypoint */
-    float y{ 0.f };               /**< y coordinate of the keypoint */
-    float tracking_status{ 0.f }; /**< the tracking status of the keypoint */
-    float dummy{ 0.f };           /**< Dummy field, to make sure the data structure 128-bit align, so that GPU can use vload4 */
-};
-
-/** Structure for storing Spatial Gradient Matrix and the minimum eigenvalue for each keypoint */
-struct CLCoefficientTable
-{
-    float A11;     /**< iA11 * FLT_SCALE */
-    float A12;     /**< iA11 * FLT_SCALE */
-    float A22;     /**< iA11 * FLT_SCALE */
-    float min_eig; /**< Minimum eigenvalue */
-};
-
-/** Structure for storing ival, ixval and iyval for each point inside the window */
-struct CLOldValue
-{
-    int16_t ival;  /**< ival extracts from old image */
-    int16_t ixval; /**< ixval extracts from scharr Gx image */
-    int16_t iyval; /**< iyval extracts from scharr Gy image */
-    int16_t dummy; /**< Dummy field, to make sure the data structure 128-bit align, so that GPU can use vload4 */
-};
-
-/** Interface for OpenCL Array of Internal Key Points. */
-using ICLLKInternalKeypointArray = ICLArray<CLLKInternalKeypoint>;
-/** Interface for OpenCL Array of Coefficient Tables. */
-using ICLCoefficientTableArray = ICLArray<CLCoefficientTable>;
-/** Interface for OpenCL Array of Old Values. */
-using ICLOldValArray = ICLArray<CLOldValue>;
-
-/** Interface to run the initialization step of LKTracker */
-class CLLKTrackerInitKernel : public ICLKernel
-{
-public:
-    /** Initialise the kernel input and output
-     *
-     * @param[in]  old_points           Pointer to the @ref ICLKeyPointArray storing old key points
-     * @param[in]  new_points_estimates Pointer to the @ref ICLKeyPointArray storing new estimates key points
-     * @param[out] old_points_internal  Pointer to the array of internal @ref CLLKInternalKeypoint old points
-     * @param[out] new_points_internal  Pointer to the array of internal @ref CLLKInternalKeypoint new points
-     * @param[in]  use_initial_estimate The flag to indicate whether the initial estimated position should be used
-     * @param[in]  level                The pyramid level
-     * @param[in]  num_levels           The number of pyramid levels
-     * @param[in]  pyramid_scale        Scale factor used for generating the pyramid
-     */
-    void configure(const ICLKeyPointArray *old_points, const ICLKeyPointArray *new_points_estimates,
-                   ICLLKInternalKeypointArray *old_points_internal, ICLLKInternalKeypointArray *new_points_internal,
-                   bool use_initial_estimate, size_t level, size_t num_levels, float pyramid_scale);
-    /** Initialise the kernel input and output
-     *
-     * @param[in]  compile_context      The compile context to be used.
-     * @param[in]  old_points           Pointer to the @ref ICLKeyPointArray storing old key points
-     * @param[in]  new_points_estimates Pointer to the @ref ICLKeyPointArray storing new estimates key points
-     * @param[out] old_points_internal  Pointer to the array of internal @ref CLLKInternalKeypoint old points
-     * @param[out] new_points_internal  Pointer to the array of internal @ref CLLKInternalKeypoint new points
-     * @param[in]  use_initial_estimate The flag to indicate whether the initial estimated position should be used
-     * @param[in]  level                The pyramid level
-     * @param[in]  num_levels           The number of pyramid levels
-     * @param[in]  pyramid_scale        Scale factor used for generating the pyramid
-     */
-    void configure(const CLCompileContext &compile_context, const ICLKeyPointArray *old_points, const ICLKeyPointArray *new_points_estimates,
-                   ICLLKInternalKeypointArray *old_points_internal, ICLLKInternalKeypointArray *new_points_internal,
-                   bool use_initial_estimate, size_t level, size_t num_levels, float pyramid_scale);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-};
-
-/** Interface to run the finalize step of LKTracker, where it truncates the coordinates stored in new_points array */
-class CLLKTrackerFinalizeKernel : public ICLKernel
-{
-public:
-    /** Initialise the kernel input and output
-     *
-     * @param[in]  new_points_internal Pointer to the array of internal @ref CLLKInternalKeypoint new points
-     * @param[out] new_points          Pointer to the @ref ICLKeyPointArray storing new key points
-     */
-    void configure(ICLLKInternalKeypointArray *new_points_internal, ICLKeyPointArray *new_points);
-    /** Initialise the kernel input and output
-     *
-     * @param[in]  compile_context     The compile context to be used.
-     * @param[in]  new_points_internal Pointer to the array of internal @ref CLLKInternalKeypoint new points
-     * @param[out] new_points          Pointer to the @ref ICLKeyPointArray storing new key points
-     */
-    void configure(const CLCompileContext &compile_context, ICLLKInternalKeypointArray *new_points_internal, ICLKeyPointArray *new_points);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-};
-
-/** Interface to run the first stage of LKTracker, where A11, A12, A22, min_eig, ival, ixval and iyval are computed */
-class CLLKTrackerStage0Kernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLLKTrackerStage0Kernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLLKTrackerStage0Kernel(const CLLKTrackerStage0Kernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLLKTrackerStage0Kernel &operator=(const CLLKTrackerStage0Kernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLLKTrackerStage0Kernel(CLLKTrackerStage0Kernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLLKTrackerStage0Kernel &operator=(CLLKTrackerStage0Kernel &&) = default;
-    /** Initialise the kernel input and output
-     *
-     * @param[in]      old_input           Pointer to the input old tensor. Data types supported: U8
-     * @param[in]      old_scharr_gx       Pointer to the input scharr X tensor. Data types supported: S16
-     * @param[in]      old_scharr_gy       Pointer to the input scharr Y tensor. Data types supported: S16
-     * @param[in]      old_points_internal Pointer to the array of CLLKInternalKeypoint old points
-     * @param[in, out] new_points_internal Pointer to the array of CLLKInternalKeypoint new points
-     * @param[out]     coeff_table         Pointer to the array holding the Spatial Gradient coefficients
-     * @param[out]     old_ival            Pointer to the array holding internal values
-     * @param[in]      window_dimension    The size of the window on which to perform the algorithm
-     * @param[in]      level               The pyramid level
-     */
-    void configure(const ICLTensor *old_input, const ICLTensor *old_scharr_gx, const ICLTensor *old_scharr_gy,
-                   ICLLKInternalKeypointArray *old_points_internal, ICLLKInternalKeypointArray *new_points_internal,
-                   ICLCoefficientTableArray *coeff_table, ICLOldValArray *old_ival,
-                   size_t window_dimension, size_t level);
-    /** Initialise the kernel input and output
-     *
-     * @param[in]      compile_context     The compile context to be used.
-     * @param[in]      old_input           Pointer to the input old tensor. Data types supported: U8
-     * @param[in]      old_scharr_gx       Pointer to the input scharr X tensor. Data types supported: S16
-     * @param[in]      old_scharr_gy       Pointer to the input scharr Y tensor. Data types supported: S16
-     * @param[in]      old_points_internal Pointer to the array of CLLKInternalKeypoint old points
-     * @param[in, out] new_points_internal Pointer to the array of CLLKInternalKeypoint new points
-     * @param[out]     coeff_table         Pointer to the array holding the Spatial Gradient coefficients
-     * @param[out]     old_ival            Pointer to the array holding internal values
-     * @param[in]      window_dimension    The size of the window on which to perform the algorithm
-     * @param[in]      level               The pyramid level
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *old_input, const ICLTensor *old_scharr_gx, const ICLTensor *old_scharr_gy,
-                   ICLLKInternalKeypointArray *old_points_internal, ICLLKInternalKeypointArray *new_points_internal,
-                   ICLCoefficientTableArray *coeff_table, ICLOldValArray *old_ival,
-                   size_t window_dimension, size_t level);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_old_input;
-    const ICLTensor *_old_scharr_gx;
-    const ICLTensor *_old_scharr_gy;
-};
-
-/** Interface to run the second stage of LKTracker, where the motion vectors of the given points are computed */
-class CLLKTrackerStage1Kernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLLKTrackerStage1Kernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLLKTrackerStage1Kernel(const CLLKTrackerStage1Kernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLLKTrackerStage1Kernel &operator=(const CLLKTrackerStage1Kernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLLKTrackerStage1Kernel(CLLKTrackerStage1Kernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLLKTrackerStage1Kernel &operator=(CLLKTrackerStage1Kernel &&) = default;
-    /** Initialise the kernel input and output
-     *
-     * @param[in]      new_input           Pointer to the input new tensor. Data types supported: U8
-     * @param[in, out] new_points_internal Pointer to the array of CLLKInternalKeypoint for new points
-     * @param[in]      coeff_table         Pointer to the array holding the Spatial Gradient coefficients
-     * @param[in]      old_ival            Pointer to the array holding internal values
-     * @param[in]      termination         The criteria to terminate the search of each keypoint.
-     * @param[in]      epsilon             The error for terminating the algorithm
-     * @param[in]      num_iterations      The maximum number of iterations before terminating the algorithm
-     * @param[in]      window_dimension    The size of the window on which to perform the algorithm
-     * @param[in]      level               The pyramid level
-     */
-    void configure(const ICLTensor *new_input, ICLLKInternalKeypointArray *new_points_internal, ICLCoefficientTableArray *coeff_table, ICLOldValArray *old_ival,
-                   Termination termination, float epsilon, size_t num_iterations, size_t window_dimension, size_t level);
-    /** Initialise the kernel input and output
-     *
-     * @param[in]      compile_context     The compile context to be used.
-     * @param[in]      new_input           Pointer to the input new tensor. Data types supported: U8
-     * @param[in, out] new_points_internal Pointer to the array of CLLKInternalKeypoint for new points
-     * @param[in]      coeff_table         Pointer to the array holding the Spatial Gradient coefficients
-     * @param[in]      old_ival            Pointer to the array holding internal values
-     * @param[in]      termination         The criteria to terminate the search of each keypoint.
-     * @param[in]      epsilon             The error for terminating the algorithm
-     * @param[in]      num_iterations      The maximum number of iterations before terminating the algorithm
-     * @param[in]      window_dimension    The size of the window on which to perform the algorithm
-     * @param[in]      level               The pyramid level
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *new_input, ICLLKInternalKeypointArray *new_points_internal, ICLCoefficientTableArray *coeff_table, ICLOldValArray *old_ival,
-                   Termination termination, float epsilon, size_t num_iterations, size_t window_dimension, size_t level);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_new_input;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLLKTRACKERKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.h b/arm_compute/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.h
deleted file mode 100644
index e68160f96d..0000000000
--- a/arm_compute/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.h
+++ /dev/null
@@ -1,85 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLLOCALLYCONNECTEDMATRIXMULTIPLYKERNEL_H
-#define ARM_COMPUTE_CLLOCALLYCONNECTEDMATRIXMULTIPLYKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel to multiply each row of first tensor with low 2 dimensions of second tensor.
- *
- * @attention The second input tensor must have at least 2 dimensions (matrix)
- *
- */
-class CLLocallyConnectedMatrixMultiplyKernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLLocallyConnectedMatrixMultiplyKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLLocallyConnectedMatrixMultiplyKernel(const CLLocallyConnectedMatrixMultiplyKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLLocallyConnectedMatrixMultiplyKernel &operator=(const CLLocallyConnectedMatrixMultiplyKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLLocallyConnectedMatrixMultiplyKernel(CLLocallyConnectedMatrixMultiplyKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLLocallyConnectedMatrixMultiplyKernel &operator=(CLLocallyConnectedMatrixMultiplyKernel &&) = default;
-    /** Initialise the kernel's input, output and alpha
-     *
-     * @param[in]  input0 First input tensor. Data types supported: F32
-     * @param[in]  input1 Second input tensor. Data type supported: same as @p input0
-     * @param[out] output Output tensor to store the result. Data type supported: same as @p input0
-     */
-    void configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output);
-    /** Initialise the kernel's input, output and alpha
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input0          First input tensor. Data types supported: F32
-     * @param[in]  input1          Second input tensor. Data type supported: same as @p input0
-     * @param[out] output          Output tensor to store the result. Data type supported: same as @p input0
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLLocallyConnectedMatrixMultiplyKernel
-     *
-     * @param[in] input0 First input tensor info. Data types supported: F32
-     * @param[in] input1 Second input tensor info. Data type supported: same as @p input0
-     * @param[in] output Output tensor info. Data type supported: same as @p input0
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_input0;
-    const ICLTensor *_input1;
-    ICLTensor       *_output;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLLOCALLYCONNECTEDMATRIXMULTIPLYKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLMagnitudePhaseKernel.h b/arm_compute/core/CL/kernels/CLMagnitudePhaseKernel.h
deleted file mode 100644
index e0de3e7636..0000000000
--- a/arm_compute/core/CL/kernels/CLMagnitudePhaseKernel.h
+++ /dev/null
@@ -1,90 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLMAGNITUDEPHASEKERNEL_H
-#define ARM_COMPUTE_CLMAGNITUDEPHASEKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Template interface for the kernel to compute magnitude and phase.
- *
- */
-class CLMagnitudePhaseKernel : public ICLKernel
-{
-public:
-    /** Default constructor. */
-    CLMagnitudePhaseKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLMagnitudePhaseKernel(const CLMagnitudePhaseKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLMagnitudePhaseKernel &operator=(const CLMagnitudePhaseKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLMagnitudePhaseKernel(CLMagnitudePhaseKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLMagnitudePhaseKernel &operator=(CLMagnitudePhaseKernel &&) = default;
-    /** Initialise the kernel's input, output.
-     *
-     * @note At least one of output1 or output2 must be set.
-     *
-     * @param[in]  gx         The input gradient X tensor. Data types supported: S16.
-     * @param[in]  gy         The input gradient Y tensor. Data types supported: S16.
-     * @param[out] magnitude  (Optional) The output tensor - Magnitude. Data types supported: S16.
-     * @param[out] phase      (Optional) The output tensor - Phase. Data types supported: U8.
-     * @param[in]  mag_type   (Optional) Magnitude calculation type. Default: L2NORM.
-     * @param[in]  phase_type (Optional) Phase calculation type. Default: SIGNED.
-     */
-    void configure(const ICLTensor *gx, const ICLTensor *gy, ICLTensor *magnitude, ICLTensor *phase,
-                   MagnitudeType mag_type = MagnitudeType::L2NORM, PhaseType phase_type = PhaseType::SIGNED);
-    /** Initialise the kernel's input, output.
-     *
-     * @note At least one of output1 or output2 must be set.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  gx              The input gradient X tensor. Data types supported: S16.
-     * @param[in]  gy              The input gradient Y tensor. Data types supported: S16.
-     * @param[out] magnitude       (Optional) The output tensor - Magnitude. Data types supported: S16.
-     * @param[out] phase           (Optional) The output tensor - Phase. Data types supported: U8.
-     * @param[in]  mag_type        (Optional) Magnitude calculation type. Default: L2NORM.
-     * @param[in]  phase_type      (Optional) Phase calculation type. Default: SIGNED.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *gx, const ICLTensor *gy, ICLTensor *magnitude, ICLTensor *phase,
-                   MagnitudeType mag_type = MagnitudeType::L2NORM, PhaseType phase_type = PhaseType::SIGNED);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_gx;        /**< Input gradient X. */
-    const ICLTensor *_gy;        /**< Input gradient Y. */
-    ICLTensor       *_magnitude; /**< Output - Magnitude. */
-    ICLTensor       *_phase;     /**< Output - Phase. */
-    bool             _run_mag;   /**< Calculate magnitude ? */
-    bool             _run_phase; /**< Calculate phase ? */
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLMAGNITUDEPHASEKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLMeanStdDevKernel.h b/arm_compute/core/CL/kernels/CLMeanStdDevKernel.h
deleted file mode 100644
index 96b4c4ea60..0000000000
--- a/arm_compute/core/CL/kernels/CLMeanStdDevKernel.h
+++ /dev/null
@@ -1,98 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLMEANSTDDEVKERNEL_H
-#define ARM_COMPUTE_CLMEANSTDDEVKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace cl
-{
-class Buffer;
-}
-
-namespace arm_compute
-{
-class ICLTensor;
-using ICLImage = ICLTensor;
-
-/** Interface for the kernel to calculate mean and standard deviation of input image pixels. */
-class CLMeanStdDevKernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLMeanStdDevKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLMeanStdDevKernel(const CLMeanStdDevKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLMeanStdDevKernel &operator=(const CLMeanStdDevKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLMeanStdDevKernel(CLMeanStdDevKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLMeanStdDevKernel &operator=(CLMeanStdDevKernel &&) = default;
-    /** Initialise the kernel's input and outputs.
-     *
-     * @param[in]  input              Input image. Data types supported: U8.
-     * @param[out] mean               Input average pixel value.
-     * @param[out] global_sum         Keeps global sum of pixel values (Buffer size: 1 cl_ulong).
-     * @param[out] stddev             (Optional) Output standard deviation of pixel values.
-     * @param[out] global_sum_squared (Optional if stddev is not set, required if stddev is set) Keeps global sum of squared pixel values (Buffer size: 1 cl_ulong).
-     */
-    void configure(const ICLImage *input, float *mean, cl::Buffer *global_sum, float *stddev = nullptr, cl::Buffer *global_sum_squared = nullptr);
-    /** Initialise the kernel's input and outputs.
-     *
-     * @param[in]  compile_context    The compile context to be used.
-     * @param[in]  input              Input image. Data types supported: U8.
-     * @param[out] mean               Input average pixel value.
-     * @param[out] global_sum         Keeps global sum of pixel values (Buffer size: 1 cl_ulong).
-     * @param[out] stddev             (Optional) Output standard deviation of pixel values.
-     * @param[out] global_sum_squared (Optional if stddev is not set, required if stddev is set) Keeps global sum of squared pixel values (Buffer size: 1 cl_ulong).
-     */
-    void configure(const CLCompileContext &compile_context, const ICLImage *input, float *mean, cl::Buffer *global_sum, float *stddev = nullptr, cl::Buffer *global_sum_squared = nullptr);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLMeanStdDevKernel.
-     *
-     * @param[in] input              Input image info. Data types supported: U8.
-     * @param[in] mean               Input average pixel value.
-     * @param[in] global_sum         Keeps global sum of pixel values.
-     * @param[in] stddev             (Optional) Output standard deviation of pixel values.
-     * @param[in] global_sum_squared (Optional if stddev is not set, required if stddev is set) Keeps global sum of squared pixel values.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, float *mean, cl::Buffer *global_sum, float *stddev = nullptr, cl::Buffer *global_sum_squared = nullptr);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-    BorderSize border_size() const override;
-
-private:
-    const ICLImage *_input;
-    float          *_mean;
-    float          *_stddev;
-    cl::Buffer     *_global_sum;
-    cl::Buffer     *_global_sum_squared;
-    BorderSize      _border_size;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLMEANSTDDEVKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLMeanStdDevNormalizationKernel.h b/arm_compute/core/CL/kernels/CLMeanStdDevNormalizationKernel.h
deleted file mode 100644
index ff0c96e168..0000000000
--- a/arm_compute/core/CL/kernels/CLMeanStdDevNormalizationKernel.h
+++ /dev/null
@@ -1,90 +0,0 @@
-/*
- * Copyright (c) 2019-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLMEANSTDDEVNORMALIZATIONKERNEL_H
-#define ARM_COMPUTE_CLMEANSTDDEVNORMALIZATIONKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the kernel to normalize the input 2D tensor across the first dimension with respect to mean and standard deviation of the same dimension. */
-class CLMeanStdDevNormalizationKernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLMeanStdDevNormalizationKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLMeanStdDevNormalizationKernel(const CLMeanStdDevNormalizationKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLMeanStdDevNormalizationKernel &operator=(const CLMeanStdDevNormalizationKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLMeanStdDevNormalizationKernel(CLMeanStdDevNormalizationKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLMeanStdDevNormalizationKernel &operator=(CLMeanStdDevNormalizationKernel &&) = default;
-    /** Default destructor */
-    ~CLMeanStdDevNormalizationKernel() = default;
-    /** Initialise the kernel's input and outputs.
-     *
-     * @note If the output tensor is a nullptr, the normalization will be performed in-place.
-     *
-     * @param[in, out] input   Source tensor with 2 dimensions. In case of @p output tensor = nullptr,
-     *                         this tensor will store the result of the normalization. Data types supported: F16/F32.
-     * @param[out]     output  (Optional) Destination tensor. It can be nullptr in case of in-place computation. Data type supported: same as @p input
-     * @param[in]      epsilon (Optional) Small float to avoid division by zero in case of zero standard deviation. Defaults to 1e-8.
-     */
-    void configure(ICLTensor *input, ICLTensor *output = nullptr, float epsilon = 1e-8f);
-    /** Initialise the kernel's input and outputs.
-     *
-     * @note If the output tensor is a nullptr, the normalization will be performed in-place.
-     *
-     * @param[in]      compile_context The compile context to be used.
-     * @param[in, out] input           Source tensor with 2 dimensions. In case of @p output tensor = nullptr,
-     *                                 this tensor will store the result of the normalization. Data types supported: F16/F32.
-     * @param[out]     output          (Optional) Destination tensor. It can be nullptr in case of in-place computation. Data type supported: same as @p input
-     * @param[in]      epsilon         (Optional) Small float to avoid division by zero in case of zero standard deviation. Defaults to 1e-8.
-     */
-    void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output = nullptr, float epsilon = 1e-8f);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLMeanStdDevNormalizationKernel
-     *
-     * @param[in] input   Source tensor info with 2 dimensions. In case of @p output tensor info = nullptr,
-     *                    this tensor will store the result of the normalization. Data types supported: F16/F32.
-     * @param[in] output  (Optional) Destination tensor info. It can be nullptr in case of in-place computation. Data type supported: same as @p input
-     * @param[in] epsilon (Optional) Small float to avoid division by zero in case of zero standard deviation. Defaults to 1e-8.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output = nullptr, float epsilon = 1e-8f);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    ICLTensor *_input;
-    ICLTensor *_output;
-    bool       _run_in_place;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLMEANSTDDEVNORMALIZATIONKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLMedian3x3Kernel.h b/arm_compute/core/CL/kernels/CLMedian3x3Kernel.h
deleted file mode 100644
index c68ab07781..0000000000
--- a/arm_compute/core/CL/kernels/CLMedian3x3Kernel.h
+++ /dev/null
@@ -1,59 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLMEDIAN3X3KERNEL_H
-#define ARM_COMPUTE_CLMEDIAN3X3KERNEL_H
-
-#include "arm_compute/core/CL/ICLSimple2DKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the median 3x3 filter kernel.
- *
- */
-class CLMedian3x3Kernel : public ICLSimple2DKernel
-{
-public:
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  input            An input tensor. Data types supported: U8
-     * @param[out] output           The output tensor. Data types supported: U8.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const ICLTensor *input, ICLTensor *output, bool border_undefined);
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  compile_context  The compile context to be used.
-     * @param[in]  input            An input tensor. Data types supported: U8
-     * @param[out] output           The output tensor. Data types supported: U8.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, bool border_undefined);
-
-    // Inherited methods overridden:
-    BorderSize border_size() const override;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLMEDIAN3X3KERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLMemsetKernel.h b/arm_compute/core/CL/kernels/CLMemsetKernel.h
deleted file mode 100644
index 430bc1d4f2..0000000000
--- a/arm_compute/core/CL/kernels/CLMemsetKernel.h
+++ /dev/null
@@ -1,85 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLMEMSETKERNEL_H
-#define ARM_COMPUTE_CLMEMSETKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/core/PixelValue.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for filling the planes of a tensor */
-class CLMemsetKernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLMemsetKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLMemsetKernel(const CLMemsetKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLMemsetKernel &operator=(const CLMemsetKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLMemsetKernel(CLMemsetKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLMemsetKernel &operator=(CLMemsetKernel &&) = default;
-    /** Default destructor */
-    ~CLMemsetKernel() = default;
-
-    /** Initialise the kernel's tensor and filling value
-     *
-     * @param[in,out] tensor         Input tensor to fill. Supported data types: All.
-     * @param[in]     constant_value The value used to fill the planes of the tensor
-     * @param[in]     window         Window to be used in case setting only part of a tensor. Default is nullptr.
-     */
-    void configure(ICLTensor *tensor, const PixelValue &constant_value, Window *window = nullptr);
-    /** Initialise the kernel's tensor and filling value
-     *
-     * @param[in]     compile_context The compile context to be used.
-     * @param[in,out] tensor          Input tensor to fill. Supported data types: All.
-     * @param[in]     constant_value  The value used to fill the planes of the tensor
-     * @param[in]     window          Window to be used in case setting only part of a tensor. Default is nullptr.
-     */
-    void configure(const CLCompileContext &compile_context, ICLTensor *tensor, const PixelValue &constant_value, Window *window = nullptr);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLMemsetKernel
-     *
-     * @param[in] tensor         Source tensor info. Data types supported: All.
-     * @param[in] constant_value The value used to fill the planes of the tensor
-     * @param[in] window         Window to be used in case setting only part of a tensor. Default is nullptr.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *tensor, const PixelValue &constant_value, Window *window = nullptr);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    ICLTensor *_tensor;
-    Window     _full_window;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLMEMSETRKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLMinMaxLayerKernel.h b/arm_compute/core/CL/kernels/CLMinMaxLayerKernel.h
deleted file mode 100644
index 5f9685f303..0000000000
--- a/arm_compute/core/CL/kernels/CLMinMaxLayerKernel.h
+++ /dev/null
@@ -1,87 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLMINMAXLAYERKERNEL_H
-#define ARM_COMPUTE_CLMINMAXLAYERKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the kernel to perform min max search on a 3D tensor.
- */
-class CLMinMaxLayerKernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLMinMaxLayerKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLMinMaxLayerKernel(const CLMinMaxLayerKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLMinMaxLayerKernel &operator=(const CLMinMaxLayerKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLMinMaxLayerKernel(CLMinMaxLayerKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLMinMaxLayerKernel &operator=(CLMinMaxLayerKernel &&) = default;
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  input  Input tensor with at least 3 dimensions. The dimensions over the third will be interpreted as batches.Data types supported: F32.
-     * @param[out] output Output tensor with shape [2, batches, ...] which stores the minimum and maximum values for each 3D input tensor.
-     *                    The dimensions over the second must match the batched dimensions of the input tensor. Data types supported: F32.
-     */
-    void configure(const ICLTensor *input, ICLTensor *output);
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Input tensor with at least 3 dimensions. The dimensions over the third will be interpreted as batches.Data types supported: F32.
-     * @param[out] output          Output tensor with shape [2, batches, ...] which stores the minimum and maximum values for each 3D input tensor.
-     *                    The dimensions over the second must match the batched dimensions of the input tensor. Data types supported: F32.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLMinMaxLayerKernel
-     *
-     * @param[in] input  Input tensor info.  Data types supported: F32.
-     * @param[in] output Output tensor info with shape [2, batches, ...] which stores the minimum and maximum values for each 3D input tensor.
-     *                   The dimensions over the second must match the batched dimensions of the input tensor. Data types supported: F32.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output);
-
-    /** Resets global minimum and maximum
-     *
-     * @param[in,out] queue Command queue on which to map and unmap the min_max tensor
-     */
-    void reset(cl::CommandQueue &queue);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_input;
-    ICLTensor       *_output;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLMINMAXLAYERKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLMinMaxLocationKernel.h b/arm_compute/core/CL/kernels/CLMinMaxLocationKernel.h
deleted file mode 100644
index afb134fa59..0000000000
--- a/arm_compute/core/CL/kernels/CLMinMaxLocationKernel.h
+++ /dev/null
@@ -1,124 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLMINMAXLOCATIONKERNEL_H
-#define ARM_COMPUTE_CLMINMAXLOCATIONKERNEL_H
-
-#include "arm_compute/core/CL/ICLArray.h"
-#include "arm_compute/core/CL/ICLKernel.h"
-
-#include <array>
-
-namespace arm_compute
-{
-class ICLTensor;
-using ICLImage = ICLTensor;
-
-/** Interface for the kernel to perform min max search on an image.
- */
-class CLMinMaxKernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLMinMaxKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLMinMaxKernel(const CLMinMaxKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLMinMaxKernel &operator=(const CLMinMaxKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLMinMaxKernel(CLMinMaxKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLMinMaxKernel &operator=(CLMinMaxKernel &&) = default;
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  input   Input Image. Data types supported: U8/S16/F32.
-     * @param[out] min_max Buffer of 2 elements to store the min value at position 0 and the max value at position 1. Data type supported: S32 if input type is U8/S16, F32 if input type is F32.
-     */
-    void configure(const ICLImage *input, cl::Buffer *min_max);
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Input Image. Data types supported: U8/S16/F32.
-     * @param[out] min_max         Buffer of 2 elements to store the min value at position 0 and the max value at position 1. Data type supported: S32 if input type is U8/S16, F32 if input type is F32.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLImage *input, cl::Buffer *min_max);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_input;               /**< Input image. */
-    cl::Buffer      *_min_max;             /**< Minimum/maximum value. */
-    std::array<int, 2> _data_type_max_min; /**< Maximum and minimum data type value respectively. */
-};
-
-/** Interface for the kernel to find min max locations of an image.
- */
-class CLMinMaxLocationKernel : public ICLKernel
-{
-public:
-    /** Constructor */
-    CLMinMaxLocationKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLMinMaxLocationKernel(const CLMinMaxLocationKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLMinMaxLocationKernel &operator=(const CLMinMaxLocationKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLMinMaxLocationKernel(CLMinMaxLocationKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLMinMaxLocationKernel &operator=(CLMinMaxLocationKernel &&) = default;
-    /** Initialise the kernel's input and outputs.
-     *
-     * @note When locations of min and max occurrences are requested, the reported number of locations is limited to the given array size.
-     *
-     * @param[in]  input         Input image. Data types supported: U8/S16/F32.
-     * @param[out] min_max       Buffer of 2 elements to store the min value at position 0 and the max value at position 1. Data type supported: S32 if input type is U8/S16, F32 if input type is F32.
-     * @param[out] min_max_count Buffer of 2 elements to store the min value occurrences at position 0 and the max value occurrences at position 1. Data type supported: S32
-     * @param[out] min_loc       (Optional) Array of Coordinates2D used to store minimum value locations.
-     * @param[out] max_loc       (Optional) Array of Coordinates2D used to store maximum value locations.
-     */
-    void configure(const ICLImage *input, cl::Buffer *min_max, cl::Buffer *min_max_count,
-                   ICLCoordinates2DArray *min_loc = nullptr, ICLCoordinates2DArray *max_loc = nullptr);
-    /** Initialise the kernel's input and outputs.
-     *
-     * @note When locations of min and max occurrences are requested, the reported number of locations is limited to the given array size.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Input image. Data types supported: U8/S16/F32.
-     * @param[out] min_max         Buffer of 2 elements to store the min value at position 0 and the max value at position 1. Data type supported: S32 if input type is U8/S16, F32 if input type is F32.
-     * @param[out] min_max_count   Buffer of 2 elements to store the min value occurrences at position 0 and the max value occurrences at position 1. Data type supported: S32
-     * @param[out] min_loc         (Optional) Array of Coordinates2D used to store minimum value locations.
-     * @param[out] max_loc         (Optional) Array of Coordinates2D used to store maximum value locations.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLImage *input, cl::Buffer *min_max, cl::Buffer *min_max_count,
-                   ICLCoordinates2DArray *min_loc = nullptr, ICLCoordinates2DArray *max_loc = nullptr);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLImage *_input;         /**< Input image. */
-    cl::Buffer     *_min_max_count; /**< Minimum/maximum value occurrences. */
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLMINMAXLOCATIONKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLNonLinearFilterKernel.h b/arm_compute/core/CL/kernels/CLNonLinearFilterKernel.h
deleted file mode 100644
index 1f337356e9..0000000000
--- a/arm_compute/core/CL/kernels/CLNonLinearFilterKernel.h
+++ /dev/null
@@ -1,77 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLNONLINEARFILTERKERNEL_H
-#define ARM_COMPUTE_CLNONLINEARFILTERKERNEL_H
-
-#include "arm_compute/core/CL/ICLSimple2DKernel.h"
-#include "arm_compute/core/Types.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the kernel to apply a non-linear filter */
-class CLNonLinearFilterKernel : public ICLSimple2DKernel
-{
-public:
-    /** Default constructor */
-    CLNonLinearFilterKernel();
-    /** Set the source, destination and border mode of the kernel
-     *
-     * @param[in]  input            Source tensor. Data types supported: U8
-     * @param[out] output           Destination tensor. Data types supported: U8
-     * @param[in]  function         Non linear function to perform
-     * @param[in]  mask_size        Mask size. Supported sizes: 3, 5
-     * @param[in]  pattern          Mask pattern
-     * @param[in]  mask             The given mask. Will be used only if pattern is specified to PATTERN_OTHER
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const ICLTensor *input, ICLTensor *output, NonLinearFilterFunction function,
-                   unsigned int mask_size, MatrixPattern pattern, const uint8_t *mask,
-                   bool border_undefined);
-    /** Set the source, destination and border mode of the kernel
-     *
-     * @param[in]  compile_context  The compile context to be used.
-     * @param[in]  input            Source tensor. Data types supported: U8
-     * @param[out] output           Destination tensor. Data types supported: U8
-     * @param[in]  function         Non linear function to perform
-     * @param[in]  mask_size        Mask size. Supported sizes: 3, 5
-     * @param[in]  pattern          Mask pattern
-     * @param[in]  mask             The given mask. Will be used only if pattern is specified to PATTERN_OTHER
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, NonLinearFilterFunction function,
-                   unsigned int mask_size, MatrixPattern pattern, const uint8_t *mask,
-                   bool border_undefined);
-
-    // Inherited methods overridden:
-    BorderSize border_size() const override;
-
-private:
-    BorderSize _border_size; /**< Border size */
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLNONLINEARFILTERKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.h b/arm_compute/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.h
deleted file mode 100644
index a256bc798d..0000000000
--- a/arm_compute/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.h
+++ /dev/null
@@ -1,60 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLNONMAXIMASUPPRESSION3x3KERNEL_H
-#define ARM_COMPUTE_CLNONMAXIMASUPPRESSION3x3KERNEL_H
-
-#include "arm_compute/core/CL/ICLSimple2DKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface to perform Non-Maxima suppression over a 3x3 window using OpenCL
- *
- * @note Used by @ref CLFastCorners and @ref CLHarrisCorners
- */
-class CLNonMaximaSuppression3x3Kernel : public ICLSimple2DKernel
-{
-public:
-    /** Initialise the kernel's sources, destinations and border mode.
-     *
-     * @param[in]  input            Source tensor. Data types supported: U8, F32. (Must be the same as the output tensor)
-     * @param[out] output           Destination tensor. Data types supported: U8, F32. (Must be the same as the input tensor)
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const ICLTensor *input, ICLTensor *output, bool border_undefined);
-    /** Initialise the kernel's sources, destinations and border mode.
-     *
-     * @param[in]  compile_context  The compile context to be used.
-     * @param[in]  input            Source tensor. Data types supported: U8, F32. (Must be the same as the output tensor)
-     * @param[out] output           Destination tensor. Data types supported: U8, F32. (Must be the same as the input tensor)
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, bool border_undefined);
-
-    // Inherited methods overridden:
-    BorderSize border_size() const override;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLNONMAXIMASUPPRESSION3x3KERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLNormalizationLayerKernel.h b/arm_compute/core/CL/kernels/CLNormalizationLayerKernel.h
deleted file mode 100644
index 2511818ef2..0000000000
--- a/arm_compute/core/CL/kernels/CLNormalizationLayerKernel.h
+++ /dev/null
@@ -1,90 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLNORMALIZATIONLAYERKERNEL_H
-#define ARM_COMPUTE_CLNORMALIZATIONLAYERKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the normalization layer kernel.
- */
-class CLNormalizationLayerKernel : public ICLKernel
-{
-public:
-    /** Constructor */
-    CLNormalizationLayerKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLNormalizationLayerKernel(const CLNormalizationLayerKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLNormalizationLayerKernel &operator=(const CLNormalizationLayerKernel &) = delete;
-    /** Default Move Constructor. */
-    CLNormalizationLayerKernel(CLNormalizationLayerKernel &&) = default;
-    /** Default move assignment operator */
-    CLNormalizationLayerKernel &operator=(CLNormalizationLayerKernel &&) = default;
-    /** Set the input and output tensors.
-     *
-     * @param[in]  input     Source tensor. 3 lower dims represent a single input with dimensions [width, height, IFM],
-     *                       and an optional 4th dimension for batch of inputs. Data types supported: F16/F32. Data layouts supported: NCHW/NHWC.
-     * @param[out] output    Destination tensor. Output will have the same number of dimensions as input. Data types supported: same as @p input.
-     *                       Data layouts supported: same as @p input.
-     * @param[in]  norm_info Normalization layer information like the normalization type, normalization size and other parameters.
-     */
-    void configure(const ICLTensor *input, ICLTensor *output, NormalizationLayerInfo norm_info);
-    /** Set the input and output tensors.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Source tensor. 3 lower dims represent a single input with dimensions [width, height, IFM],
-     *                             and an optional 4th dimension for batch of inputs. Data types supported: F16/F32. Data layouts supported: NCHW/NHWC.
-     * @param[out] output          Destination tensor. Output will have the same number of dimensions as input. Data types supported: same as @p input.
-     *                             Data layouts supported: same as @p input.
-     * @param[in]  norm_info       Normalization layer information like the normalization type, normalization size and other parameters.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, NormalizationLayerInfo norm_info);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLNormalizationLayerKernel
-     *
-     * @param[in] input     Source tensor. 3 lower dims represent a single input with dimensions [width, height, IFM],
-     *                      and an optional 4th dimension for batch of inputs. Data types supported: F16/F32. Data layouts supported: NCHW/NHWC.
-     * @param[in] output    Destination tensor. Output will have the same number of dimensions as input. Data types supported: same as @p input.
-     *                      Data layouts supported: same as @p input.
-     * @param[in] norm_info Normalization layer information like the normalization type, normalization size and other parameters.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, NormalizationLayerInfo norm_info);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-    BorderSize border_size() const override;
-
-private:
-    const ICLTensor *_input;
-    ICLTensor       *_output;
-    BorderSize       _border_size;
-    bool             _is_norm_across_width;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLNORMALIZATIONLAYERKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h b/arm_compute/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h
deleted file mode 100644
index d247e1fddc..0000000000
--- a/arm_compute/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h
+++ /dev/null
@@ -1,94 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLNORMALIZEPLANARYUVLAYERKERNEL_H
-#define ARM_COMPUTE_CLNORMALIZEPLANARYUVLAYERKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the NormalizePlanarYUV layer kernel. */
-class CLNormalizePlanarYUVLayerKernel : public ICLKernel
-{
-public:
-    /** Constructor */
-    CLNormalizePlanarYUVLayerKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLNormalizePlanarYUVLayerKernel(const CLNormalizePlanarYUVLayerKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLNormalizePlanarYUVLayerKernel &operator=(const CLNormalizePlanarYUVLayerKernel &) = delete;
-    /** Default Move Constructor. */
-    CLNormalizePlanarYUVLayerKernel(CLNormalizePlanarYUVLayerKernel &&) = default;
-    /** Default move assignment operator */
-    CLNormalizePlanarYUVLayerKernel &operator=(CLNormalizePlanarYUVLayerKernel &&) = default;
-    /** Default destructor */
-    ~CLNormalizePlanarYUVLayerKernel() = default;
-
-    /** Set the input and output tensors.
-     *
-     * @param[in]  input  Source tensor. 3 lower dimensions represent a single input with dimensions [width, height, channels].
-     *                    Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
-     * @param[out] output Destination tensor. Data type supported: same as @p input
-     * @param[in]  mean   Mean values tensor. 1 dimension with size equal to the number of input channels. Data types supported: same as @p input
-     * @param[in]  std    Standard deviation values tensor. 1 dimension with size equal to the number of input channels.
-     *                    Data types supported: same as @p input
-     */
-    void configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *std);
-    /** Set the input and output tensors.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Source tensor. 3 lower dimensions represent a single input with dimensions [width, height, channels].
-     *                             Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
-     * @param[out] output          Destination tensor. Data type supported: same as @p input
-     * @param[in]  mean            Mean values tensor. 1 dimension with size equal to the number of input channels. Data types supported: same as @p input
-     * @param[in]  std             Standard deviation values tensor. 1 dimension with size equal to the number of input channels.
-     *                             Data types supported: same as @p input
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *std);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLNormalizePlanarYUVLayerKernel
-     *
-     * @param[in]  input  Source tensor info. 3 lower dimensions represent a single input with dimensions [width, height, channels].
-     *                    Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
-     * @param[out] output Destination tensor info. Data type supported: same as @p input
-     * @param[in]  mean   Mean values tensor info. 1 dimension with size equal to the number of input channels. Data types supported: same as @p input
-     * @param[in]  std    Standard deviation values tensor info. 1 dimension with size equal to the number of input channels.
-     *                    Data types supported: same as @p input
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *mean, const ITensorInfo *std);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_input;
-    ICLTensor       *_output;
-    const ICLTensor *_mean;
-    const ICLTensor *_std;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLNORMALIZEPLANARYUVLAYERKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLPadLayerKernel.h b/arm_compute/core/CL/kernels/CLPadLayerKernel.h
deleted file mode 100644
index 166c202335..0000000000
--- a/arm_compute/core/CL/kernels/CLPadLayerKernel.h
+++ /dev/null
@@ -1,97 +0,0 @@
-/*
- * Copyright (c) 2019-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLPADLAYERKERNEL_H
-#define ARM_COMPUTE_CLPADLAYERKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the PadLayer function. */
-class CLPadLayerKernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLPadLayerKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLPadLayerKernel(const CLPadLayerKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLPadLayerKernel &operator=(const CLPadLayerKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLPadLayerKernel(CLPadLayerKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLPadLayerKernel &operator=(CLPadLayerKernel &&) = default;
-    /** Default destructor */
-    ~CLPadLayerKernel() = default;
-    /** Set the input and output tensor.
-     *
-     * @param[in]  input          Source tensor. Data types supported: U8, S8, QASYMM8, QASYMM8_SIGNED, U16, S16, U32, S32, F16, F32.
-     * @param[out] output         Output tensor. Data type supported: same as @p input
-     * @param[in]  padding        The padding for each spatial dimension of the input tensor. The pair padding[i]
-     *                            specifies the front and the end padding in the i-th dimension.
-     * @param[in]  constant_value (Optional) Constant value to be used for the padding.
-     * @param[in]  mode           (Optional) Controls whether the padding should be filled with @p constant_value using CONSTANT,
-     *                            or reflect the input, either including the border values (SYMMETRIC) or not (REFLECT).
-     */
-    void configure(const ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value = PixelValue(), PaddingMode mode = PaddingMode::CONSTANT);
-    /** Set the input and output tensor.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Source tensor. Data types supported: All.
-     * @param[out] output          Output tensor. Data type supported: same as @p input
-     * @param[in]  padding         The padding for each spatial dimension of the input tensor. The pair padding[i]
-     *                             specifies the front and the end padding in the i-th dimension.
-     * @param[in]  constant_value  (Optional) Constant value to be used for the padding.
-     * @param[in]  mode            (Optional) Controls whether the padding should be filled with @p constant_value using CONSTANT,
-     *                             or reflect the input, either including the border values (SYMMETRIC) or not (REFLECT).
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value = PixelValue(),
-                   PaddingMode mode = PaddingMode::CONSTANT);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLPadLayerKernel
-     *
-     * @param[in] input          Source tensor info. Data types supported: U8, S8, QASYMM8, QASYMM8_SIGNED, U16, S16, U32, S32, F16, F32.
-     * @param[in] output         Output tensor info. Data type supported: same as @p input
-     * @param[in] padding        The padding for each spatial dimension of the input tensor. The pair padding[i]
-     *                           specifies the front and the end padding in the i-th dimension.
-     * @param[in] constant_value (Optional) Constant value to be used for the padding.
-     * @param[in] mode           (Optional) Controls whether the padding should be filled with @p constant_value using CONSTANT,
-     *                            or reflect the input, either including the border values (SYMMETRIC) or not (REFLECT).
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, PixelValue constant_value = PixelValue(), PaddingMode mode = PaddingMode::CONSTANT);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_input;
-    ICLTensor       *_output;
-    int              _input_start_x;
-    int              _input_start_y;
-    bool             _4d_enabled;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLPADLAYERKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLPermuteKernel.h b/arm_compute/core/CL/kernels/CLPermuteKernel.h
deleted file mode 100644
index 1a9240ef6b..0000000000
--- a/arm_compute/core/CL/kernels/CLPermuteKernel.h
+++ /dev/null
@@ -1,90 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLPERMUTEKERNEL_H
-#define ARM_COMPUTE_CLPERMUTEKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel to perform tensor permutation.
- *
- * Permutes given a permutation vector
- */
-class CLPermuteKernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLPermuteKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLPermuteKernel(const CLPermuteKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLPermuteKernel &operator=(const CLPermuteKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLPermuteKernel(CLPermuteKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLPermuteKernel &operator=(CLPermuteKernel &&) = default;
-    /** Set the input and output of the kernel.
-     *
-     * @note Arbitrary permutation vectors are supported with rank not greater than 4
-     *
-     * @param[in] input  The input tensor to permute. Data types supported: All.
-     * @param[in] output The output tensor. Data types supported: Same as @p input
-     * @param[in] perm   Permutation vector
-     */
-    void configure(const ICLTensor *input, ICLTensor *output, const PermutationVector &perm);
-    /** Set the input and output of the kernel.
-     *
-     * @note Arbitrary permutation vectors are supported with rank not greater than 4
-     *
-     * @param[in] compile_context The compile context to be used.
-     * @param[in] input           The input tensor to permute. Data types supported: All.
-     * @param[in] output          The output tensor. Data types supported: Same as @p input
-     * @param[in] perm            Permutation vector
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const PermutationVector &perm);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLPermuteKernel
-     *
-     * @note Arbitrary permutation vectors are supported with rank not greater than 4
-     *
-     * @param[in] input  First tensor input info. Data types supported: All.
-     * @param[in] output Output tensor info. Data types supported: same as @p input.
-     * @param[in] perm   Permutation vector
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const PermutationVector &perm);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_input;
-    ICLTensor        *_output;
-    PermutationVector _perm;
-};
-} // arm_compute
-#endif /*ARM_COMPUTE_CLPERMUTEKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLPixelWiseMultiplicationKernel.h b/arm_compute/core/CL/kernels/CLPixelWiseMultiplicationKernel.h
deleted file mode 100644
index 52a09d9a49..0000000000
--- a/arm_compute/core/CL/kernels/CLPixelWiseMultiplicationKernel.h
+++ /dev/null
@@ -1,170 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLPIXELWISEMULTIPLICATIONKERNEL_H
-#define ARM_COMPUTE_CLPIXELWISEMULTIPLICATIONKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-// Forward declarations
-class ICLTensor;
-
-/** Interface for the pixelwise multiplication kernel. */
-class CLPixelWiseMultiplicationKernel : public ICLKernel
-{
-public:
-    /** Default constructor.*/
-    CLPixelWiseMultiplicationKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLPixelWiseMultiplicationKernel(const CLPixelWiseMultiplicationKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLPixelWiseMultiplicationKernel &operator=(const CLPixelWiseMultiplicationKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLPixelWiseMultiplicationKernel(CLPixelWiseMultiplicationKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLPixelWiseMultiplicationKernel &operator=(CLPixelWiseMultiplicationKernel &&) = default;
-    /** Initialise the kernel's input, output and border mode.
-     *
-     * @param[in]  input1          An input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32.
-     * @param[in]  input2          An input tensor. Data types supported: same as @p input1.
-     * @param[out] output          The output tensor, Data types supported:
-     *                             - U8, only if both input are U8
-     *                             - QASYMM8, only if both inputs are QASYMM8
-     *                             - QASYMM8_SIGNED, only if both inputs are QASYMM8_SIGNED
-     *                             - S16
-     *                             - QSYMM16, only if both inputs are QSYMM16
-     *                             - S32, only if both inputs are QSYMM16
-     *                             - F16
-     *                             - F32
-     * @param[in]  scale           Scale to apply after multiplication.
-     *                             Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15.
-     * @param[in]  overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate
-     * @param[in]  rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest even.
-     * @param[in]  act_info        (Optional) Activation layer information in case of a fused activation.
-     */
-    void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float scale,
-                   ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
-    /** Initialise the kernel's input, output and border mode.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input1          An input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32.
-     * @param[in]  input2          An input tensor. Data types supported: same as @p input1.
-     * @param[out] output          The output tensor, Data types supported: same as @p input1. Note: U8 requires both inputs to be U8.
-     * @param[in]  scale           Scale to apply after multiplication.
-     *                             Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15.
-     * @param[in]  overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate
-     * @param[in]  rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest even.
-     * @param[in]  act_info        (Optional) Activation layer information in case of a fused activation.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float scale,
-                   ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
-    /** Static function to check if given info will lead to a valid configuration of @ref CLPixelWiseMultiplicationKernel
-     *
-     * @param[in] input1          An input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32.
-     * @param[in] input2          An input tensor info. Data types supported: same as @p input1.
-     * @param[in] output          The output tensor info, Data types supported:
-     *                            - U8, only if both input are U8
-     *                            - QASYMM8, only if both inputs are QASYMM8
-     *                            - QASYMM8_SIGNED, only if both inputs are QASYMM8_SIGNED
-     *                            - S16
-     *                            - QSYMM16, only if both inputs are QSYMM16
-     *                            - S32, only if both inputs are QSYMM16
-     *                            - F16
-     *                            - F32
-     * @param[in] scale           Scale to apply after multiplication.
-     *                            Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15.
-     * @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate
-     * @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest even.
-     * @param[in] act_info        (Optional) Activation layer information in case of a fused activation.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale,
-                           ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-    BorderSize border_size() const override;
-
-private:
-    const ICLTensor *_input1;
-    const ICLTensor *_input2;
-    ICLTensor       *_output;
-};
-
-/** Interface for the complex pixelwise multiplication kernel. */
-class CLComplexPixelWiseMultiplicationKernel : public ICLKernel
-{
-public:
-    /** Default constructor.*/
-    CLComplexPixelWiseMultiplicationKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLComplexPixelWiseMultiplicationKernel(const CLComplexPixelWiseMultiplicationKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLComplexPixelWiseMultiplicationKernel &operator=(const CLComplexPixelWiseMultiplicationKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLComplexPixelWiseMultiplicationKernel(CLComplexPixelWiseMultiplicationKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLComplexPixelWiseMultiplicationKernel &operator=(CLComplexPixelWiseMultiplicationKernel &&) = default;
-    /** Initialise the kernel's input, output and border mode.
-     *
-     * @param[in]  input1   An input tensor. Data types supported: F32. Number of channels supported: 2.
-     * @param[in]  input2   An input tensor. Data types supported: same as @p input1. Number of channels supported: same as @p input1.
-     * @param[out] output   The output tensor, Data types supported: same as @p input1. Number of channels supported: same as @p input1.
-     * @param[in]  act_info (Optional) Activation layer information in case of a fused activation.
-     */
-    void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
-    /** Initialise the kernel's input, output and border mode.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input1          An input tensor. Data types supported: F32. Number of channels supported: 2.
-     * @param[in]  input2          An input tensor. Data types supported: same as @p input1. Number of channels supported: same as @p input1.
-     * @param[out] output          The output tensor, Data types supported: same as @p input1. Number of channels supported: same as @p input1.
-     * @param[in]  act_info        (Optional) Activation layer information in case of a fused activation.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
-    /** Static function to check if given info will lead to a valid configuration of @ref CLComplexPixelWiseMultiplicationKernel
-     *
-     * @param[in] input1   An input tensor info. Data types supported: F32. Number of channels supported: 2.
-     * @param[in] input2   An input tensor info. Data types supported: same as @p input1. Number of channels supported: same as @p input1.
-     * @param[in] output   The output tensor info. Data types supported: same as @p input1. Number of channels supported: same as @p input1.
-     * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-    BorderSize border_size() const override;
-
-private:
-    const ICLTensor *_input1;
-    const ICLTensor *_input2;
-    ICLTensor       *_output;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLPIXELWISEMULTIPLICATIONKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLPoolingLayerKernel.h b/arm_compute/core/CL/kernels/CLPoolingLayerKernel.h
deleted file mode 100644
index 395750440c..0000000000
--- a/arm_compute/core/CL/kernels/CLPoolingLayerKernel.h
+++ /dev/null
@@ -1,96 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLPOOLINGLAYERKERNEL_H
-#define ARM_COMPUTE_CLPOOLINGLAYERKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-#include "arm_compute/core/Error.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the pooling layer kernel */
-class CLPoolingLayerKernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLPoolingLayerKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLPoolingLayerKernel(const CLPoolingLayerKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLPoolingLayerKernel &operator=(const CLPoolingLayerKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLPoolingLayerKernel(CLPoolingLayerKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLPoolingLayerKernel &operator=(CLPoolingLayerKernel &&) = default;
-    /** Default destructor */
-    ~CLPoolingLayerKernel() = default;
-
-    /** Set the input and output tensors.
-     *
-     *
-     * @param[in]  input     Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
-     * @param[out] output    Destination tensor. Data types supported: Same as @p input.
-     * @param[in]  pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
-     * @param[out] indices   (optional) The indices of the maximal values. Data type supported: U32.
-     */
-    void configure(const ICLTensor *input, ICLTensor *output, const PoolingLayerInfo &pool_info, ICLTensor *indices = nullptr);
-    /** Set the input and output tensors.
-     *
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
-     * @param[out] output          Destination tensor. Data types supported: Same as @p input.
-     * @param[in]  pool_info       Contains pooling operation information described in @ref PoolingLayerInfo.
-     * @param[out] indices         (optional) The indices of the maximal values. Data type supported: U32.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const PoolingLayerInfo &pool_info, ICLTensor *indices = nullptr);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLPoolingLayerKernel
-     *
-     * @param[in] input     Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
-     * @param[in] output    Destination tensor info. Data types supported: Same as @p input.
-     * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
-     * @param[in] indices   (optional) The indices of the maximal values. Data type supported: U32.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, const ITensorInfo *indices = nullptr);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-    BorderSize border_size() const override;
-
-public:
-    const ICLTensor *_input;
-    ICLTensor       *_output;
-    ICLTensor       *_indices;
-    PoolingLayerInfo _pool_info;
-    DataLayout       _data_layout;
-    BorderSize       _border_size;
-    unsigned int     _num_elems_processed_per_iteration;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLPOOLINGLAYERKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLPriorBoxLayerKernel.h b/arm_compute/core/CL/kernels/CLPriorBoxLayerKernel.h
deleted file mode 100644
index 5fd27d9233..0000000000
--- a/arm_compute/core/CL/kernels/CLPriorBoxLayerKernel.h
+++ /dev/null
@@ -1,99 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLPRIORBOXLAYERKERNEL_H
-#define ARM_COMPUTE_CLPRIORBOXLAYERKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the PriorBox layer kernel. */
-class CLPriorBoxLayerKernel : public ICLKernel
-{
-public:
-    /** Constructor */
-    CLPriorBoxLayerKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLPriorBoxLayerKernel(const CLPriorBoxLayerKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLPriorBoxLayerKernel &operator=(const CLPriorBoxLayerKernel &) = delete;
-    /** Default Move Constructor. */
-    CLPriorBoxLayerKernel(CLPriorBoxLayerKernel &&) = default;
-    /** Default move assignment operator */
-    CLPriorBoxLayerKernel &operator=(CLPriorBoxLayerKernel &&) = default;
-    /** Default destructor */
-    ~CLPriorBoxLayerKernel() = default;
-
-    /** Set the input and output tensors.
-     *
-     * @param[in]  input1        First source tensor. Data types supported: F32. Data layouts supported: NCHW/NHWC.
-     * @param[in]  input2        Second source tensor. Data types and layouts supported: same as @p input1
-     * @param[out] output        Destination tensor. Output dimensions are [W * H * num_priors * 4, 2]. Data types and layouts supported: same as @p input1
-     * @param[in]  info          Prior box layer info.
-     * @param[in]  min           Minimum prior box values
-     * @param[in]  max           Maximum prior box values
-     * @param[in]  aspect_ratios Aspect ratio values
-     */
-    void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const PriorBoxLayerInfo &info, cl::Buffer *min, cl::Buffer *max, cl::Buffer *aspect_ratios);
-    /** Set the input and output tensors.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input1          First source tensor. Data types supported: F32. Data layouts supported: NCHW/NHWC.
-     * @param[in]  input2          Second source tensor. Data types and layouts supported: same as @p input1
-     * @param[out] output          Destination tensor. Output dimensions are [W * H * num_priors * 4, 2]. Data types and layouts supported: same as @p input1
-     * @param[in]  info            Prior box layer info.
-     * @param[in]  min             Minimum prior box values
-     * @param[in]  max             Maximum prior box values
-     * @param[in]  aspect_ratios   Aspect ratio values
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const PriorBoxLayerInfo &info, cl::Buffer *min, cl::Buffer *max,
-                   cl::Buffer *aspect_ratios);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLPriorBoxLayerKernel
-     *
-     * @param[in] input1 First source tensor info. Data types supported: F32. Data layouts supported: NCHW/NHWC.
-     * @param[in] input2 Second source tensor info. Data types and layouts supported: same as @p input1
-     * @param[in] output Destination tensor info. Output dimensions are [W * H * num_priors * 4, 2]. Data type supported: same as @p input1
-     * @param[in] info   Prior box layer info.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const PriorBoxLayerInfo &info);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_input1;
-    const ICLTensor *_input2;
-    ICLTensor        *_output;
-    PriorBoxLayerInfo _info;
-    int               _num_priors;
-    cl::Buffer       *_min;
-    cl::Buffer       *_max;
-    cl::Buffer       *_aspect_ratios;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLPRIORBOXLAYERKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h b/arm_compute/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h
deleted file mode 100644
index 2d4707245f..0000000000
--- a/arm_compute/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h
+++ /dev/null
@@ -1,88 +0,0 @@
-/*
- * Copyright (c) 2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLQLSTMLAYERVNORMALIZATIONKERNEL_H
-#define ARM_COMPUTE_CLQLSTMLAYERVNORMALIZATIONKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the kernel to do layer normalization. */
-class CLQLSTMLayerNormalizationKernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLQLSTMLayerNormalizationKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLQLSTMLayerNormalizationKernel(const CLQLSTMLayerNormalizationKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLQLSTMLayerNormalizationKernel &operator=(const CLQLSTMLayerNormalizationKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLQLSTMLayerNormalizationKernel(CLQLSTMLayerNormalizationKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLQLSTMLayerNormalizationKernel &operator=(CLQLSTMLayerNormalizationKernel &&) = default;
-    /** Default destructor */
-    ~CLQLSTMLayerNormalizationKernel() = default;
-    /** Initialise the kernel's input and outputs.
-     *
-     * @param[in]  input  Source tensor with 2 dimensions. Data types supported: QSYMM16.
-     * @param[out] output Destination tensor. Data type supported: same as @p input
-     * @param[in]  weight Weight tensor. Data types supported: Same as @p input.
-     * @param[in]  bias   Bias tensor. Data types supported: S32.
-     *
-     */
-    void configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *weight, const ICLTensor *bias);
-    /** Initialise the kernel's input and outputs.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Source tensor with 2 dimensions. Data types supported: QSYMM16.
-     * @param[out] output          Destination tensor. Data type supported: same as @p input
-     * @param[in]  weight          Weight tensor. Data types supported: Same as @p input.
-     * @param[in]  bias            Bias tensor. Data types supported: S32.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *weight, const ICLTensor *bias);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLQLSTMLayerNormalizationKernel
-     *
-     * @param[in] input  Source tensor info with 2 dimensions. Data types supported: QSYMM16.
-     * @param[in] output Destination info tensor. Data type supported: same as @p input
-     * @param[in] weight Weight info tensor. Data types supported: Same as @p input.
-     * @param[in] bias   Bias tensor info. Data types supported: S32.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *weight, const ITensorInfo *bias);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_input;
-    const ICLTensor *_weight;
-    const ICLTensor *_bias;
-    ICLTensor       *_output;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLQLSTMLAYERVNORMALIZATIONKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLQuantizationLayerKernel.h b/arm_compute/core/CL/kernels/CLQuantizationLayerKernel.h
deleted file mode 100644
index de30447e17..0000000000
--- a/arm_compute/core/CL/kernels/CLQuantizationLayerKernel.h
+++ /dev/null
@@ -1,86 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLQUANTIZATIONLAYERKERNEL_H
-#define ARM_COMPUTE_CLQUANTIZATIONLAYERKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the quantization layer kernel.
- *
- * @note The implementation supports only 3D input tensors.
- */
-class CLQuantizationLayerKernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLQuantizationLayerKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLQuantizationLayerKernel(const CLQuantizationLayerKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLQuantizationLayerKernel &operator=(const CLQuantizationLayerKernel &) = delete;
-    /** Default Move Constructor. */
-    CLQuantizationLayerKernel(CLQuantizationLayerKernel &&) = default;
-    /** Default move assignment operator */
-    CLQuantizationLayerKernel &operator=(CLQuantizationLayerKernel &&) = default;
-    /** Default destructor */
-    ~CLQuantizationLayerKernel() = default;
-    /** Set the input, output.
-     *
-     * @param[in]  input  Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F32/F16.
-     * @param[out] output Destination tensor with the same dimensions of input. Data types supported: QASYMM8/QASYMM8_SIGNED/QASYMM16.
-     *
-     * @note Output auto initialization is not supported by this kernel
-     */
-    void configure(const ICLTensor *input, ICLTensor *output);
-    /** Set the input, output.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F32/F16.
-     * @param[out] output          Destination tensor with the same dimensions of input. Data types supported: QASYMM8/QASYMM8_SIGNED/QASYMM16.
-     *
-     * @note Output auto initialization is not supported by this kernel
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLQuantizationLayerKernel
-     *
-     * @param[in] input  Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F32/F16.
-     * @param[in] output Destination tensor info with the same dimensions of input. Data types supported: QASYMM8/QASYMM8_SIGNED/QASYMM16.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_input;
-    ICLTensor       *_output;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLQUANTIZATIONLAYERKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLROIAlignLayerKernel.h b/arm_compute/core/CL/kernels/CLROIAlignLayerKernel.h
deleted file mode 100644
index 30bdbb1844..0000000000
--- a/arm_compute/core/CL/kernels/CLROIAlignLayerKernel.h
+++ /dev/null
@@ -1,110 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLROIALIGNLAYERKERNEL_H
-#define ARM_COMPUTE_CLROIALIGNLAYERKERNEL_H
-
-#include "arm_compute/core/CL/ICLArray.h"
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the RoIAlign kernel.
- */
-class CLROIAlignLayerKernel : public ICLKernel
-{
-public:
-    /** Constructor */
-    CLROIAlignLayerKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLROIAlignLayerKernel(const CLROIAlignLayerKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLROIAlignLayerKernel &operator=(const CLROIAlignLayerKernel &) = delete;
-    /** Default Move Constructor. */
-    CLROIAlignLayerKernel(CLROIAlignLayerKernel &&) = default;
-    /** Default move assignment operator. */
-    CLROIAlignLayerKernel &operator=(CLROIAlignLayerKernel &&) = default;
-    /** Default destructor */
-    ~CLROIAlignLayerKernel() = default;
-
-    /** Set the input and output tensors.
-     *
-     * @param[in]  input     Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
-     * @param[in]  rois      ROIs tensor, it is a 2D tensor of size [5, N] (where N is the number of ROIs) containing top left and bottom right corner
-     *                       as coordinate of an image and batch_id of ROI [ batch_id, x1, y1, x2, y2 ].
-     *                       Data types supported: QASYMM16 with scale of 0.125 and 0 offset if @p input is QASYMM8/QASYMM8_SIGNED, otherwise same as @p input
-     * @param[out] output    Destination tensor. Data types supported: Same as @p input.
-     * @param[in]  pool_info Contains pooling operation information described in @ref ROIPoolingLayerInfo.
-     *
-     * @note The x and y dimensions of @p output tensor must be the same as @p pool_info 's pooled
-     * width and pooled height.
-     * @note The z dimensions of @p output tensor and @p input tensor must be the same.
-     * @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois array.
-     */
-    void configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info);
-    /** Set the input and output tensors.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
-     * @param[in]  rois            ROIs tensor, it is a 2D tensor of size [5, N] (where N is the number of ROIs) containing top left and bottom right corner
-     *                             as coordinate of an image and batch_id of ROI [ batch_id, x1, y1, x2, y2 ].
-     *                             Data types supported: QASYMM16 with scale of 0.125 and 0 offset if @p input is QASYMM8/QASYMM8_SIGNED, otherwise same as @p input
-     * @param[out] output          Destination tensor. Data types supported: Same as @p input.
-     * @param[in]  pool_info       Contains pooling operation information described in @ref ROIPoolingLayerInfo.
-     *
-     * @note The x and y dimensions of @p output tensor must be the same as @p pool_info 's pooled
-     * width and pooled height.
-     * @note The z dimensions of @p output tensor and @p input tensor must be the same.
-     * @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois array.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLROIAlignLayerKernel
-     *
-     * @param[in] input     Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
-     * @param[in] rois      ROIs tensor info. Data types supported: QASYMM16 with scale of 0.125 and 0 offset if @p input is QASYMM8/QASYMM8_SIGNED,
-     *                      otherwise same as @p input
-     * @param[in] output    Destination tensor info. Data types supported: Same as @p input.
-     * @param[in] pool_info Contains pooling operation information described in @ref ROIPoolingLayerInfo.
-     *
-     * @note The x and y dimensions of @p output tensor must be the same as @p pool_info 's pooled
-     * width and pooled height.
-     * @note The z dimensions of @p output tensor and @p input tensor must be the same.
-     * @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois array.
-     *
-     * @return a Status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue);
-
-private:
-    const ICLTensor    *_input;
-    ICLTensor          *_output;
-    const ICLTensor    *_rois;
-    ROIPoolingLayerInfo _pool_info;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLROIALIGNLAYERKERNEL_H*/
diff --git a/arm_compute/core/CL/kernels/CLROIPoolingLayerKernel.h b/arm_compute/core/CL/kernels/CLROIPoolingLayerKernel.h
deleted file mode 100644
index ea70a58188..0000000000
--- a/arm_compute/core/CL/kernels/CLROIPoolingLayerKernel.h
+++ /dev/null
@@ -1,92 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLROIPOOLINGLAYERKERNEL_H
-#define ARM_COMPUTE_CLROIPOOLINGLAYERKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-#include "arm_compute/core/CL/ICLArray.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the ROI pooling layer kernel */
-class CLROIPoolingLayerKernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLROIPoolingLayerKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLROIPoolingLayerKernel(const CLROIPoolingLayerKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLROIPoolingLayerKernel &operator=(const CLROIPoolingLayerKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLROIPoolingLayerKernel(CLROIPoolingLayerKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLROIPoolingLayerKernel &operator=(CLROIPoolingLayerKernel &&) = default;
-    /** Default destructor */
-    ~CLROIPoolingLayerKernel() = default;
-
-    /** Set the input and output tensors.
-     *
-     * @param[in]  input     Source tensor. Data types supported: F16/F32.
-     * @param[in]  rois      ROIs tensor, it is a 2D tensor of size [5, N] (where N is the number of ROIs) containing top left and bottom right corner
-     *                       as coordinate of an image and batch_id of ROI [ batch_id, x1, y1, x2, y2 ]. Data types supported: U16
-     * @param[out] output    Destination tensor. Data types supported: Same as @p input.
-     * @param[in]  pool_info Contains pooling operation information described in @ref ROIPoolingLayerInfo.
-     *
-     * @note The x and y dimensions of @p output tensor must be the same as @p pool_info 's pooled
-     * width and pooled height.
-     * @note The z dimensions of @p output tensor and @p input tensor must be the same.
-     * @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois array.
-     */
-    void configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info);
-    /** Set the input and output tensors.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Source tensor. Data types supported: F16/F32.
-     * @param[in]  rois            ROIs tensor, it is a 2D tensor of size [5, N] (where N is the number of ROIs) containing top left and bottom right corner
-     *                             as coordinate of an image and batch_id of ROI [ batch_id, x1, y1, x2, y2 ]. Data types supported: U16
-     * @param[out] output          Destination tensor. Data types supported: Same as @p input.
-     * @param[in]  pool_info       Contains pooling operation information described in @ref ROIPoolingLayerInfo.
-     *
-     * @note The x and y dimensions of @p output tensor must be the same as @p pool_info 's pooled
-     * width and pooled height.
-     * @note The z dimensions of @p output tensor and @p input tensor must be the same.
-     * @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois array.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor    *_input;
-    const ICLTensor    *_rois;
-    ICLTensor          *_output;
-    ROIPoolingLayerInfo _pool_info;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLROIPOOLINGLAYERKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLRangeKernel.h b/arm_compute/core/CL/kernels/CLRangeKernel.h
deleted file mode 100644
index fc8db98bf9..0000000000
--- a/arm_compute/core/CL/kernels/CLRangeKernel.h
+++ /dev/null
@@ -1,92 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLRANGEKERNEL_H
-#define ARM_COMPUTE_CLRANGEKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Kernel class for Range
- *
- * range generates a 1-D tensor containing a sequence of numbers that begins at 'start' and extends by increments
- * of 'step' up to but not including 'end'.
- */
-class CLRangeKernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLRangeKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLRangeKernel(const CLRangeKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLRangeKernel &operator=(const CLRangeKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLRangeKernel(CLRangeKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLRangeKernel &operator=(CLRangeKernel &&) = default;
-    /** Default destructor */
-    ~CLRangeKernel() = default;
-    /** Initialize the kernel's output tensor, start, end and step of the sequence.
-     *
-     * @param[out] output Output tensor. Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32.
-     * @param[in]  start  The starting value of the sequence.
-     * @param[in]  end    The ending (not including) value of the sequence.
-     * @param[in]  step   The gap between each pair of values in the sequence.
-     */
-    void configure(ICLTensor *output, float start, float end, float step);
-    /** Initialize the kernel's output tensor, start, end and step of the sequence.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[out] output          Output tensor. Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32.
-     * @param[in]  start           The starting value of the sequence.
-     * @param[in]  end             The ending (not including) value of the sequence.
-     * @param[in]  step            The gap between each pair of values in the sequence.
-     */
-    void configure(const CLCompileContext &compile_context, ICLTensor *output, float start, float end, float step);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLRangeKernel
-     *
-     * @param[in] output Output tensor info. Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32.
-     * @param[in] start  The starting value of the sequence.
-     * @param[in] end    The ending (not including) value of the sequence.
-     * @param[in] step   The gap between each pair of values in the sequence.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *output, float start, float end, float step);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    float      _start;  /**< Start of sequence */
-    float      _end;    /**< End of sequence */
-    float      _step;   /**< Increment/step value */
-    ICLTensor *_output; /**< Destination tensor */
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLRANGEKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLReductionOperationKernel.h b/arm_compute/core/CL/kernels/CLReductionOperationKernel.h
deleted file mode 100644
index 0b0b4ae9b0..0000000000
--- a/arm_compute/core/CL/kernels/CLReductionOperationKernel.h
+++ /dev/null
@@ -1,99 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLREDUCTIONOPERATIONKERNEL_H
-#define ARM_COMPUTE_CLREDUCTIONOPERATIONKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the reduction operation kernel
- */
-class CLReductionOperationKernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLReductionOperationKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLReductionOperationKernel(const CLReductionOperationKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLReductionOperationKernel &operator=(const CLReductionOperationKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLReductionOperationKernel(CLReductionOperationKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLReductionOperationKernel &operator=(CLReductionOperationKernel &&) = default;
-    /** Default destructor */
-    ~CLReductionOperationKernel() = default;
-
-    /** Set the input and output tensors.
-     *
-     * @param[in]  input  Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/S32/F16/F32.
-     * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p input.
-     *                    Output will have the same number of dimensions as input.
-     * @param[in]  axis   Axis along which to reduce. Supported reduction axis : 0,1,2,3
-     * @param[in]  op     Reduction operation to perform. Operations supported: MEAN_SUM, PROD, SUM_SQUARE, SUM, MIN, MAX
-     * @param[in]  width  (Optional)  In case of x-axis we also need to provide the width of the input image.
-     */
-    void configure(const ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op, unsigned int width = 0);
-    /** Set the input and output tensors.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/S32/F16/F32.
-     * @param[out] output          Destination tensor. Data types and data layouts supported: Same as @p input.
-     *                             Output will have the same number of dimensions as input.
-     * @param[in]  axis            Axis along which to reduce. Supported reduction axis : 0,1,2,3
-     * @param[in]  op              Reduction operation to perform. Operations supported: MEAN_SUM, PROD, SUM_SQUARE, SUM, MIN, MAX
-     * @param[in]  width           (Optional)  In case of x-axis we also need to provide the width of the input image.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op, unsigned int width = 0);
-
-    /** Static function to check if given info will lead to a valid configuration of @ref CLReductionOperationKernel.
-     *
-     * @param[in] input  Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/S32/F16/F32.
-     * @param[in] output Destination tensor info. Data types and data layouts supported: Same as @p input.
-     *                   Output will have the same number of dimensions as input.
-     * @param[in] axis   Axis along which to reduce. Supported reduction axis : 0,1,2,3
-     * @param[in] op     Reduction operation to perform. Operations supported: MEAN_SUM, PROD, SUM_SQUARE, SUM, MIN, MAX
-     * @param[in] width  (Optional)  In case of x-axis we also need to provide the width of the input image.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op, unsigned int width = 0);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-    BorderSize border_size() const override;
-
-private:
-    const ICLTensor   *_input;
-    ICLTensor         *_output;
-    unsigned int       _reduction_axis;
-    ReductionOperation _op;
-    BorderSize         _border_size;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLREDUCTIONOPERATIONKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLRemapKernel.h b/arm_compute/core/CL/kernels/CLRemapKernel.h
deleted file mode 100644
index f3d1511905..0000000000
--- a/arm_compute/core/CL/kernels/CLRemapKernel.h
+++ /dev/null
@@ -1,81 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLREMAPKERNEL_H
-#define ARM_COMPUTE_CLREMAPKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel to perform a remap on a tensor */
-class CLRemapKernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLRemapKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLRemapKernel(const CLRemapKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLRemapKernel &operator=(const CLRemapKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLRemapKernel(CLRemapKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLRemapKernel &operator=(CLRemapKernel &&) = default;
-    /** Initialize the kernel's input, output and border mode.
-     *
-     * @param[in]  input            Source tensor. Data types supported: U8.
-     * @param[in]  map_x            Map for X coordinates. Data types supported: F32.
-     * @param[in]  map_y            Map for Y coordinates. Data types supported: F32.
-     * @param[out] output           Destination tensor. Data types supported: U8. All but the lowest two dimensions must be the same size as in the input tensor, i.e. remapping is only performed within the XY-plane.
-     * @param[in]  policy           The interpolation type.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const ICLTensor *input, const ICLTensor *map_x, const ICLTensor *map_y, ICLTensor *output, InterpolationPolicy policy, bool border_undefined);
-    /** Initialize the kernel's input, output and border mode.
-     *
-     * @param[in]  compile_context  The compile context to be used.
-     * @param[in]  input            Source tensor. Data types supported: U8.
-     * @param[in]  map_x            Map for X coordinates. Data types supported: F32.
-     * @param[in]  map_y            Map for Y coordinates. Data types supported: F32.
-     * @param[out] output           Destination tensor. Data types supported: U8. All but the lowest two dimensions must be the same size as in the input tensor, i.e. remapping is only performed within the XY-plane.
-     * @param[in]  policy           The interpolation type.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *map_x, const ICLTensor *map_y, ICLTensor *output, InterpolationPolicy policy, bool border_undefined);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-    BorderSize border_size() const override;
-
-private:
-    const ICLTensor *_input;
-    ICLTensor       *_output;
-    const ICLTensor *_map_x;
-    const ICLTensor *_map_y;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLREMAPKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLReorgLayerKernel.h b/arm_compute/core/CL/kernels/CLReorgLayerKernel.h
deleted file mode 100644
index 9c064858af..0000000000
--- a/arm_compute/core/CL/kernels/CLReorgLayerKernel.h
+++ /dev/null
@@ -1,90 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLREORGLAYERKERNEL_H
-#define ARM_COMPUTE_CLREORGLAYERKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel to perform a reorg layer */
-class CLReorgLayerKernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLReorgLayerKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers). */
-    CLReorgLayerKernel(const CLReorgLayerKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers). */
-    CLReorgLayerKernel &operator=(const CLReorgLayerKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLReorgLayerKernel(CLReorgLayerKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLReorgLayerKernel &operator=(CLReorgLayerKernel &&) = default;
-    /** Initialize the kernel's input, output.
-     *
-     * @param[in]  input  Source tensor. Data types supported: U8/S8/QASYMM8/QASYMM8_SIGNED/U16/S16/F16/U32/S32/F32.
-     * @param[out] output Destination tensor with tensor shape:
-     *                    [width_input / stride, height_input / stride, channels_input * stride * stride, batch_size]. This means the output has
-     *                    the same number of input elements. Data types supported: same as @p input.
-     * @param[in]  stride Stride value to use for reorganizing the values in the output tensor.
-     *                    It defines the spatial distance between 2 consecutive pixels in the x and y direction
-     */
-    void configure(const ICLTensor *input, ICLTensor *output, int32_t stride);
-    /** Initialize the kernel's input, output.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Source tensor. Data types supported: U8/S8/QASYMM8/QASYMM8_SIGNED/U16/S16/F16/U32/S32/F32.
-     * @param[out] output          Destination tensor with tensor shape:
-     *                             [width_input / stride, height_input / stride, channels_input * stride * stride, batch_size]. This means the output has
-     *                             the same number of input elements. Data types supported: same as @p input.
-     * @param[in]  stride          Stride value to use for reorganizing the values in the output tensor.
-     *                             It defines the spatial distance between 2 consecutive pixels in the x and y direction
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t stride);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLReorgLayerKernel
-     *
-     * @param[in] input  Source tensor. Data types supported: All.
-     * @param[in] output Destination tensor with tensor shape:
-     *                   [width_input / stride, height_input / stride, channels_input * stride * stride, batch_size]. This means the output has
-     *                   the same number of input elements. Data types supported: same as @p input. Data types supported: same as @p input.
-     * @param[in] stride Stride value to use for reorganizing the values in the output tensor
-     *                   It defines the spatial distance between 2 consecutive pixels in the x and y direction
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, int32_t stride);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_input;
-    ICLTensor       *_output;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLREORGLAYERKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLReshapeLayerKernel.h b/arm_compute/core/CL/kernels/CLReshapeLayerKernel.h
deleted file mode 100644
index 3ea74114d0..0000000000
--- a/arm_compute/core/CL/kernels/CLReshapeLayerKernel.h
+++ /dev/null
@@ -1,81 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLRESHAPELAYERKERNEL_H
-#define ARM_COMPUTE_CLRESHAPELAYERKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the kernel to perform tensor reshaping */
-class CLReshapeLayerKernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLReshapeLayerKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLReshapeLayerKernel(const CLReshapeLayerKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLReshapeLayerKernel &operator=(const CLReshapeLayerKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLReshapeLayerKernel(CLReshapeLayerKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLReshapeLayerKernel &operator=(CLReshapeLayerKernel &&) = default;
-    /** Default destructor */
-    ~CLReshapeLayerKernel() = default;
-    /** Set the input and output of the kernel
-     *
-     * @param[in]  input  Source tensor. Data type supported: All.
-     * @param[out] output Destination tensor. Data type supported: Same as @p input
-     */
-    void configure(const ICLTensor *input, ICLTensor *output);
-    /** Set the input and output of the kernel
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Source tensor. Data type supported: All.
-     * @param[out] output          Destination tensor. Data type supported: Same as @p input
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output);
-
-    /** Static function to check if given info will lead to a valid configuration of @ref CLReshapeLayerKernel
-     *
-     * @param[in] input  Source tensor info. Data type supported: All
-     * @param[in] output Destination tensor info. Data type supported: Same as @p input
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_input;  /**< Source tensor */
-    ICLTensor       *_output; /**< Destination tensor */
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLRESHAPELAYERKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLReverseKernel.h b/arm_compute/core/CL/kernels/CLReverseKernel.h
deleted file mode 100644
index e8f4507969..0000000000
--- a/arm_compute/core/CL/kernels/CLReverseKernel.h
+++ /dev/null
@@ -1,84 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLREVERSEKERNEL_H
-#define ARM_COMPUTE_CLREVERSEKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the reverse kernel */
-class CLReverseKernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLReverseKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLReverseKernel(const CLReverseKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLReverseKernel &operator=(const CLReverseKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLReverseKernel(CLReverseKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLReverseKernel &operator=(CLReverseKernel &&) = default;
-    /** Default destructor */
-    ~CLReverseKernel() = default;
-    /** Initialise the kernel's inputis and output
-     *
-     * @param[in]  input  Input tensor. Data types supported: All.
-     * @param[out] output Output tensor. Data type supported: Same as @p input
-     * @param[in]  axis   Axis tensor. Contains the indices of the dimensions to reverse. Data type supported: U32
-     */
-    void configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *axis);
-    /** Initialise the kernel's inputis and output
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Input tensor. Data types supported: All.
-     * @param[out] output          Output tensor. Data type supported: Same as @p input
-     * @param[in]  axis            Axis tensor. Contains the indices of the dimensions to reverse. Data type supported: U32
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *axis);
-
-    /** Static function to check if given info will lead to a valid configuration of @ref CLReverseKernel
-     *
-     * @param[in] input  Input tensor info. Data types supported: All.
-     * @param[in] output Output tensor info. Data type supported: Same as @p input
-     * @param[in] axis   Axis tensor info. Contains the indices of the dimensions to reverse. Data type supported: U32
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-public:
-    const ICLTensor *_input;
-    ICLTensor       *_output;
-    const ICLTensor *_axis;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLREVERSEKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLScaleKernel.h b/arm_compute/core/CL/kernels/CLScaleKernel.h
deleted file mode 100644
index 328578d88c..0000000000
--- a/arm_compute/core/CL/kernels/CLScaleKernel.h
+++ /dev/null
@@ -1,98 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLSCALEKERNEL_H
-#define ARM_COMPUTE_CLSCALEKERNEL_H
-
-#include "arm_compute/core/CL/ICLSimple2DKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the scale kernel */
-class CLScaleKernel : public ICLSimple2DKernel
-{
-public:
-    /** Initialise the kernel's inputs, output and interpolation policy
-     *
-     * @param[in]  input           Source tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/F16/F32
-     * @param[out] output          Destination tensor. Data types supported: Same as @p input
-     *                             All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
-     * @param[in]  policy          Interpolation type to use
-     * @param[in]  border_mode     Selected border mode.
-     * @param[in]  sampling_policy (Optional) Sampling policy used by the interpolation. Defaults to @ref SamplingPolicy::CENTER
-     * @param[in]  align_corners   (Optional) Align corners of input and output, only affecting bilinear policy with TOP_LEFT sampling policy. Defaults to false.
-     */
-    void configure(const ICLTensor *input, ICLTensor *output, InterpolationPolicy policy, BorderMode border_mode, SamplingPolicy sampling_policy = SamplingPolicy::CENTER, bool align_corners = false);
-    /** Initialise the kernel's inputs, output and interpolation policy
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Source tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/F16/F32
-     * @param[out] output          Destination tensor. Data types supported: Same as @p input
-     *                             All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
-     * @param[in]  policy          Interpolation type to use
-     * @param[in]  border_mode     Selected border mode.
-     * @param[in]  sampling_policy (Optional) Sampling policy used by the interpolation. Defaults to @ref SamplingPolicy::CENTER
-     * @param[in]  align_corners   (Optional) Align corners of input and output, only affecting bilinear policy with TOP_LEFT sampling policy. Defaults to false.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, InterpolationPolicy policy, BorderMode border_mode,
-                   SamplingPolicy sampling_policy = SamplingPolicy::CENTER, bool align_corners = false);
-
-    /** Static function to check if given info will lead to a valid configuration of @ref CLScaleKernel
-     *
-     * @param[in] input           Source tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/F16/F32
-     * @param[in] output          Destination tensor info. Data types supported: Same as @p input
-     *                            All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
-     * @param[in] policy          Interpolation type to use
-     * @param[in] border_mode     Selected border mode.
-     * @param[in] sampling_policy (Optional) Sampling policy used by the interpolation. Defaults to @ref SamplingPolicy::CENTER
-     * @param[in] align_corners   (Optional) Align corners of input and output, only affecting bilinear policy with TOP_LEFT sampling policy. Defaults to false.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, InterpolationPolicy policy, BorderMode border_mode, SamplingPolicy sampling_policy = SamplingPolicy::CENTER,
-                           bool align_corners = false);
-    /** Input tensor accessor.
-     *
-     * @return Pointer to input tensor.
-     */
-    const ICLTensor *input() const;
-    /** Output tensor accessor.
-     *
-     * @return Pointer to output tensor.
-     */
-    const ICLTensor *output() const;
-
-    // Inherited methods overridden:
-    BorderSize border_size() const override;
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-public:
-    InterpolationPolicy _interpolationPolicy = InterpolationPolicy::BILINEAR;
-    DataLayout          _data_layout         = DataLayout::UNKNOWN;
-    bool                _align_corners       = false;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLSCALEKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLScharr3x3Kernel.h b/arm_compute/core/CL/kernels/CLScharr3x3Kernel.h
deleted file mode 100644
index 209a150a67..0000000000
--- a/arm_compute/core/CL/kernels/CLScharr3x3Kernel.h
+++ /dev/null
@@ -1,97 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLSCHARR3X3KERNEL_H
-#define ARM_COMPUTE_CLSCHARR3X3KERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the kernel to run a 3x3 Scharr filter on a tensor.
- *
- * @f[
- *      \mathbf{G}_x=\begin{vmatrix}
- *      -3 & 0 & +3\\
- *      -10& 0 & +10\\
- *      -3 & 0 & +3
- *      \end{vmatrix}
- * @f]
- * @f[
- *      \mathbf{G}_y=\begin{vmatrix}
- *      -3 & -10 & -3\\
- *       0 & 0 & 0\\
- *      +3 & +10 & +3
- *      \end{vmatrix}
- * @f]
- */
-class CLScharr3x3Kernel : public ICLKernel
-{
-public:
-    /** Default constructor: initialize all the pointers to nullptr and parameters to zero. */
-    CLScharr3x3Kernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLScharr3x3Kernel(const CLScharr3x3Kernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLScharr3x3Kernel &operator=(const CLScharr3x3Kernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLScharr3x3Kernel(CLScharr3x3Kernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLScharr3x3Kernel &operator=(CLScharr3x3Kernel &&) = default;
-    /** Initialise the kernel's source, destination and border.
-     *
-     * @note At least one of output_x or output_y must be set.
-     *
-     * @param[in]  input            Source tensor. Data types supported: U8.
-     * @param[out] output_x         (Optional) Destination tensor for the X gradient, Data types supported: S16.
-     * @param[out] output_y         (Optional) Destination tensor for the Y gradient, Data types supported: S16.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined);
-    /** Initialise the kernel's source, destination and border.
-     *
-     * @note At least one of output_x or output_y must be set.
-     *
-     * @param[in]  compile_context  The compile context to be used.
-     * @param[in]  input            Source tensor. Data types supported: U8.
-     * @param[out] output_x         (Optional) Destination tensor for the X gradient, Data types supported: S16.
-     * @param[out] output_y         (Optional) Destination tensor for the Y gradient, Data types supported: S16.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-    BorderSize border_size() const override;
-
-private:
-    bool             _run_scharr_x; /**< Do we need to run Scharr X ? */
-    bool             _run_scharr_y; /**< Do we need to run Scharr Y ? */
-    const ICLTensor *_input;        /**< Input image */
-    ICLTensor       *_output_x;     /**< Output image for scharr X */
-    ICLTensor       *_output_y;     /**< Output image for scharr Y */
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLSCHARR3X3KERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLSelectKernel.h b/arm_compute/core/CL/kernels/CLSelectKernel.h
deleted file mode 100644
index 5cbd985cda..0000000000
--- a/arm_compute/core/CL/kernels/CLSelectKernel.h
+++ /dev/null
@@ -1,94 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLSELECTKERNEL_H
-#define ARM_COMPUTE_CLSELECTKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-// Forward declarations
-class ICLTensor;
-
-/** OpenCL interface for executing the select kernel
- *
- * Select is computed by:
- * @f[ output(i) = condition(i) ? x(i) : y(i) @f]
- **/
-class CLSelectKernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLSelectKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLSelectKernel(const CLSelectKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLSelectKernel &operator=(const CLSelectKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLSelectKernel(CLSelectKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLSelectKernel &operator=(CLSelectKernel &&) = default;
-    /** Default destructor */
-    ~CLSelectKernel() = default;
-    /** Initialise the kernel's inputs and output.
-     *
-     * @param[in]  c      Condition input tensor. Data types supported: U8.
-     * @param[in]  x      First input tensor. Data types supported: All.
-     * @param[out] y      Second input tensor. Data types supported: Same as @p x
-     * @param[in]  output Output tensor. Data types supported: Same as @p x.
-     */
-    void configure(const ICLTensor *c, const ICLTensor *x, const ICLTensor *y, ICLTensor *output);
-    /** Initialise the kernel's inputs and output.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  c               Condition input tensor. Data types supported: U8.
-     * @param[in]  x               First input tensor. Data types supported: All.
-     * @param[out] y               Second input tensor. Data types supported: Same as @p x
-     * @param[in]  output          Output tensor. Data types supported: Same as @p x.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *c, const ICLTensor *x, const ICLTensor *y, ICLTensor *output);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLSelectKernel
-     *
-     * @param[in] c      Condition input tensor. Data types supported: U8.
-     * @param[in] x      First input tensor. Data types supported: All.
-     * @param[in] y      Second input tensor. Data types supported: Same as @p x
-     * @param[in] output Output tensor. Data types supported: Same as @p x.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *c, const ITensorInfo *x, const ITensorInfo *y, const ITensorInfo *output);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_c;             /**< Condition tensor */
-    const ICLTensor *_x;             /**< Source tensor 1 */
-    const ICLTensor *_y;             /**< Source tensor 2 */
-    ICLTensor       *_output;        /**< Destination tensor */
-    bool             _has_same_rank; /**< Flag that indicates if condition tensor and other inputs have the same rank */
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLWHEREKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLSobel3x3Kernel.h b/arm_compute/core/CL/kernels/CLSobel3x3Kernel.h
deleted file mode 100644
index 4240fe80b3..0000000000
--- a/arm_compute/core/CL/kernels/CLSobel3x3Kernel.h
+++ /dev/null
@@ -1,83 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLSOBEL3X3KERNEL_H
-#define ARM_COMPUTE_CLSOBEL3X3KERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the kernel to run a 3x3 Sobel filter on a tensor. */
-class CLSobel3x3Kernel : public ICLKernel
-{
-public:
-    /** Default constructor: initialize all the pointers to nullptr and parameters to zero. */
-    CLSobel3x3Kernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLSobel3x3Kernel(const CLSobel3x3Kernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLSobel3x3Kernel &operator=(const CLSobel3x3Kernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLSobel3x3Kernel(CLSobel3x3Kernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLSobel3x3Kernel &operator=(CLSobel3x3Kernel &&) = default;
-    /** Default destructor */
-    ~CLSobel3x3Kernel() = default;
-    /** Initialise the kernel's source, destination and border.
-     *
-     * @note At least one of output_x or output_y must be set.
-     *
-     * @param[in]  input            Source tensor. Data types supported: U8.
-     * @param[out] output_x         (Optional) Destination tensor for the X gradient, Data types supported: S16.
-     * @param[out] output_y         (Optional) Destination tensor for the Y gradient, Data types supported: S16.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined);
-    /** Initialise the kernel's source, destination and border.
-     *
-     * @note At least one of output_x or output_y must be set.
-     *
-     * @param[in]  compile_context  The compile context to be used.
-     * @param[in]  input            Source tensor. Data types supported: U8.
-     * @param[out] output_x         (Optional) Destination tensor for the X gradient, Data types supported: S16.
-     * @param[out] output_y         (Optional) Destination tensor for the Y gradient, Data types supported: S16.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-    BorderSize border_size() const override;
-
-private:
-    const ICLTensor *_input;       /**< Input tensor */
-    ICLTensor       *_output_x;    /**< Output tensor for Sobel X */
-    ICLTensor       *_output_y;    /**< Output tensor for Sobel Y */
-    bool             _run_sobel_x; /**< Do we need to run Sobel X ? */
-    bool             _run_sobel_y; /**< Do we need to run Sobel Y ? */
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLSOBEL3X3KERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLSobel5x5Kernel.h b/arm_compute/core/CL/kernels/CLSobel5x5Kernel.h
deleted file mode 100644
index ef30f0ec93..0000000000
--- a/arm_compute/core/CL/kernels/CLSobel5x5Kernel.h
+++ /dev/null
@@ -1,139 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLSOBEL5X5KERNEL_H
-#define ARM_COMPUTE_CLSOBEL5X5KERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the kernel to run the horizontal pass of 5x5 Sobel filter on a tensor. */
-class CLSobel5x5HorKernel : public ICLKernel
-{
-public:
-    /** Default constructor: initialize all the pointers to nullptr and parameters to zero. */
-    CLSobel5x5HorKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLSobel5x5HorKernel(const CLSobel5x5HorKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLSobel5x5HorKernel &operator=(const CLSobel5x5HorKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLSobel5x5HorKernel(CLSobel5x5HorKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLSobel5x5HorKernel &operator=(CLSobel5x5HorKernel &&) = default;
-    /** Default destructor */
-    ~CLSobel5x5HorKernel() = default;
-
-    /** Initialise the kernel's source, destination and border.
-     *
-     * @note At least one of output_x or output_y must be set.
-     *
-     * @param[in]  input            Source tensor. Data types supported: U8.
-     * @param[out] output_x         (Optional) Destination tensor for the X gradient, Data types supported: S16.
-     * @param[out] output_y         (Optional) Destination tensor for the Y gradient, Data types supported: S16.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined);
-    /** Initialise the kernel's source, destination and border.
-     *
-     * @note At least one of output_x or output_y must be set.
-     *
-     * @param[in]  compile_context  The compile context to be used.
-     * @param[in]  input            Source tensor. Data types supported: U8.
-     * @param[out] output_x         (Optional) Destination tensor for the X gradient, Data types supported: S16.
-     * @param[out] output_y         (Optional) Destination tensor for the Y gradient, Data types supported: S16.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-    BorderSize border_size() const override;
-
-private:
-    const ICLTensor *_input;       /**< Input tensor */
-    ICLTensor       *_output_x;    /**< X output of horizontal pass */
-    ICLTensor       *_output_y;    /**< Y output of horizontal pass */
-    bool             _run_sobel_x; /**< Do we need to run Sobel X ? */
-    bool             _run_sobel_y; /**< Do we need to run Sobel Y ? */
-    BorderSize       _border_size; /**< Border size */
-};
-
-/** Interface for the kernel to run the vertical pass of 5x5 Sobel filter on a tensor. */
-class CLSobel5x5VertKernel : public ICLKernel
-{
-public:
-    /** Default constructor: initialize all the pointers to nullptr and parameters to zero. */
-    CLSobel5x5VertKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLSobel5x5VertKernel(const CLSobel5x5VertKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLSobel5x5VertKernel &operator=(const CLSobel5x5VertKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLSobel5x5VertKernel(CLSobel5x5VertKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLSobel5x5VertKernel &operator=(CLSobel5x5VertKernel &&) = default;
-    /** Default destructor */
-    ~CLSobel5x5VertKernel() = default;
-
-    /** Initialise the kernel's source, destination and border.
-     *
-     * @note At least one of output_x or output_y must be set and the corresponding input.
-     *
-     * @param[in]  input_x          (Optional) Input for X (X output of horizontal pass). Data types supported: S16.
-     * @param[in]  input_y          (Optional) Input for Y (Y output of horizontal pass). Data types supported: S16.
-     * @param[out] output_x         (Optional) Destination tensor for the X gradient, Data types supported: S16.
-     * @param[out] output_y         (Optional) Destination tensor for the Y gradient, Data types supported: S16.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const ICLTensor *input_x, const ICLTensor *input_y, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined);
-    /** Initialise the kernel's source, destination and border.
-     *
-     * @note At least one of output_x or output_y must be set and the corresponding input.
-     *
-     * @param[in]  compile_context  The compile context to be used.
-     * @param[in]  input_x          (Optional) Input for X (X output of horizontal pass). Data types supported: S16.
-     * @param[in]  input_y          (Optional) Input for Y (Y output of horizontal pass). Data types supported: S16.
-     * @param[out] output_x         (Optional) Destination tensor for the X gradient, Data types supported: S16.
-     * @param[out] output_y         (Optional) Destination tensor for the Y gradient, Data types supported: S16.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input_x, const ICLTensor *input_y, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-    BorderSize border_size() const override;
-
-private:
-    const ICLTensor *_input_x;     /**< X input (X output of the horizontal pass) */
-    const ICLTensor *_input_y;     /**< Y input (Y output of the horizontal pass) */
-    ICLTensor       *_output_x;    /**< X output of sobel */
-    ICLTensor       *_output_y;    /**< Y output of sobel */
-    bool             _run_sobel_x; /**< Do we need to run sobel X? */
-    bool             _run_sobel_y; /**< Do we need to run sobel Y? */
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLSOBEL5X5KERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLSobel7x7Kernel.h b/arm_compute/core/CL/kernels/CLSobel7x7Kernel.h
deleted file mode 100644
index 4eda5a40d4..0000000000
--- a/arm_compute/core/CL/kernels/CLSobel7x7Kernel.h
+++ /dev/null
@@ -1,139 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLSOBEL7X7KERNEL_H
-#define ARM_COMPUTE_CLSOBEL7X7KERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the kernel to run the horizontal pass of 7x7 Sobel filter on a tensor. */
-class CLSobel7x7HorKernel : public ICLKernel
-{
-public:
-    /** Default constructor: initialize all the pointers to nullptr and parameters to zero. */
-    CLSobel7x7HorKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLSobel7x7HorKernel(const CLSobel7x7HorKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLSobel7x7HorKernel &operator=(const CLSobel7x7HorKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLSobel7x7HorKernel(CLSobel7x7HorKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLSobel7x7HorKernel &operator=(CLSobel7x7HorKernel &&) = default;
-    /** Default destructor */
-    ~CLSobel7x7HorKernel() = default;
-
-    /** Initialise the kernel's source, destination and border.
-     *
-     * @note At least one of output_x or output_y must be set.
-     *
-     * @param[in]  input            Source tensor. Data types supported: U8.
-     * @param[out] output_x         (Optional) Destination tensor for the X gradient, Data types supported: S32.
-     * @param[out] output_y         (Optional) Destination tensor for the Y gradient, Data types supported: S32.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined);
-    /** Initialise the kernel's source, destination and border.
-     *
-     * @note At least one of output_x or output_y must be set.
-     *
-     * @param[in]  compile_context  The compile context to be used.
-     * @param[in]  input            Source tensor. Data types supported: U8.
-     * @param[out] output_x         (Optional) Destination tensor for the X gradient, Data types supported: S32.
-     * @param[out] output_y         (Optional) Destination tensor for the Y gradient, Data types supported: S32.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-    BorderSize border_size() const override;
-
-private:
-    const ICLTensor *_input;       /**< Input tensor */
-    ICLTensor       *_output_x;    /**< X output of horizontal pass */
-    ICLTensor       *_output_y;    /**< Y output of horizontal pass */
-    bool             _run_sobel_x; /**< Do we need to run Sobel X ? */
-    bool             _run_sobel_y; /**< Do we need to run Sobel Y ? */
-    BorderSize       _border_size; /**< Border size */
-};
-
-/** Interface for the kernel to run the vertical pass of 7x7 Sobel filter on a tensor. */
-class CLSobel7x7VertKernel : public ICLKernel
-{
-public:
-    /** Default constructor: initialize all the pointers to nullptr and parameters to zero. */
-    CLSobel7x7VertKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLSobel7x7VertKernel(const CLSobel7x7VertKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLSobel7x7VertKernel &operator=(const CLSobel7x7VertKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLSobel7x7VertKernel(CLSobel7x7VertKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLSobel7x7VertKernel &operator=(CLSobel7x7VertKernel &&) = default;
-    /** Default destructor */
-    ~CLSobel7x7VertKernel() = default;
-
-    /** Initialise the kernel's source, destination and border.
-     *
-     * @note At least one of output_x or output_y must be set and the corresponding input.
-     *
-     * @param[in]  input_x          (Optional) Input for X (X output of horizontal pass). Data types supported: S32.
-     * @param[in]  input_y          (Optional) Input for Y (Y output of horizontal pass). Data types supported: S32.
-     * @param[out] output_x         (Optional) Destination tensor for the X gradient, Data types supported: S32.
-     * @param[out] output_y         (Optional) Destination tensor for the Y gradient, Data types supported: S32.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const ICLTensor *input_x, const ICLTensor *input_y, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined);
-    /** Initialise the kernel's source, destination and border.
-     *
-     * @note At least one of output_x or output_y must be set and the corresponding input.
-     *
-     * @param[in]  compile_context  The compile context to be used.
-     * @param[in]  input_x          (Optional) Input for X (X output of horizontal pass). Data types supported: S32.
-     * @param[in]  input_y          (Optional) Input for Y (Y output of horizontal pass). Data types supported: S32.
-     * @param[out] output_x         (Optional) Destination tensor for the X gradient, Data types supported: S32.
-     * @param[out] output_y         (Optional) Destination tensor for the Y gradient, Data types supported: S32.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input_x, const ICLTensor *input_y, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-    BorderSize border_size() const override;
-
-private:
-    const ICLTensor *_input_x;     /**< X input (X output of the horizontal pass) */
-    const ICLTensor *_input_y;     /**< Y input (Y output of the horizontal pass) */
-    ICLTensor       *_output_x;    /**< X output of sobel */
-    ICLTensor       *_output_y;    /**< Y output of sobel */
-    bool             _run_sobel_x; /**< Do we need to run sobel X? */
-    bool             _run_sobel_y; /**< Do we need to run sobel Y? */
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLSOBEL7X7KERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLSoftmaxLayerKernel.h b/arm_compute/core/CL/kernels/CLSoftmaxLayerKernel.h
deleted file mode 100644
index b174f493b5..0000000000
--- a/arm_compute/core/CL/kernels/CLSoftmaxLayerKernel.h
+++ /dev/null
@@ -1,241 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLSOFTMAXLAYERKERNEL_H
-#define ARM_COMPUTE_CLSOFTMAXLAYERKERNEL_H
-
-#include "arm_compute/core/CL/ICLSimple3DKernel.h"
-#include "arm_compute/core/KernelDescriptors.h"
-
-#include <tuple>
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the identifying the max value of 1D Logits */
-class CLLogits1DMaxKernel : public ICLSimple3DKernel
-{
-public:
-    /** Set the input and output tensors.
-     *
-     * @param[in]  input  Source tensor. Data types supported: QASYMM8/F16/F32
-     * @param[out] output Destination tensor. Data types supported: same as @p input
-     */
-    void configure(const ICLTensor *input, ICLTensor *output);
-    /** Set the input and output tensors.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Source tensor. Data types supported: QASYMM8/F16/F32
-     * @param[out] output          Destination tensor. Data types supported: same as @p input
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLLogits1DMaxKernel
-     *
-     * @param[in] input  Source tensor. Data types supported: QASYMM8/F16/F32
-     * @param[in] output Destination tensor. Data types supported: same as @p input
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output);
-};
-
-/** Interface for shifting, exponentiating and summing the logits */
-class CLLogits1DShiftExpSumKernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLLogits1DShiftExpSumKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLLogits1DShiftExpSumKernel(const CLLogits1DShiftExpSumKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLLogits1DShiftExpSumKernel &operator=(const CLLogits1DShiftExpSumKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLLogits1DShiftExpSumKernel(CLLogits1DShiftExpSumKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLLogits1DShiftExpSumKernel &operator=(CLLogits1DShiftExpSumKernel &&) = default;
-    /** Set the input and output tensors.
-     *
-     * @param[in]  input  Source tensor. Data types supported: QASYMM8/F16/F32
-     * @param[in]  max    Max values tensor. Data types supported: same as @p input
-     * @param[out] output Destination tensor. Data types supported: S32 for QASYMM8 @p input, or same as @p input
-     * @param[out] sum    Sum of 1D logits tensor. Data types supported: S32 for QASYMM8 @p input, or same as @p input
-     * @param[in]  beta   (Optional) A scaling factor for the exponent. Defaults to 1.0
-     */
-    void configure(const ICLTensor *input, const ICLTensor *max, ICLTensor *output, ICLTensor *sum, float beta = 1.0f);
-    /** Set the input and output tensors.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Source tensor. Data types supported: QASYMM8/F16/F32
-     * @param[in]  max             Max values tensor. Data types supported: same as @p input
-     * @param[out] output          Destination tensor. Data types supported: S32 for QASYMM8 @p input, or same as @p input
-     * @param[out] sum             Sum of 1D logits tensor. Data types supported: S32 for QASYMM8 @p input, or same as @p input
-     * @param[in]  beta            (Optional) A scaling factor for the exponent. Defaults to 1.0
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *max, ICLTensor *output, ICLTensor *sum, float beta = 1.0f);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLLogits1DShiftExpSumKernel
-     *
-     * @param[in] input  Source tensor. Data types supported: QASYMM8/F16/F32
-     * @param[in] max    Max values tensor. Data types supported: same as @p input
-     * @param[in] output Destination tensor. Data types supported: S32 for QASYMM8 @p input, or same as @p input
-     * @param[in] sum    Sum of 1D logits tensor. Data types supported: S32 for QASYMM8 @p input, or same as @p input
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *max, const ITensorInfo *output, const ITensorInfo *sum);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_input;
-    const ICLTensor *_max;
-    ICLTensor       *_output;
-    ICLTensor       *_sum;
-};
-
-/** Interface for max, shifting, exponentiating and summing the logits */
-class CLLogits1DMaxShiftExpSumKernel : public ICLKernel
-{
-public:
-    /** Info for whether a parallel reduction will be run and the vector size of the execution. */
-    using ParallelReductionInfo = std::tuple<bool, unsigned int>;
-
-public:
-    /** Default constructor */
-    CLLogits1DMaxShiftExpSumKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLLogits1DMaxShiftExpSumKernel(const CLLogits1DMaxShiftExpSumKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLLogits1DMaxShiftExpSumKernel &operator=(const CLLogits1DMaxShiftExpSumKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLLogits1DMaxShiftExpSumKernel(CLLogits1DMaxShiftExpSumKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLLogits1DMaxShiftExpSumKernel &operator=(CLLogits1DMaxShiftExpSumKernel &&) = default;
-    /** Set the input and output tensors.
-     *
-     * @param[in]     input  Source tensor. Data types supported: F16/F32
-     * @param[in,out] max    Max values tensor. Data types supported: same as @p input
-     * @param[out]    output Destination tensor. Data types supported: same as @p input
-     * @param[out]    sum    Sum of 1D logits tensor. Data types supported: same as @p input
-     * @param[in]     info   Contains information consumed by kernels for softmax described in @ref SoftmaxKernelInfo.
-     */
-    void configure(const ICLTensor *input, ICLTensor *max, ICLTensor *output, ICLTensor *sum, const SoftmaxKernelInfo &info);
-    /** Set the input and output tensors.
-     *
-     * @param[in]     compile_context The compile context to be used.
-     * @param[in]     input           Source tensor. Data types supported: F16/F32
-     * @param[in,out] max             Max values tensor. Data types supported: same as @p input
-     * @param[out]    output          Destination tensor. Data types supported: same as @p input
-     * @param[out]    sum             Sum of 1D logits tensor. Data types supported: same as @p input
-     * @param[in]     info            Contains information consumed by kernels for softmax described in @ref SoftmaxKernelInfo.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *max, ICLTensor *output, ICLTensor *sum, const SoftmaxKernelInfo &info);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLLogits1DMaxShiftExpSumKernel
-     *
-     * @param[in] input  Source tensor. Data types supported: F16/F32
-     * @param[in] max    Max values tensor. Data types supported: same as @p input
-     * @param[in] output Destination tensor. Data types supported: same as @p input
-     * @param[in] sum    Sum of 1D logits tensor. Data types supported: same as @p input
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *max, const ITensorInfo *output, const ITensorInfo *sum);
-    /** Checks if the given size is eligible for parallel reduction
-     *
-     * @note  Serial reduction is launched for width < (_grid_size * _serial_vector_size).
-     * @note  Parallel reduction is launched for width >= (_grid_size * _serial_vector_size) and vector_size is forced to 4.
-     *
-     * @param[in] size Size to check
-     *
-     * @return A two-element tuple where the first element is a boolean specifying if a parallel reduction will be run,
-     *         while the second element is the vector size of the execution.
-     */
-    static ParallelReductionInfo is_parallel_reduction(size_t size);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_input;
-    ICLTensor       *_max;
-    ICLTensor       *_output;
-    ICLTensor       *_sum;
-
-private:
-    static const unsigned int _grid_size;
-    static const unsigned int _serial_vector_size;
-    static const unsigned int _parallel_vector_size;
-};
-/** Interface for calculating the final step of the Softmax Layer where each logit value is multiplied by the inverse of the sum of the logits. */
-class CLLogits1DNormKernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLLogits1DNormKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLLogits1DNormKernel(const CLLogits1DNormKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLLogits1DNormKernel &operator=(const CLLogits1DNormKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLLogits1DNormKernel(CLLogits1DNormKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLLogits1DNormKernel &operator=(CLLogits1DNormKernel &&) = default;
-    /** Set the input and output tensors.
-     *
-     * @param[in]  input  Source tensor. Data types supported: S32/F16/F32
-     * @param[in]  sum    Sum tensor. Dimensions should be dim(input)-1. Data types supported: same as @p input
-     * @param[out] output Destination tensor. Data types supported: QASYMM8 for S32 @p input, or same as @p input
-     * @param[in]  info   Contains information consumed by kernels for softmax described in @ref SoftmaxKernelInfo.
-     */
-    void configure(const ICLTensor *input, const ICLTensor *sum, ICLTensor *output, const SoftmaxKernelInfo &info);
-    /** Set the input and output tensors.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Source tensor. Data types supported: S32/F16/F32
-     * @param[in]  sum             Sum tensor. Dimensions should be dim(input)-1. Data types supported: same as @p input
-     * @param[out] output          Destination tensor. Data types supported: QASYMM8 for S32 @p input, or same as @p input
-     * @param[in]  info            Contains information consumed by kernels for softmax described in @ref SoftmaxKernelInfo.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *sum, ICLTensor *output, const SoftmaxKernelInfo &info);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLLogits1DNormKernel
-     *
-     * @param[in] input  Source tensor. Data types supported: S32/F16/F32
-     * @param[in] sum    Sum tensor. Dimensions should be dim(input)-1. Data types supported: same as @p input
-     * @param[in] output Destination tensor. Data types supported: QASYMM8 for S32 @p input, or same as @p input
-     * @param[in] info   Contains information consumed by kernels for softmax described in @ref SoftmaxKernelInfo.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, const SoftmaxKernelInfo &info);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_input;
-    const ICLTensor *_sum;
-    ICLTensor       *_output;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLSOFTMAXLAYERKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLSpaceToBatchLayerKernel.h b/arm_compute/core/CL/kernels/CLSpaceToBatchLayerKernel.h
deleted file mode 100644
index 799b7b16c3..0000000000
--- a/arm_compute/core/CL/kernels/CLSpaceToBatchLayerKernel.h
+++ /dev/null
@@ -1,121 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLSPACETOBATCHLAYERKERNEL_H
-#define ARM_COMPUTE_CLSPACETOBATCHLAYERKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the space to batch kernel */
-class CLSpaceToBatchLayerKernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLSpaceToBatchLayerKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLSpaceToBatchLayerKernel(const CLSpaceToBatchLayerKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLSpaceToBatchLayerKernel &operator=(const CLSpaceToBatchLayerKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLSpaceToBatchLayerKernel(CLSpaceToBatchLayerKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLSpaceToBatchLayerKernel &operator=(CLSpaceToBatchLayerKernel &&) = default;
-    /** Default destructor */
-    ~CLSpaceToBatchLayerKernel() = default;
-    /** Initialise the kernel's inputs and output.
-     *
-     * @param[in]  input       Tensor input. Supported tensor rank: 4. Data types supported: All.
-     * @param[in]  block_shape 1-D tensor with shape [M]. Data types supported: S32
-     * @param[in]  paddings    2-D tensor with shape [2, M]. Data types supported: S32
-     * @param[out] output      Tensor output. Data types supported: same as @p input
-     */
-    void configure(const ICLTensor *input, const ICLTensor *block_shape, const ICLTensor *paddings, ICLTensor *output);
-    /** Initialise the kernel's inputs and output.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Tensor input. Supported tensor rank: 4. Data types supported: All.
-     * @param[in]  block_shape     1-D tensor with shape [M]. Data types supported: S32
-     * @param[in]  paddings        2-D tensor with shape [2, M]. Data types supported: S32
-     * @param[out] output          Tensor output. Data types supported: same as @p input
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *block_shape, const ICLTensor *paddings, ICLTensor *output);
-    /** Initialise the kernel's input and output. (Static block shape and paddings)
-     *
-     * @param[in]  input         Tensor input. Supported tensor rank: 4. Data types supported: All.
-     * @param[in]  block_shape_x Block shape x value.
-     * @param[in]  block_shape_y Block shape y value.
-     * @param[in]  padding_left  The left padding of the output tensor.
-     * @param[in]  padding_right The right padding of the output tensor.
-     * @param[out] output        Tensor output. Data types supported: same as @p input
-     */
-    void configure(const ICLTensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, ICLTensor *output);
-    /** Initialise the kernel's input and output. (Static block shape and paddings)
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Tensor input. Supported tensor rank: 4. Data types supported: All.
-     * @param[in]  block_shape_x   Block shape x value.
-     * @param[in]  block_shape_y   Block shape y value.
-     * @param[in]  padding_left    The left padding of the output tensor.
-     * @param[in]  padding_right   The right padding of the output tensor.
-     * @param[out] output          Tensor output. Data types supported: same as @p input
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, ICLTensor *output);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLSpaceToBatchLayerKernel
-     *
-     * @param[in] input       Tensor input. Supported tensor rank: 4. Data types supported: All.
-     * @param[in] block_shape 1-D tensor with shape [M]. Data types supported: S32
-     * @param[in] paddings    2-D tensor with shape [2, M]. Data types supported: S32
-     * @param[in] output      Tensor output. Data types supported: same as @p input
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *paddings, const ITensorInfo *output);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLSpaceToBatchLayerKernel (Static block shape and paddings)
-     *
-     * @param[in] input         Tensor input. Supported tensor rank: 4. Data types supported: All.
-     * @param[in] block_shape_x Block shape x value.
-     * @param[in] block_shape_y Block shape y value.
-     * @param[in] padding_left  The left padding of the output tensor.
-     * @param[in] padding_right The right padding of the output tensor.
-     * @param[in] output        Tensor output. Data types supported: same as @p input
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, const ITensorInfo *output);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_input;       /**< Source tensor */
-    const ICLTensor *_block_shape; /**< Block shape tensor */
-    const ICLTensor *_paddings;    /**< Paddings tensor */
-    ICLTensor       *_output;      /**< Destination tensor */
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLSPACETOBATCHLAYERKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLSpaceToDepthLayerKernel.h b/arm_compute/core/CL/kernels/CLSpaceToDepthLayerKernel.h
deleted file mode 100644
index f2371e7d87..0000000000
--- a/arm_compute/core/CL/kernels/CLSpaceToDepthLayerKernel.h
+++ /dev/null
@@ -1,84 +0,0 @@
-/*
- * Copyright (c) 2019-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLSPACETODEPTHLAYERKERNEL_H
-#define ARM_COMPUTE_CLSPACETODEPTHLAYERKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the space to depth kernel */
-class CLSpaceToDepthLayerKernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLSpaceToDepthLayerKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLSpaceToDepthLayerKernel(const CLSpaceToDepthLayerKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLSpaceToDepthLayerKernel &operator=(const CLSpaceToDepthLayerKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLSpaceToDepthLayerKernel(CLSpaceToDepthLayerKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLSpaceToDepthLayerKernel &operator=(CLSpaceToDepthLayerKernel &&) = default;
-    /** Default destructor */
-    ~CLSpaceToDepthLayerKernel() = default;
-    /** Initialise the kernel's inputs and output.
-     *
-     * @param[in]  input       Tensor input. Supported tensor rank: 4. Data types supported: All.
-     * @param[out] output      Tensor output. Data types supported: same as @p input
-     * @param[in]  block_shape Block shape value.
-     */
-    void configure(const ICLTensor *input, ICLTensor *output, int32_t block_shape);
-    /** Initialise the kernel's inputs and output.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Tensor input. Supported tensor rank: 4. Data types supported: All.
-     * @param[out] output          Tensor output. Data types supported: same as @p input
-     * @param[in]  block_shape     Block shape value.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t block_shape);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLSpaceToDepthLayerKernel.
-     *
-     * @param[in] input       Tensor input info. Supported tensor rank: 4. Data types supported: All.
-     * @param[in] output      Tensor output info. Data types supported: same as @p input
-     * @param[in] block_shape Block shape value.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_input;       /**< Source tensor */
-    ICLTensor       *_output;      /**< Destination tensor */
-    int32_t          _block_shape; /**< Block shape */
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLSPACETODEPTHLAYERKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLStackLayerKernel.h b/arm_compute/core/CL/kernels/CLStackLayerKernel.h
deleted file mode 100644
index e11c0a30d6..0000000000
--- a/arm_compute/core/CL/kernels/CLStackLayerKernel.h
+++ /dev/null
@@ -1,101 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef ARM_COMPUTE_CLSTACKLAYERKERNEL_H
-#define ARM_COMPUTE_CLSTACKLAYERKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel to stacks a rank-R tensor into one with rank-(R+1) along the axis dimension.*/
-class CLStackLayerKernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLStackLayerKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLStackLayerKernel(const CLStackLayerKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLStackLayerKernel &operator=(const CLStackLayerKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLStackLayerKernel(CLStackLayerKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLStackLayerKernel &operator=(CLStackLayerKernel &&) = default;
-    /** Default destructor */
-    ~CLStackLayerKernel() = default;
-    /** Initialise the kernel's inputs and output
-     *
-     * @note Supported input tensor rank: up to 4
-     *
-     * @param[in]  input       Input tensor. Data types supported: All.
-     * @param[in]  axis        The dimension to stack the tensors along. It must be smaller than the number of input dimensions.
-     * @param[in]  idx_input   Index of the input tensor in the list of tensors to stack.
-     *                         All tensors in the list must have the same shape
-     * @param[in]  num_tensors Number of tensors to stack
-     * @param[out] output      Output tensor. Data types supported: Same as @p input.
-     *
-     */
-    void configure(const ICLTensor *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, ICLTensor *output);
-    /** Initialise the kernel's inputs and output
-     *
-     * @note Supported input tensor rank: up to 4
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Input tensor. Data types supported: All.
-     * @param[in]  axis            The dimension to stack the tensors along. It must be smaller than the number of input dimensions.
-     * @param[in]  idx_input       Index of the input tensor in the list of tensors to stack.
-     *                             All tensors in the list must have the same shape
-     * @param[in]  num_tensors     Number of tensors to stack
-     * @param[out] output          Output tensor. Data types supported: Same as @p input.
-     *
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, ICLTensor *output);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLStackLayerKernel
-     *
-     * @note Supported input tensor rank: up to 4
-     *
-     * @param[in] input       Input tensor info. Data types supported: All.
-     * @param[in] axis        The dimension to stack the tensors along. It must be smaller than the number of input dimensions.
-     * @param[in] idx_input   Index of the input tensor in the list of tensors to stack
-     *                        All tensors in the list must have the same shape
-     * @param[in] num_tensors Number of tensors to stack
-     * @param[in] output      Output tensor info. Data types supported: Same as @p input.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, const ITensorInfo *output);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_input;
-    ICLTensor       *_output;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLSTACKLAYERKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLStridedSliceKernel.h b/arm_compute/core/CL/kernels/CLStridedSliceKernel.h
deleted file mode 100644
index ebe1b38878..0000000000
--- a/arm_compute/core/CL/kernels/CLStridedSliceKernel.h
+++ /dev/null
@@ -1,115 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_STRIDED_SLICE_KERNEL_H
-#define ARM_COMPUTE_CL_STRIDED_SLICE_KERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/core/Types.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-// Forward declarations
-class ICLTensor;
-
-/** Interface for the kernel to perform tensor strided slicing */
-class CLStridedSliceKernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLStridedSliceKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLStridedSliceKernel(const CLStridedSliceKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLStridedSliceKernel &operator=(const CLStridedSliceKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLStridedSliceKernel(CLStridedSliceKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLStridedSliceKernel &operator=(CLStridedSliceKernel &&) = default;
-    /** Default destructor */
-    ~CLStridedSliceKernel() = default;
-    /** Configure kernel
-     *
-     * @note Supported tensor rank: up to 4
-     *
-     * @param[in]  input            Source tensor. Data type supported: All.
-     * @param[out] output           Destination tensor. Data type supported: Same as @p input
-     * @param[in]  starts           The starts of the dimensions of the input tensor to be sliced. The length must be of rank(input).
-     * @param[in]  ends             The ends of the dimensions of the input tensor to be sliced. The length must be of rank(input).
-     * @param[in]  strides          The strides of the dimensions of the input tensor to be sliced. The length must be of rank(input).
-     * @param[in]  begin_mask       If the ith bit of begin_mask is set, starts[i] is ignored and the fullest possible range in that dimension is used instead.
-     * @param[in]  end_mask         If the ith bit of end_mask is set, ends[i] is ignored and the fullest possible range in that dimension is used instead.
-     * @param[in]  shrink_axis_mask If the ith bit of shrink_axis_mask is set, it implies that the ith specification shrinks the dimensionality by 1.
-     *                              A slice of size 1 starting from starts[i] in the dimension must be preserved.
-     */
-    void configure(const ICLTensor *input, ICLTensor *output,
-                   const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
-                   int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask);
-    /** Configure kernel
-     *
-     * @note Supported tensor rank: up to 4
-     *
-     * @param[in]  compile_context  The compile context to be used.
-     * @param[in]  input            Source tensor. Data type supported: All.
-     * @param[out] output           Destination tensor. Data type supported: Same as @p input
-     * @param[in]  starts           The starts of the dimensions of the input tensor to be sliced. The length must be of rank(input).
-     * @param[in]  ends             The ends of the dimensions of the input tensor to be sliced. The length must be of rank(input).
-     * @param[in]  strides          The strides of the dimensions of the input tensor to be sliced. The length must be of rank(input).
-     * @param[in]  begin_mask       If the ith bit of begin_mask is set, starts[i] is ignored and the fullest possible range in that dimension is used instead.
-     * @param[in]  end_mask         If the ith bit of end_mask is set, ends[i] is ignored and the fullest possible range in that dimension is used instead.
-     * @param[in]  shrink_axis_mask If the ith bit of shrink_axis_mask is set, it implies that the ith specification shrinks the dimensionality by 1.
-     *                              A slice of size 1 starting from starts[i] in the dimension must be preserved.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output,
-                   const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
-                   int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask);
-
-    /** Static function to check if given info will lead to a valid configuration of @ref CLStridedSliceKernel
-     *
-     * @note Supported tensor rank: up to 4
-     *
-     * @param[in] input            Source tensor. Data type supported: All.
-     * @param[in] output           Destination tensor. Data type supported: Same as @p input
-     * @param[in] starts           The starts of the dimensions of the input tensor to be sliced. The length must be of rank(input).
-     * @param[in] ends             The ends of the dimensions of the input tensor to be sliced. The length must be of rank(input).
-     * @param[in] strides          The strides of the dimensions of the input tensor to be sliced. The length must be of rank(input).
-     * @param[in] begin_mask       If the ith bit of begin_mask is set, starts[i] is ignored and the fullest possible range in that dimension is used instead.
-     * @param[in] end_mask         If the ith bit of end_mask is set, ends[i] is ignored and the fullest possible range in that dimension is used instead.
-     * @param[in] shrink_axis_mask If the ith bit of shrink_axis_mask is set, it implies that the ith specification shrinks the dimensionality by 1.
-     *                             A slice of size 1 starting from starts[i] in the dimension must be preserved.
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output,
-                           const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
-                           int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_input;  /**< Source tensor */
-    ICLTensor       *_output; /**< Destination tensor */
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CL_STRIDED_SLICE_KERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLTableLookupKernel.h b/arm_compute/core/CL/kernels/CLTableLookupKernel.h
deleted file mode 100644
index 24e333f164..0000000000
--- a/arm_compute/core/CL/kernels/CLTableLookupKernel.h
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLTABLELOOKUPKERNEL_H
-#define ARM_COMPUTE_CLTABLELOOKUPKERNEL_H
-
-#include "arm_compute/core/CL/ICLSimple2DKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-class ICLLut;
-
-/** Interface for the kernel to perform table lookup calculations. */
-class CLTableLookupKernel : public ICLSimple2DKernel
-{
-public:
-    /** Initialise the kernel's input, lut and output.
-     *
-     * @param[in]  input  An input tensor. Data types supported: U8, S16.
-     * @param[in]  lut    The input LUT. Data types supported: U8, S16.
-     * @param[out] output The output tensor. Data types supported: U8, S16.
-     */
-    void configure(const ICLTensor *input, const ICLLut *lut, ICLTensor *output);
-    /** Initialise the kernel's input, lut and output.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           An input tensor. Data types supported: U8, S16.
-     * @param[in]  lut             The input LUT. Data types supported: U8, S16.
-     * @param[out] output          The output tensor. Data types supported: U8, S16.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLLut *lut, ICLTensor *output);
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLTABLELOOKUPKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLThresholdKernel.h b/arm_compute/core/CL/kernels/CLThresholdKernel.h
deleted file mode 100644
index 3db48706a3..0000000000
--- a/arm_compute/core/CL/kernels/CLThresholdKernel.h
+++ /dev/null
@@ -1,69 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLTHRESHOLDKERNEL_H
-#define ARM_COMPUTE_CLTHRESHOLDKERNEL_H
-
-#include "arm_compute/core/CL/ICLSimple2DKernel.h"
-#include "arm_compute/core/Types.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the thresholding kernel.
- *
- */
-class CLThresholdKernel : public ICLSimple2DKernel
-{
-public:
-    /**Initialise the kernel's input, output and threshold parameters.
-     *
-     * @param[in]  input       An input tensor. Data types supported: U8
-     * @param[out] output      The output tensor. Data types supported: U8.
-     * @param[in]  threshold   Threshold. When the threshold type is RANGE, this is used as the lower threshold.
-     * @param[in]  false_value value to set when the condition is not respected.
-     * @param[in]  true_value  value to set when the condition is respected.
-     * @param[in]  type        Thresholding type. Either RANGE or BINARY.
-     * @param[in]  upper       Upper threshold. Only used when the thresholding type is RANGE.
-     */
-    void configure(const ICLTensor *input, ICLTensor *output, uint8_t threshold,
-                   uint8_t false_value, uint8_t true_value, ThresholdType type, uint8_t upper);
-    /**Initialise the kernel's input, output and threshold parameters.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           An input tensor. Data types supported: U8
-     * @param[out] output          The output tensor. Data types supported: U8.
-     * @param[in]  threshold       Threshold. When the threshold type is RANGE, this is used as the lower threshold.
-     * @param[in]  false_value     value to set when the condition is not respected.
-     * @param[in]  true_value      value to set when the condition is respected.
-     * @param[in]  type            Thresholding type. Either RANGE or BINARY.
-     * @param[in]  upper           Upper threshold. Only used when the thresholding type is RANGE.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, uint8_t threshold,
-                   uint8_t false_value, uint8_t true_value, ThresholdType type, uint8_t upper);
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NETHRESHOLDKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLTileKernel.h b/arm_compute/core/CL/kernels/CLTileKernel.h
deleted file mode 100644
index 68f3c929a6..0000000000
--- a/arm_compute/core/CL/kernels/CLTileKernel.h
+++ /dev/null
@@ -1,88 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLTILEKERNEL_H
-#define ARM_COMPUTE_CLTILEKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel to perform a Tile operation */
-class CLTileKernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLTileKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLTileKernel(const CLTileKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLTileKernel &operator=(const CLTileKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLTileKernel(CLTileKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLTileKernel &operator=(CLTileKernel &&) = default;
-    /** Default destructor */
-    ~CLTileKernel() = default;
-    /** Set the source, destination of the kernel
-     *
-     * @param[in]  input     Source tensor. Data type supported: All.
-     * @param[in]  multiples Contains the number of times the input tensor should be replicated on the given dimension.
-     *                       Cannot have more than 4 elements (tiling in dimensions greater than 4 is not supported).
-     * @param[out] output    Destination tensor. Same as @p input
-     *
-     */
-    void configure(const ICLTensor *input, ICLTensor *output, const Multiples &multiples);
-    /** Set the source, destination of the kernel
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Source tensor. Data type supported: All.
-     * @param[in]  multiples       Contains the number of times the input tensor should be replicated on the given dimension.
-     *                             Cannot have more than 4 elements (tiling in dimensions greater than 4 is not supported).
-     * @param[out] output          Destination tensor. Same as @p input
-     *
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const Multiples &multiples);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLTileKernel
-     *
-     * @param[in] input     Source tensor info. Data type supported: All.
-     * @param[in] multiples Contains the number of times the input tensor should be replicated on the given dimension.
-     *                      Cannot have more than 4 elements (tiling in dimensions greater than 4 is not supported).
-     * @param[in] output    Destination tensor info. Same as @p input
-     *
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const Multiples &multiples);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_input;
-    ICLTensor       *_output;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLTILEKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLTransposeKernel.h b/arm_compute/core/CL/kernels/CLTransposeKernel.h
deleted file mode 100644
index 09c9e3babf..0000000000
--- a/arm_compute/core/CL/kernels/CLTransposeKernel.h
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLTRANSPOSEKERNEL_H
-#define ARM_COMPUTE_CLTRANSPOSEKERNEL_H
-
-#include "arm_compute/core/CL/ICLSimple2DKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel which transposes the elements of a matrix.
- *
- * [width, height, batch] -> [height, width, batch]
- *
- */
-class CLTransposeKernel : public ICLSimple2DKernel
-{
-public:
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  input  Input tensor. Data types supported: All.
-     * @param[out] output Output tensor. Data type supported: Same as @p input
-     */
-    void configure(const ICLTensor *input, ICLTensor *output);
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Input tensor. Data types supported: All.
-     * @param[out] output          Output tensor. Data type supported: Same as @p input
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLTransposeKernel
-     *
-     * @param[in] input  Input tensor. Data types supported: All.
-     * @param[in] output Output tensor. Data type supported: Same as @p input
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output);
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLTRANSPOSEKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLUpsampleLayerKernel.h b/arm_compute/core/CL/kernels/CLUpsampleLayerKernel.h
deleted file mode 100644
index e6b4209501..0000000000
--- a/arm_compute/core/CL/kernels/CLUpsampleLayerKernel.h
+++ /dev/null
@@ -1,89 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLUPSAMPLELAYERKERNEL_H
-#define ARM_COMPUTE_CLUPSAMPLELAYERKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the UpsampleLayer kernel on OpenCL. */
-class CLUpsampleLayerKernel : public ICLKernel
-{
-public:
-    /** Constructor */
-    CLUpsampleLayerKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLUpsampleLayerKernel(const CLUpsampleLayerKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLUpsampleLayerKernel &operator=(const CLUpsampleLayerKernel &) = delete;
-    /** Default Move Constructor. */
-    CLUpsampleLayerKernel(CLUpsampleLayerKernel &&) = default;
-    /** Default move assignment operator */
-    CLUpsampleLayerKernel &operator=(CLUpsampleLayerKernel &&) = default;
-    /** Default destructor */
-    ~CLUpsampleLayerKernel() = default;
-
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  input             Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
-     * @param[out] output            Destination tensor. Data types supported: same as @p input.
-     * @param[in]  info              Contains stride information described in @ref Size2D.
-     * @param[in]  upsampling_policy Defines the policy to fill the intermediate pixels.
-     */
-    void configure(const ICLTensor *input, ICLTensor *output, const Size2D &info, const InterpolationPolicy upsampling_policy);
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  compile_context   The compile context to be used.
-     * @param[in]  input             Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
-     * @param[out] output            Destination tensor. Data types supported: same as @p input.
-     * @param[in]  info              Contains stride information described in @ref Size2D.
-     * @param[in]  upsampling_policy Defines the policy to fill the intermediate pixels.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const Size2D &info, const InterpolationPolicy upsampling_policy);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLUpsampleLayerKernel
-     *
-     * @param[in] input             Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
-     * @param[in] output            Destination tensor info. Data types supported: same as @p input.
-     * @param[in] info              Contains  stride information described in @ref Size2D.
-     * @param[in] upsampling_policy Defines the policy to fill the intermediate pixels.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &info, const InterpolationPolicy upsampling_policy);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_input;
-    ICLTensor       *_output;
-    Size2D           _info;
-    DataLayout       _data_layout;
-    unsigned int     _num_elems_processed_per_iteration_input_x;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLUPSAMPLELAYERKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLWarpAffineKernel.h b/arm_compute/core/CL/kernels/CLWarpAffineKernel.h
deleted file mode 100644
index a21325e1c4..0000000000
--- a/arm_compute/core/CL/kernels/CLWarpAffineKernel.h
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLWARPAFFINEKERNEL_H
-#define ARM_COMPUTE_CLWARPAFFINEKERNEL_H
-
-#include "arm_compute/core/CL/ICLSimple2DKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the warp affine kernel.*/
-class CLWarpAffineKernel : public ICLSimple2DKernel
-{
-public:
-    /** Initialize the function's source, destination, interpolation policy and border_mode.
-     *
-     * @param[in]  input  Source tensor. Data types supported: U8.
-     * @param[out] output Destination tensor, Data types supported: U8.
-     * @param[in]  matrix The perspective matrix. Must be 2x3 of type float
-     *                    The matrix argument requires 9 values, the last 3 values are ignored.
-     * @param[in]  policy The interpolation type.
-     */
-    void configure(const ICLTensor *input, ICLTensor *output, const std::array<float, 9> &matrix, InterpolationPolicy policy);
-    /** Initialize the function's source, destination, interpolation policy and border_mode.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Source tensor. Data types supported: U8.
-     * @param[out] output          Destination tensor, Data types supported: U8.
-     * @param[in]  matrix          The perspective matrix. Must be 2x3 of type float
-     *                             The matrix argument requires 9 values, the last 3 values are ignored.
-     * @param[in]  policy          The interpolation type.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const std::array<float, 9> &matrix, InterpolationPolicy policy);
-
-    // Inherited methods overridden:
-    BorderSize border_size() const override;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLWARPAFFINEKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLWarpPerspectiveKernel.h b/arm_compute/core/CL/kernels/CLWarpPerspectiveKernel.h
deleted file mode 100644
index bb1a018a2b..0000000000
--- a/arm_compute/core/CL/kernels/CLWarpPerspectiveKernel.h
+++ /dev/null
@@ -1,59 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLWARPERSPECTIVEKERNEL_H
-#define ARM_COMPUTE_CLWARPERSPECTIVEKERNEL_H
-
-#include "arm_compute/core/CL/ICLSimple2DKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-/** Interface for the warp perspective kernel.*/
-class CLWarpPerspectiveKernel : public ICLSimple2DKernel
-{
-public:
-    /** Initialize the function's source, destination, interpolation policy and border_mode.
-     *
-     * @param[in]  input  Source tensor. Data types supported: U8.
-     * @param[out] output Destination tensor, Data types supported: U8.
-     * @param[in]  matrix The perspective matrix. Must be 3x3 of type float.
-     * @param[in]  policy The interpolation type.
-     */
-    void configure(const ICLTensor *input, ICLTensor *output, const std::array<float, 9> &matrix, InterpolationPolicy policy);
-    /** Initialize the function's source, destination, interpolation policy and border_mode.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Source tensor. Data types supported: U8.
-     * @param[out] output          Destination tensor, Data types supported: U8.
-     * @param[in]  matrix          The perspective matrix. Must be 3x3 of type float.
-     * @param[in]  policy          The interpolation type.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const std::array<float, 9> &matrix, InterpolationPolicy policy);
-
-    // Inherited methods overridden:
-    BorderSize border_size() const override;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLWARPERSPECTIVEKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLWeightsReshapeKernel.h b/arm_compute/core/CL/kernels/CLWeightsReshapeKernel.h
deleted file mode 100644
index 47e987b09b..0000000000
--- a/arm_compute/core/CL/kernels/CLWeightsReshapeKernel.h
+++ /dev/null
@@ -1,121 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLWEIGHTSRESHAPEKERNEL_H
-#define ARM_COMPUTE_CLWEIGHTSRESHAPEKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-/** OpenCL kernel to perform reshaping on the weights used by convolution and locally connected layer
- *
- * Rearranges each 3-dimensional kernel to a single row leading to a matrix with linearized kernels.
- * In combination with the @ref CLIm2ColKernel can transform a convolution to a matrix multiplication.
- *
- * For example assuming a 3D weight kernel of 3x3 dimensions and depth of 2 we have:
- * @f[
- * \left( \begin{array}{ccc}
- * a000 & a001 & a002 \\
- * a010 & a011 & a012 \\
- * a020 & a021 & a022 \\
- * \end{array} \right)
- * \left( \begin{array}{ccc}
- * a100 & a101 & a102 \\
- * a110 & a111 & a112 \\
- * a120 & a121 & a122 \\
- * \end{array} \right)
- * \rightarrow
- * \left( \begin{array}{ccccccccc}
- * a000 & a001 & a002 & a010 & a011 & a012 & a020 & a021 & a022 & a100 & a101 & a102 & a110 & a111 & a112 & a120 & a121 & a122 \\
- * \end{array} \right)
- * @f]
- */
-class CLWeightsReshapeKernel : public ICLKernel
-{
-public:
-    /** Constructor.*/
-    CLWeightsReshapeKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLWeightsReshapeKernel(const CLWeightsReshapeKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLWeightsReshapeKernel &operator=(const CLWeightsReshapeKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLWeightsReshapeKernel(CLWeightsReshapeKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLWeightsReshapeKernel &operator=(CLWeightsReshapeKernel &&) = default;
-    /** Default destructor */
-    ~CLWeightsReshapeKernel() = default;
-    /** Set the input and output of the kernel.
-     *
-     * @param[in]  input      The input tensor to convert. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] if shared,
-     *                        and 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM,  num_patches] if unshared. Data types supported: All
-     * @param[in]  biases     The shared biases tensor to append.  Bias is 1D tensor with dimensions [OFM] if shared and 2D tensor with
-     *                        dimensions [OFM, num_patches] if unshared. Data types supported: F16/F32, for quantized types this must be nullptr.
-     *                        @warning Appending biases to weights reshaped matrix is not supported for quantized asymmetric types.
-     * @param[out] output     The output tensor. Should be a 2D Tensor if there are no groups and the weights are not shared; a 3D Tensor otherwise.
-     *                        Data types supported: Same as @p input
-     * @param[in]  num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is only supported for NCHW data layout
-     *                        Number of groups greater than one are only supported for NCHW data layout, and the number of weights must be a multiple of it.
-     */
-    void configure(const ICLTensor *input, const ICLTensor *biases, ICLTensor *output, unsigned int num_groups = 1);
-    /** Set the input and output of the kernel.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           The input tensor to convert. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] if shared,
-     *                             and 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM,  num_patches] if unshared. Data types supported: All
-     * @param[in]  biases          The shared biases tensor to append.  Bias is 1D tensor with dimensions [OFM] if shared and 2D tensor with
-     *                             dimensions [OFM, num_patches] if unshared. Data types supported: F16/F32, for quantized types this must be nullptr.
-     *                             @warning Appending biases to weights reshaped matrix is not supported for quantized asymmetric types.
-     * @param[out] output          The output tensor. Should be a 2D Tensor if there are no groups and the weights are not shared; a 3D Tensor otherwise.
-     *                             Data types supported: Same as @p input
-     * @param[in]  num_groups      (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is only supported for NCHW data layout
-     *                             Number of groups greater than one are only supported for NCHW data layout, and the number of weights must be a multiple of it.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *biases, ICLTensor *output, unsigned int num_groups = 1);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLWeightsReshapeKernel
-     *
-     * @param[in] input      The input tensor to convert. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] if shared,
-     *                       and 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM,  num_patches] if unshared. Data types supported: All
-     * @param[in] biases     The shared biases tensor to append.  Bias is 1D tensor with dimensions [OFM] if shared and 2D tensor with
-     *                       dimensions [OFM, num_patches] if unshared. Data types supported: F16/F32, for quantized types this must be nullptr.
-     *                       @warning Appending biases to weights reshaped matrix is not supported for quantized asymmetric types.
-     * @param[in] output     The output tensor. Should be a 2D Tensor if there are no groups and the weights are not shared; a 3D Tensor otherwise.
-     *                       Data types supported: Same as @p input
-     * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is only supported for NCHW data layout
-     *                       Number of groups greater than one are only supported for NCHW data layout, and the number of weights must be a multiple of it.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *biases, const ITensorInfo *output, unsigned int num_groups = 1);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_input;
-    const ICLTensor *_biases;
-    ICLTensor       *_output;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLWEIGHTSRESHAPEKERNEL_H */
-\ No newline at end of file
diff --git a/arm_compute/core/CL/kernels/CLWidthConcatenate2TensorsKernel.h b/arm_compute/core/CL/kernels/CLWidthConcatenate2TensorsKernel.h
deleted file mode 100644
index a39ccc2869..0000000000
--- a/arm_compute/core/CL/kernels/CLWidthConcatenate2TensorsKernel.h
+++ /dev/null
@@ -1,87 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef ARM_COMPUTE_CLWIDTHCONCATENATE_2TENSORS_KERNEL_H
-#define ARM_COMPUTE_CLWIDTHCONCATENATE_2TENSORS_KERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the width concatenate kernel of 2 tensors.
- *  The input1 and input2 tensors will be concatenated into the output tensor.
- */
-class CLWidthConcatenate2TensorsKernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLWidthConcatenate2TensorsKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLWidthConcatenate2TensorsKernel(const CLWidthConcatenate2TensorsKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLWidthConcatenate2TensorsKernel &operator=(const CLWidthConcatenate2TensorsKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLWidthConcatenate2TensorsKernel(CLWidthConcatenate2TensorsKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLWidthConcatenate2TensorsKernel &operator=(CLWidthConcatenate2TensorsKernel &&) = default;
-    /** Default destructor */
-    ~CLWidthConcatenate2TensorsKernel() = default;
-    /** Initialise the kernel's input1s and output
-     *
-     * @param[in]  input1 First input tensor. Data types supported: All.
-     * @param[in]  input2 Second input tensor. Data types supported: same as @p input1
-     * @param[out] output Output tensor. Data types supported: Same as @p input1.
-     */
-    void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output);
-    /** Initialise the kernel's input1s and output
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input1          First input tensor. Data types supported: All.
-     * @param[in]  input2          Second input tensor. Data types supported: same as @p input1
-     * @param[out] output          Output tensor. Data types supported: Same as @p input1.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output);
-    /**  Static function to check if given info will lead to a valid configuration of @ref CLWidthConcatenate2TensorsKernel
-     *
-     * @param[in] input1 First tensor info. Data types supported: All.
-     * @param[in] input2 Second tensor info. Data types supported: same as @p input1
-     * @param[in] output Output tensor info. Data types supported: Same as @p input1.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_input1;
-    const ICLTensor *_input2;
-    ICLTensor       *_output;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLWIDTHCONCATENATE_2TENSORS_KERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLWidthConcatenate4TensorsKernel.h b/arm_compute/core/CL/kernels/CLWidthConcatenate4TensorsKernel.h
deleted file mode 100644
index 0e0eae6e85..0000000000
--- a/arm_compute/core/CL/kernels/CLWidthConcatenate4TensorsKernel.h
+++ /dev/null
@@ -1,95 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef ARM_COMPUTE_CLWIDTHCONCATENATE_4TENSORS_KERNEL_H
-#define ARM_COMPUTE_CLWIDTHCONCATENATE_4TENSORS_KERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the width concatenate kernel of 4 tensors.
- *  All input tensors will be concatenated into the output tensor.
- */
-class CLWidthConcatenate4TensorsKernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLWidthConcatenate4TensorsKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLWidthConcatenate4TensorsKernel(const CLWidthConcatenate4TensorsKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLWidthConcatenate4TensorsKernel &operator=(const CLWidthConcatenate4TensorsKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLWidthConcatenate4TensorsKernel(CLWidthConcatenate4TensorsKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLWidthConcatenate4TensorsKernel &operator=(CLWidthConcatenate4TensorsKernel &&) = default;
-    /** Default destructor */
-    ~CLWidthConcatenate4TensorsKernel() = default;
-    /** Initialise the kernel's input1s and output
-     *
-     * @param[in]  input1 First input tensor. Data types supported: All.
-     * @param[in]  input2 Second input tensor. Data types supported: same as @p input1
-     * @param[in]  input3 Third input tensor. Data types supported: same as @p input1
-     * @param[in]  input4 Fourth input tensor. Data types supported: same as @p input1
-     * @param[out] output Output tensor. Data types supported: Same as @p input1.
-     */
-    void configure(const ICLTensor *input1, const ICLTensor *input2, const ICLTensor *input3, const ICLTensor *input4, ICLTensor *output);
-    /** Initialise the kernel's input1s and output
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input1          First input tensor. Data types supported: All.
-     * @param[in]  input2          Second input tensor. Data types supported: same as @p input1
-     * @param[in]  input3          Third input tensor. Data types supported: same as @p input1
-     * @param[in]  input4          Fourth input tensor. Data types supported: same as @p input1
-     * @param[out] output          Output tensor. Data types supported: Same as @p input1.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, const ICLTensor *input3, const ICLTensor *input4, ICLTensor *output);
-    /**  Static function to check if given info will lead to a valid configuration of @ref CLWidthConcatenate4TensorsKernel
-     *
-     * @param[in] input1 First tensor info. Data types supported: All.
-     * @param[in] input2 Second tensor info. Data types supported: same as @p input1
-     * @param[in] input3 Third tensor info. Data types supported: same as @p input1
-     * @param[in] input4 Fourth tensor info. Data types supported: same as @p input1
-     * @param[in] output Output tensor info. Data types supported: Same as @p input1.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *input3, const ITensorInfo *input4, const ITensorInfo *output);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_input1;
-    const ICLTensor *_input2;
-    const ICLTensor *_input3;
-    const ICLTensor *_input4;
-    ICLTensor       *_output;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLWIDTHCONCATENATE_4TENSORS_KERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLWidthConcatenateLayerKernel.h b/arm_compute/core/CL/kernels/CLWidthConcatenateLayerKernel.h
deleted file mode 100644
index ef5851fa9a..0000000000
--- a/arm_compute/core/CL/kernels/CLWidthConcatenateLayerKernel.h
+++ /dev/null
@@ -1,89 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef ARM_COMPUTE_CLWIDTHCONCATENATELAYERKERNEL_H
-#define ARM_COMPUTE_CLWIDTHCONCATENATELAYERKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the width concatenate kernel.
- *  The input tensor will be concatenated into the output tensor.
- */
-class CLWidthConcatenateLayerKernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLWidthConcatenateLayerKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLWidthConcatenateLayerKernel(const CLWidthConcatenateLayerKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLWidthConcatenateLayerKernel &operator=(const CLWidthConcatenateLayerKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLWidthConcatenateLayerKernel(CLWidthConcatenateLayerKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLWidthConcatenateLayerKernel &operator=(CLWidthConcatenateLayerKernel &&) = default;
-    /** Default destructor */
-    ~CLWidthConcatenateLayerKernel() = default;
-    /** Initialise the kernel's inputs and output
-     *
-     * @param[in]     input        Input tensor. Data types supported: All.
-     * @param[in]     width_offset The offset on the X axis.
-     * @param[in,out] output       Output tensor. Data types supported: Same as @p input.
-     *
-     */
-    void configure(const ICLTensor *input, unsigned int width_offset, ICLTensor *output);
-    /** Initialise the kernel's inputs and output
-     *
-     * @param[in]     compile_context The compile context to be used.
-     * @param[in]     input           Input tensor. Data types supported: All.
-     * @param[in]     width_offset    The offset on the X axis.
-     * @param[in,out] output          Output tensor. Data types supported: Same as @p input.
-     *
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, unsigned int width_offset, ICLTensor *output);
-    /**  Static function to check if given info will lead to a valid configuration of @ref CLWidthConcatenateLayerKernel
-     *
-     * @param[in] input        Input tensor info. Data types supported: All.
-     * @param[in] width_offset The offset on the X axis.
-     * @param[in] output       Output tensor info. Data types supported: Same as @p input.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, unsigned int width_offset, const ITensorInfo *output);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_input;
-    ICLTensor       *_output;
-    unsigned int     _width_offset;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLWIDTHCONCATENATELAYERKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLWinogradFilterTransformKernel.h b/arm_compute/core/CL/kernels/CLWinogradFilterTransformKernel.h
deleted file mode 100644
index 5b2dc8cfc9..0000000000
--- a/arm_compute/core/CL/kernels/CLWinogradFilterTransformKernel.h
+++ /dev/null
@@ -1,115 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLWINOGRADFILTERTRANSFORMKERNEL_H
-#define ARM_COMPUTE_CLWINOGRADFILTERTRANSFORMKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the Winograd filter transform kernel. */
-class CLWinogradFilterTransformKernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLWinogradFilterTransformKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLWinogradFilterTransformKernel(const CLWinogradFilterTransformKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLWinogradFilterTransformKernel &operator=(const CLWinogradFilterTransformKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLWinogradFilterTransformKernel(CLWinogradFilterTransformKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLWinogradFilterTransformKernel &operator=(CLWinogradFilterTransformKernel &&) = default;
-    /** Default destructor */
-    ~CLWinogradFilterTransformKernel() = default;
-    /** Set the input and output tensor.
-     *
-     * @note Winograd filter transform supports the following configurations for NCWH data layout
-     *       F(output tile, kernel size):F(2x2, 3x3), F(2x1, 3x1), F(1x2, 1x3),
-     *                                   F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3),
-     *                                   F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5)
-     *
-     * @note Winograd filter transform supports the following configurations for NHWC data layout
-     *       F(output tile, kernel size):F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3),
-     *                                   F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5)
-     *
-     *       Strides: only unit strides
-     *
-     * @param[in]  input         Source tensor. The input is a 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] (NCHW data layout) or [IFM, kernel_x, kernel_y, OFM] (NHWC data layout). Data types supported: F16/F32.
-     * @param[out] output        The output tensor. The shape for this tensor can be calculated using the utility function @p compute_winograd_filter_transform_shape. Data types supported: Same as @p input
-     * @param[in]  winograd_info Contains Winograd's information described in @ref WinogradInfo
-     */
-    void configure(const ICLTensor *input, ICLTensor *output, const WinogradInfo &winograd_info);
-    /** Set the input and output tensor.
-     *
-     * @note Winograd filter transform supports the following configurations for NCWH data layout
-     *       F(output tile, kernel size):F(2x2, 3x3), F(2x1, 3x1), F(1x2, 1x3),
-     *                                   F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3),
-     *                                   F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5)
-     *
-     * @note Winograd filter transform supports the following configurations for NHWC data layout
-     *       F(output tile, kernel size):F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3),
-     *                                   F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5)
-     *
-     *       Strides: only unit strides
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Source tensor. The input is a 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] (NCHW data layout) or [IFM, kernel_x, kernel_y, OFM] (NHWC data layout). Data types supported: F16/F32.
-     * @param[out] output          The output tensor. The shape for this tensor can be calculated using the utility function @p compute_winograd_filter_transform_shape. Data types supported: Same as @p input
-     * @param[in]  winograd_info   Contains Winograd's information described in @ref WinogradInfo
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const WinogradInfo &winograd_info);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLWinogradFilterTransformKernel
-     *
-     * @note Winograd filter transform supports the following configurations for NCWH data layout
-     *       F(output tile, kernel size):F(2x2, 3x3), F(2x1, 3x1), F(1x2, 1x3),
-     *                                   F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3),
-     *                                   F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5)
-     *
-     * @note Winograd filter transform supports the following configurations for NHWC data layout
-     *       F(output tile, kernel size):F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3),
-     *                                   F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5)
-     *
-     *       Strides: only unit strides
-     *
-     * @param[in]  input         Source tensor. The input is a 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] (NCHW data layout) or [IFM, kernel_x, kernel_y, OFM] (NHWC data layout). Data types supported: F16/F32.
-     * @param[out] output        The output tensor. The shape for this tensor can be calculated using the utility function @p compute_winograd_filter_transform_shape. Data types supported: Same as @p input
-     * @param[in]  winograd_info Contains Winograd's information described in @ref WinogradInfo
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const WinogradInfo &winograd_info);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_input;
-    ICLTensor       *_output;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLWINOGRADFILTERTRANSFORMKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLWinogradInputTransformKernel.h b/arm_compute/core/CL/kernels/CLWinogradInputTransformKernel.h
deleted file mode 100644
index a305126f2d..0000000000
--- a/arm_compute/core/CL/kernels/CLWinogradInputTransformKernel.h
+++ /dev/null
@@ -1,121 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLWINOGRADINPUTTRANSFORMKERNEL_H
-#define ARM_COMPUTE_CLWINOGRADINPUTTRANSFORMKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel to perform Winograd input transform.*/
-class CLWinogradInputTransformKernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLWinogradInputTransformKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLWinogradInputTransformKernel(const CLWinogradInputTransformKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLWinogradInputTransformKernel &operator=(const CLWinogradInputTransformKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLWinogradInputTransformKernel(CLWinogradInputTransformKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLWinogradInputTransformKernel &operator=(CLWinogradInputTransformKernel &&) = default;
-    /** Set the input and output of the kernel.
-     *
-     * @note Winograd input transform supports the following configurations for NCWH data layout
-     *       F(output tile, kernel size):F(2x2, 3x3), F(2x1, 3x1), F(1x2, 1x3),
-     *                                   F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3),
-     *                                   F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5)
-     *
-     * @note Winograd input transform supports the following configurations for NHWC data layout
-     *       F(output tile, kernel size):F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3),
-     *                                   F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5)
-     *
-     *       Strides: only unit strides
-     *
-     * @param[in] input         The input tensor to transform. Data types supported: F16/F32
-     * @param[in] output        The output tensor. The shape for this tensor can be calculated using the utility function @p compute_winograd_input_transform_shape. Data types supported: Same as @p input
-     * @param[in] winograd_info Contains Winograd's information described in @ref WinogradInfo.
-     */
-    void configure(const ICLTensor *input, ICLTensor *output, const WinogradInfo &winograd_info);
-    /** Set the input and output of the kernel.
-     *
-     * @note Winograd input transform supports the following configurations for NCWH data layout
-     *       F(output tile, kernel size):F(2x2, 3x3), F(2x1, 3x1), F(1x2, 1x3),
-     *                                   F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3),
-     *                                   F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5)
-     *
-     * @note Winograd input transform supports the following configurations for NHWC data layout
-     *       F(output tile, kernel size):F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3),
-     *                                   F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5)
-     *
-     *       Strides: only unit strides
-     *
-     * @param[in] compile_context The compile context to be used.
-     * @param[in] input           The input tensor to transform. Data types supported: F16/F32
-     * @param[in] output          The output tensor. The shape for this tensor can be calculated using the utility function @p compute_winograd_input_transform_shape. Data types supported: Same as @p input
-     * @param[in] winograd_info   Contains Winograd's information described in @ref WinogradInfo.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const WinogradInfo &winograd_info);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLWinogradInputTransformKernel
-     *
-     * @note Winograd input transform supports the following configurations for NCWH data layout
-     *       F(output tile, kernel size):F(2x2, 3x3), F(2x1, 3x1), F(1x2, 1x3),
-     *                                   F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3),
-     *                                   F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5)
-     *
-     * @note Winograd input transform supports the following configurations for NHWC data layout
-     *       F(output tile, kernel size):F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3),
-     *                                   F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5)
-     *
-     *       Strides: only unit strides
-     *
-     * @param[in] input         The input tensor to transform. Data types supported: F16/F32
-     * @param[in] output        The output tensor. The shape for this tensor can be calculated using the utility function @p compute_winograd_input_transform_shape. Data types supported: Same as @p input
-     * @param[in] winograd_info Contains Winograd's information described in @ref WinogradInfo.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const WinogradInfo &winograd_info);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-    BorderSize border_size() const override;
-
-private:
-    using WinogradKey = std::pair<std::pair<int, int>, std::pair<int, int>>;
-
-    BorderSize       _border_size;
-    const ICLTensor *_input;
-    ICLTensor       *_output;
-    DataLayout       _data_layout;
-    int              _num_tiles_x;
-    int              _num_tiles_y;
-    unsigned int     _step_z;
-};
-} // arm_compute
-#endif /*ARM_COMPUTE_CLWINOGRADINPUTTRANSFORMKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLWinogradOutputTransformKernel.h b/arm_compute/core/CL/kernels/CLWinogradOutputTransformKernel.h
deleted file mode 100644
index 512b352637..0000000000
--- a/arm_compute/core/CL/kernels/CLWinogradOutputTransformKernel.h
+++ /dev/null
@@ -1,127 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLWINOGRADOUTPUTTRANSFORMKERNEL_H
-#define ARM_COMPUTE_CLWINOGRADOUTPUTTRANSFORMKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the Winograd output transform kernel. */
-class CLWinogradOutputTransformKernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLWinogradOutputTransformKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLWinogradOutputTransformKernel(const CLWinogradOutputTransformKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLWinogradOutputTransformKernel &operator=(const CLWinogradOutputTransformKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLWinogradOutputTransformKernel(CLWinogradOutputTransformKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLWinogradOutputTransformKernel &operator=(CLWinogradOutputTransformKernel &&) = default;
-    /** Default destructor */
-    ~CLWinogradOutputTransformKernel() = default;
-    /** Set the input and output tensor.
-     *
-     * @note Winograd output transform supports the following configurations for NCWH data layout
-     *       F(output tile, kernel size):F(2x2, 3x3), F(2x1, 3x1), F(1x2, 1x3),
-     *                                   F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3),
-     *                                   F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5)
-     *
-     * @note Winograd output transform supports the following configurations for NHWC data layout
-     *       F(output tile, kernel size):F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3),
-     *                                   F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5)
-     *
-     *       Strides: only unit strides
-     *
-     * @param[in]  input         Source tensor with shape [C, N, K, batches]. Data types supported: F16/F32.
-     * @param[in]  bias          Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. It can be a nullptr. Data type supported: as @p input
-     * @param[out] output        The output tensor. The shape for this tensor can be calculated using the utility function @p compute_winograd_output_transform_shape. Data types supported: Same as @p input
-     * @param[in]  winograd_info Contains Winograd's information described in @ref WinogradInfo
-     * @param[in]  act_info      (Optional) Activation layer information in case of a fused activation.
-     */
-    void configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info = ActivationLayerInfo());
-    /** Set the input and output tensor.
-     *
-     * @note Winograd output transform supports the following configurations for NCWH data layout
-     *       F(output tile, kernel size):F(2x2, 3x3), F(2x1, 3x1), F(1x2, 1x3),
-     *                                   F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3),
-     *                                   F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5)
-     *
-     * @note Winograd output transform supports the following configurations for NHWC data layout
-     *       F(output tile, kernel size):F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3),
-     *                                   F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5)
-     *
-     *       Strides: only unit strides
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Source tensor with shape [C, N, K, batches]. Data types supported: F16/F32.
-     * @param[in]  bias            Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. It can be a nullptr. Data type supported: as @p input
-     * @param[out] output          The output tensor. The shape for this tensor can be calculated using the utility function @p compute_winograd_output_transform_shape. Data types supported: Same as @p input
-     * @param[in]  winograd_info   Contains Winograd's information described in @ref WinogradInfo
-     * @param[in]  act_info        (Optional) Activation layer information in case of a fused activation.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const WinogradInfo &winograd_info,
-                   const ActivationLayerInfo &act_info = ActivationLayerInfo());
-
-    /** Static function to check if given info will lead to a valid configuration of @ref CLWinogradOutputTransformKernel
-     *
-     * @note Winograd output transform supports the following configurations for NCWH data layout
-     *       F(output tile, kernel size):F(2x2, 3x3), F(2x1, 3x1), F(1x2, 1x3),
-     *                                   F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3),
-     *                                   F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5)
-     *
-     * @note Winograd output transform supports the following configurations for NHWC data layout
-     *       F(output tile, kernel size):F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3),
-     *                                   F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5)
-     *
-     *       Strides: only unit strides
-     *
-     * @param[in]  input         Source tensor with shape [C, N, K, batches]. Data types supported: F16/F32.
-     * @param[in]  bias          Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. It can be a nullptr. Data type supported: as @p input
-     * @param[out] output        The output tensor. The shape for this tensor can be calculated using the utility function @p compute_winograd_output_transform_shape. Data types supported: Same as @p input
-     * @param[in]  winograd_info Contains Winograd's information described in @ref WinogradInfo
-     * @param[in]  act_info      (Optional) Activation layer information in case of a fused activation @ref ActivationLayerInfo. Only RELU, BOUNDED_RELU, LU_BOUNDED_RELU, LEAKY_RELU and SOFT_RELU supported.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info = ActivationLayerInfo());
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    using WinogradKey = std::pair<std::pair<int, int>, std::pair<int, int>>;
-
-    const ICLTensor *_input;
-    const ICLTensor *_bias;
-    ICLTensor       *_output;
-    bool             _is_nhwc;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLWINOGRADOUTPUTTRANSFORMKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLYOLOLayerKernel.h b/arm_compute/core/CL/kernels/CLYOLOLayerKernel.h
deleted file mode 100644
index d0c4a9e417..0000000000
--- a/arm_compute/core/CL/kernels/CLYOLOLayerKernel.h
+++ /dev/null
@@ -1,98 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLYOLOLAYERKERNEL_H
-#define ARM_COMPUTE_CLYOLOLAYERKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the YOLO layer kernel that performs partial activation.
- *  For each box, activate only:
- *    - x and y position (channel 0 and 1 of each box)
- *    - objectiveness    (channel 4 of each box)
- *    - classes          (channel 5 to (classes - 5) of each box)
- */
-class CLYOLOLayerKernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLYOLOLayerKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLYOLOLayerKernel(const CLYOLOLayerKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLYOLOLayerKernel &operator=(const CLYOLOLayerKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLYOLOLayerKernel(CLYOLOLayerKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLYOLOLayerKernel &operator=(CLYOLOLayerKernel &&) = default;
-    /** Default destructor */
-    ~CLYOLOLayerKernel() = default;
-    /** Set the input and output tensor.
-     *
-     * @note If the output tensor is a nullptr, the activation function will be performed in-place
-     *
-     * @param[in, out] input       Source tensor. In case of @p output tensor = nullptr, this tensor will store the result
-     *                             of the activation function. Data types supported: F16/F32.
-     * @param[out]     output      Destination tensor. Data type supported: same as @p input
-     * @param[in]      act_info    Activation layer information.
-     * @param[in]      num_classes Number of classes to activate (must be submultiple of @p input channels)
-     */
-    void configure(ICLTensor *input, ICLTensor *output, const ActivationLayerInfo &act_info, int32_t num_classes);
-    /** Set the input and output tensor.
-     *
-     * @note If the output tensor is a nullptr, the activation function will be performed in-place
-     *
-     * @param[in]      compile_context The compile context to be used.
-     * @param[in, out] input           Source tensor. In case of @p output tensor = nullptr, this tensor will store the result
-     *                                 of the activation function. Data types supported: F16/F32.
-     * @param[out]     output          Destination tensor. Data type supported: same as @p input
-     * @param[in]      act_info        Activation layer information.
-     * @param[in]      num_classes     Number of classes to activate (must be submultiple of @p input channels)
-     */
-    void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const ActivationLayerInfo &act_info, int32_t num_classes);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLYOLOLayerKernel
-     *
-     * @param[in] input       Source tensor info. In case of @p output tensor info = nullptr, this tensor will store the result
-     *                        of the activation function. Data types supported: F16/F32.
-     * @param[in] output      Destination tensor info. Data type supported: same as @p input
-     * @param[in] act_info    Activation layer information.
-     * @param[in] num_classes Number of classes to activate (must be submultiple of @p input channels)
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info, int32_t num_classes);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    ICLTensor *_input;
-    ICLTensor *_output;
-    bool       _run_in_place;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLYOLOLAYERKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/ICLDepthwiseConvolutionLayer3x3Kernel.h b/arm_compute/core/CL/kernels/ICLDepthwiseConvolutionLayer3x3Kernel.h
deleted file mode 100644
index f0f7754960..0000000000
--- a/arm_compute/core/CL/kernels/ICLDepthwiseConvolutionLayer3x3Kernel.h
+++ /dev/null
@@ -1,105 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_ICLDEPTHWISECONVOLUTIONKERNEL3x3_H
-#define ARM_COMPUTE_ICLDEPTHWISECONVOLUTIONKERNEL3x3_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the kernel to run a 3x3 depthwise convolution on a tensor.
- */
-class ICLDepthwiseConvolutionLayer3x3Kernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    ICLDepthwiseConvolutionLayer3x3Kernel()
-        : _border_size(0), _input(), _output(), _weights(), _biases(), _conv_stride_y(1), _output_multipliers(), _output_shifts(), _is_quantized(false)
-    {
-    }
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    ICLDepthwiseConvolutionLayer3x3Kernel(const ICLDepthwiseConvolutionLayer3x3Kernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    ICLDepthwiseConvolutionLayer3x3Kernel &operator=(const ICLDepthwiseConvolutionLayer3x3Kernel &) = delete;
-    /** Default Move Constructor. */
-    ICLDepthwiseConvolutionLayer3x3Kernel(ICLDepthwiseConvolutionLayer3x3Kernel &&) = default;
-    /** Default move assignment operator */
-    ICLDepthwiseConvolutionLayer3x3Kernel &operator=(ICLDepthwiseConvolutionLayer3x3Kernel &&) = default;
-    /** Initialize the function's source, destination, conv and border_size.
-     *
-     * @param[in]  input              Source tensor. DataType supported: QASYMM8/F16/F32.
-     * @param[in]  weights            Weights tensor. A 3D tensor with dimensions [3, 3, IFM].
-     *                                Data type supported: Same as @p input, QASYMM8/QSYMM8_PER_CHANNEL when input is QASYMM8.
-     * @param[in]  biases             Biases tensor. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed.
-     *                                Data type supported: Same as @p input, S32 when input is QASYMM8.
-     * @param[out] output             Destination tensor. Data type supported: Same as @p input.
-     * @param[in]  conv_info          Padding and stride information to use for the convolution.
-     * @param[in]  depth_multiplier   (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
-     * @param[in]  act_info           (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU are supported for QASYMM8.
-     * @param[in]  dilation           (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
-     * @param[in]  output_multipliers (Optional) Output multipliers tensor for quantized computations. In case of per-channel quantization,
-     *                                the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
-     * @param[in]  output_shifts      (Optional) Output shifts tensor for quantized computations. In case of per-channel quantization,
-     *                                the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
-     */
-    virtual void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
-                           unsigned int depth_multiplier = 1, ActivationLayerInfo act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U),
-                           const ICLTensor *output_multipliers = nullptr, const ICLTensor *output_shifts = nullptr) = 0;
-    /** Initialize the function's source, destination, conv and border_size.
-     *
-     * @param[in]  compile_context    The compile context to be used.
-     * @param[in]  input              Source tensor. DataType supported: QASYMM8/F16/F32.
-     * @param[in]  weights            Weights tensor. A 3D tensor with dimensions [3, 3, IFM].
-     *                                Data type supported: Same as @p input, QASYMM8/QSYMM8_PER_CHANNEL when input is QASYMM8.
-     * @param[in]  biases             Biases tensor. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed.
-     *                                Data type supported: Same as @p input, S32 when input is QASYMM8.
-     * @param[out] output             Destination tensor. Data type supported: Same as @p input.
-     * @param[in]  conv_info          Padding and stride information to use for the convolution.
-     * @param[in]  depth_multiplier   (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
-     * @param[in]  act_info           (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU are supported for QASYMM8.
-     * @param[in]  dilation           (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
-     * @param[in]  output_multipliers (Optional) Output multipliers tensor for quantized computations. In case of per-channel quantization,
-     *                                the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
-     * @param[in]  output_shifts      (Optional) Output shifts tensor for quantized computations. In case of per-channel quantization,
-     *                                the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
-     */
-    virtual void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
-                           unsigned int depth_multiplier = 1, ActivationLayerInfo act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U),
-                           const ICLTensor *output_multipliers = nullptr, const ICLTensor *output_shifts = nullptr) = 0;
-
-protected:
-    BorderSize       _border_size;
-    const ICLTensor *_input;
-    ICLTensor       *_output;
-    const ICLTensor *_weights;
-    const ICLTensor *_biases;
-    unsigned int     _conv_stride_y;
-    const ICLTensor *_output_multipliers;
-    const ICLTensor *_output_shifts;
-    bool             _is_quantized;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_ICLDEPTHWISECONVOLUTIONKERNEL3x3_H */
diff --git a/arm_compute/core/CPP/CPPKernels.h b/arm_compute/core/CPP/CPPKernels.h
index c7b40baf22..f6f36596c4 100644
--- a/arm_compute/core/CPP/CPPKernels.h
+++ b/arm_compute/core/CPP/CPPKernels.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -26,11 +26,8 @@
 
 /* Header regrouping all the CPP kernels */
 #include "arm_compute/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.h"
-#include "arm_compute/core/CPP/kernels/CPPCornerCandidatesKernel.h"
-#include "arm_compute/core/CPP/kernels/CPPDetectionWindowNonMaximaSuppressionKernel.h"
 #include "arm_compute/core/CPP/kernels/CPPNonMaximumSuppressionKernel.h"
 #include "arm_compute/core/CPP/kernels/CPPPermuteKernel.h"
-#include "arm_compute/core/CPP/kernels/CPPSortEuclideanDistanceKernel.h"
 #include "arm_compute/core/CPP/kernels/CPPTopKVKernel.h"
 #include "arm_compute/core/CPP/kernels/CPPUpsampleKernel.h"
 
diff --git a/arm_compute/core/CPP/CPPTypes.h b/arm_compute/core/CPP/CPPTypes.h
index d3f6fc944d..e5322bdcb1 100644
--- a/arm_compute/core/CPP/CPPTypes.h
+++ b/arm_compute/core/CPP/CPPTypes.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2022, 2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,111 +21,123 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_CPP_TYPES_H
-#define ARM_COMPUTE_CPP_TYPES_H
+#ifndef ACL_ARM_COMPUTE_CORE_CPP_CPPTYPES_H
+#define ACL_ARM_COMPUTE_CORE_CPP_CPPTYPES_H
 
 #include "arm_compute/core/Error.h"
 
-#include <array>
-#include <string>
-#include <vector>
+#include <memory>
 
 namespace arm_compute
 {
-/** CPU models - we only need to detect CPUs we have
- * microarchitecture-specific code for.
- *
- * Architecture features are detected via HWCAPs.
- */
-enum class CPUModel
+namespace cpuinfo
 {
-    GENERIC,
-    GENERIC_FP16,
-    GENERIC_FP16_DOT,
-    A53,
-    A55r0,
-    A55r1
-};
-
-/** Global memory policy.
- * The functions in the runtime will use different strategies based on the policy currently set.
- *
- * MINIMIZE will try to reduce the amount allocated by the functions at the expense of performance normally.
- * NORMAL won't try to save any memory and will favor speed over memory consumption
+struct CpuIsaInfo;
+} // namespace cpuinfo
+
+#define ARM_COMPUTE_CPU_MODEL_LIST \
+    X(GENERIC)                     \
+    X(GENERIC_FP16)                \
+    X(GENERIC_FP16_DOT)            \
+    X(A53)                         \
+    X(A55r0)                       \
+    X(A55r1)                       \
+    X(A35)                         \
+    X(A73)                         \
+    X(A76)                         \
+    X(A510)                        \
+    X(X1)                          \
+    X(V1)                          \
+    X(A64FX)                       \
+    X(N1)
+
+/** CPU model types
  *
+ * @note We only need to detect CPUs we have microarchitecture-specific code for.
+ * @note Architecture features are detected via HWCAPs.
  */
-enum class MemoryPolicy
+enum class CPUModel
 {
-    MINIMIZE,
-    NORMAL
+#define X(model) model,
+    ARM_COMPUTE_CPU_MODEL_LIST
+#undef X
 };
 
-/** Convert a cpumodel value to a string
- *
- * @param val CPUModel value to be converted
- *
- * @return String representing the corresponding CPUModel.
- */
-inline std::string cpu_model_to_string(CPUModel val)
-{
-    switch(val)
-    {
-        case CPUModel::GENERIC:
-        {
-            return std::string("GENERIC");
-        }
-        case CPUModel::GENERIC_FP16:
-        {
-            return std::string("GENERIC_FP16");
-        }
-        case CPUModel::GENERIC_FP16_DOT:
-        {
-            return std::string("GENERIC_FP16_DOT");
-        }
-        case CPUModel::A53:
-        {
-            return std::string("A53");
-        }
-        case CPUModel::A55r0:
-        {
-            return std::string("A55r0");
-        }
-        case CPUModel::A55r1:
-        {
-            return std::string("A55r1");
-        }
-        default:
-        {
-            ARM_COMPUTE_ERROR("Invalid CPUModel.");
-            return std::string("GENERIC");
-        }
-    }
-}
-
 class CPUInfo final
 {
-public:
-    /** Constructor */
+protected:
     CPUInfo();
+    ~CPUInfo();
 
-    /** Disable copy constructor and assignment operator to avoid copying the vector of CPUs each time
-     *  CPUInfo is initialized once in the IScheduler and ThreadInfo will get a pointer to it.
+public:
+    /** Access the CPUInfo singleton.
+     * This method has been deprecated and will be removed in future releases
+     * @return The CPUInfo instance.
      */
-    CPUInfo &operator=(const CPUInfo &cpuinfo) = delete;
-    CPUInfo(const CPUInfo &cpuinfo)            = delete;
-    CPUInfo &operator=(CPUInfo &&cpuinfo) = default;
-    CPUInfo(CPUInfo &&cpuinfo)            = default;
+    static CPUInfo &get();
+
+    /* Delete move and copy constructors and assignment
+     * operators */
+    CPUInfo(CPUInfo const &)            = delete; // Copy construct
+    CPUInfo(CPUInfo &&)                 = delete; // Move construct
+    CPUInfo &operator=(CPUInfo const &) = delete; // Copy assign
+    CPUInfo &operator=(CPUInfo &&)      = delete; // Move assign
 
     /** Checks if the cpu model supports fp16.
      *
-     * @return true of the cpu supports fp16, false otherwise
+     * @return true if the cpu supports fp16, false otherwise
      */
     bool has_fp16() const;
+    /** Checks if the cpu model supports bf16.
+     *
+     * @return true if the cpu supports bf16, false otherwise
+     */
+    bool has_bf16() const;
+    /** Checks if the cpu model supports sve bf16.
+     *
+     * @return true if the cpu supports sve bf16, false otherwise
+     */
+    bool has_svebf16() const;
     /** Checks if the cpu model supports dot product.
      *
-     * @return true of the cpu supports dot product, false otherwise
+     * @return true if the cpu supports dot product, false otherwise
      */
     bool has_dotprod() const;
+    /** Checks if the cpu model supports sve fp32 matrix multiplication.
+     *
+     * @return true if the cpu supports sve fp32 matrix multiplication, false otherwise
+     */
+    bool has_svef32mm() const;
+    /** Checks if the cpu model supports integer matrix multiplication.
+     *
+     * @return true if the cpu supports integer matrix multiplication, false otherwise
+     */
+    bool has_i8mm() const;
+    /** Checks if the cpu model supports sve integer matrix multiplication.
+     *
+     * @return true if the cpu supports sve integer matrix multiplication, false otherwise
+     */
+    bool has_svei8mm() const;
+    /** Checks if the cpu model supports sve.
+     *
+     * @return true if the cpu supports sve, false otherwise
+     */
+    bool has_sve() const;
+    /** Checks if the cpu model supports sve2.
+     *
+     * @return true if the cpu supports sve2, false otherwise
+     */
+    bool has_sve2() const;
+    /** Checks if the cpu model supports sme.
+     *
+     * @return true if the cpu supports sme, false otherwise
+     */
+    bool has_sme() const;
+    /** Checks if the cpu model supports sme2.
+     *
+     * @return true if the cpu supports sme2, false otherwise
+     */
+    bool has_sme2() const;
     /** Gets the cpu model for a given cpuid.
      *
      * @param[in] cpuid the id of the cpu core to be retrieved,
@@ -138,6 +150,11 @@ public:
      * @return Current thread's @ref CPUModel
      */
     CPUModel get_cpu_model() const;
+    /** Gets the current cpu's ISA information
+     *
+     * @return Current cpu's ISA information
+     */
+    cpuinfo::CpuIsaInfo get_isa() const;
     /** Gets the L1 cache size
      *
      * @return the size of the L1 cache
@@ -148,85 +165,41 @@ public:
      * @return the size of the L1 cache
      */
     unsigned int get_L2_cache_size() const;
-    /** Set the L1 cache size
-     *
-     * @param[in] size the new size to be set.
-     */
-    void set_L1_cache_size(unsigned int size);
-    /** Set the L2 cache size
-     *
-     * @param[in] size the new size to be set.
-     */
-    void set_L2_cache_size(unsigned int size);
-    /** Set fp16 support
-     *
-     * @param[in] fp16 whether the cpu supports fp16.
-     */
-    void set_fp16(const bool fp16);
-    /** Set dot product support
-     *
-     * @param[in] dotprod whether the cpu supports dot product.
-     */
-    void set_dotprod(const bool dotprod);
-    /** Set the cpumodel for a given cpu core
-     *
-     * @param[in] cpuid the id of the core to be set.
-     * @param[in] model the @ref CPUModel to be set.
-     */
-    void set_cpu_model(unsigned int cpuid, CPUModel model);
-    /** Set max number of cpus
-     *
-     * @param[in] cpu_count the number of CPUs in the system.
-     */
-    void set_cpu_num(unsigned int cpu_count);
-
     /** Return the maximum number of CPUs present
      *
      * @return Number of CPUs
      */
     unsigned int get_cpu_num() const;
-
-private:
-    std::vector<CPUModel> _percpu        = {};
-    bool                  _fp16          = false;
-    bool                  _dotprod       = false;
-    unsigned int          _L1_cache_size = 32768;
-    unsigned int          _L2_cache_size = 262144;
-};
-
-class MEMInfo final
-{
-public:
-    MEMInfo();
-
-    /** Return the total amount of RAM memory in the system expressed in KB.
+    /** Return the maximum number of CPUs present excluding the little cores
+     * in case of an Android device
      *
-     * @return Total memory
+     * @return Number of CPUs excluding little
      */
-    size_t get_total_in_kb() const;
-
-    static void set_policy(MemoryPolicy policy);
-    static MemoryPolicy get_policy();
+    unsigned int get_cpu_num_excluding_little() const;
+    /** Return whether the device has little, medium and big CPUs in case
+     * of an Android device, returns false otherwise
+     *
+     * @return Whether the device has little, medium and big CPUs
+     */
+    bool cpu_has_little_mid_big() const;
 
-    /** Common memory sizes expressed in Kb to avoid having them
-     *  duplicated throughout the code.
+    /** Return the vector length in bytes for sme2
+     *
+     * @return Vector length if sme2 is enabled, otherwise returns 0.
      */
-    static const size_t ONE_GB_IN_KB = { 1035842 };
-    static const size_t TWO_GB_IN_KB = { ONE_GB_IN_KB * 2 };
+    unsigned long get_sme2_vector_length() const;
 
 private:
-    size_t              _total;
-    size_t              _free;
-    size_t              _buffer;
-    static MemoryPolicy _policy;
+    struct Impl;
+    std::unique_ptr<Impl> _impl;
 };
 
 /** Information about executing thread and CPU. */
 struct ThreadInfo
 {
-    int            thread_id{ 0 };
-    int            num_threads{ 1 };
-    const CPUInfo *cpu_info{ nullptr };
+    int            thread_id{0};
+    int            num_threads{1};
+    const CPUInfo *cpu_info{nullptr};
 };
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_CPP_TYPES_H */
+#endif // ACL_ARM_COMPUTE_CORE_CPP_CPPTYPES_H
diff --git a/arm_compute/core/CPP/ICPPKernel.h b/arm_compute/core/CPP/ICPPKernel.h
index ec05af20bd..03967a536d 100644
--- a/arm_compute/core/CPP/ICPPKernel.h
+++ b/arm_compute/core/CPP/ICPPKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,16 +25,21 @@
 #define ARM_COMPUTE_ICPPKERNEL_H
 
 #include "arm_compute/core/CPP/CPPTypes.h"
+#include "arm_compute/core/experimental/Types.h"
 #include "arm_compute/core/IKernel.h"
+#include "arm_compute/core/Types.h"
 
 namespace arm_compute
 {
 class Window;
+class ITensor;
 
 /** Common interface for all kernels implemented in C++ */
 class ICPPKernel : public IKernel
 {
 public:
+    static constexpr size_t default_mws = 1; /* Default minimum workload size value  - no impact */
+
     /** Default destructor */
     virtual ~ICPPKernel() = default;
 
@@ -51,8 +56,7 @@ public:
      */
     virtual void run(const Window &window, const ThreadInfo &info)
     {
-        ARM_COMPUTE_UNUSED(window);
-        ARM_COMPUTE_UNUSED(info);
+        ARM_COMPUTE_UNUSED(window, info);
         ARM_COMPUTE_ERROR("default implementation of legacy run() virtual member function invoked");
     }
 
@@ -69,6 +73,37 @@ public:
         run(window, info);
     }
 
+    /** Execute the kernel on the passed window
+     *
+     * @warning If is_parallelisable() returns false then the passed window must be equal to window()
+     *
+     * @note The window has to be a region within the window returned by the window() method
+     *
+     * @note The width of the window has to be a multiple of num_elems_processed_per_iteration().
+     *
+     * @param[in] tensors A pack containing the tensors to operate on.
+     * @param[in] window  Region on which to execute the kernel. (Must be a region of the window returned by window())
+     * @param[in] info    Info about executing thread and CPU.
+     */
+    virtual void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
+    {
+        ARM_COMPUTE_UNUSED(tensors, window, info);
+    }
+
+    /** Return minimum workload size of the relevant kernel
+     *
+     * @param[in] platform     The CPU platform used to create the context.
+     * @param[in] thread_count Number of threads in the execution.
+     *
+     * @return Minimum workload size for requested configuration.
+     */
+    virtual size_t get_mws(const CPUInfo &platform, size_t thread_count) const
+    {
+        ARM_COMPUTE_UNUSED(platform, thread_count);
+
+        return default_mws;
+    }
+
     /** Name of the kernel
      *
      * @return Kernel name
diff --git a/arm_compute/core/CPP/ICPPSimpleKernel.h b/arm_compute/core/CPP/ICPPSimpleKernel.h
deleted file mode 100644
index acdd054c0e..0000000000
--- a/arm_compute/core/CPP/ICPPSimpleKernel.h
+++ /dev/null
@@ -1,76 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_ICPPSIMPLEKERNEL_H
-#define ARM_COMPUTE_ICPPSIMPLEKERNEL_H
-
-#include "arm_compute/core/CPP/ICPPKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for simple C++ kernels having 1 tensor input and 1 tensor output */
-class ICPPSimpleKernel : public ICPPKernel
-{
-public:
-    /** Constructor */
-    ICPPSimpleKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    ICPPSimpleKernel(const ICPPSimpleKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    ICPPSimpleKernel &operator=(const ICPPSimpleKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    ICPPSimpleKernel(ICPPSimpleKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    ICPPSimpleKernel &operator=(ICPPSimpleKernel &&) = default;
-    /** Default destructor */
-    ~ICPPSimpleKernel() = default;
-
-protected:
-    /** Configure the kernel
-     *
-     * @param[in]  input                             Source tensor.
-     * @param[out] output                            Destination tensor.
-     * @param[in]  num_elems_processed_per_iteration Number of processed elements per iteration.
-     * @param[in]  border_undefined                  (Optional) True if the border mode is undefined. False if it's replicate or constant.
-     * @param[in]  border_size                       (Optional) Size of the border.
-     */
-    void configure(const ITensor *input, ITensor *output, unsigned int num_elems_processed_per_iteration, bool border_undefined = false, const BorderSize &border_size = BorderSize());
-    /** Static function to check if given info will lead to a valid configuration of @ref ICPPSimpleKernel.
-     *
-     * @param[in] input                             Source tensor info.
-     * @param[in] output                            Destination tensor info.
-     * @param[in] num_elems_processed_per_iteration Number of processed elements per iteration.
-     * @param[in] border_undefined                  (Optional) True if the border mode is undefined. False if it's replicate or constant.
-     * @param[in] border_size                       (Optional) Size of the border.
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int num_elems_processed_per_iteration,
-                           bool border_undefined = false, const BorderSize &border_size = BorderSize());
-
-protected:
-    const ITensor *_input;
-    ITensor       *_output;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_ICPPSIMPLEKERNEL_H */
diff --git a/arm_compute/core/CPP/Validate.h b/arm_compute/core/CPP/Validate.h
deleted file mode 100644
index dfee9de86e..0000000000
--- a/arm_compute/core/CPP/Validate.h
+++ /dev/null
@@ -1,117 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPP_VALIDATE_H
-#define ARM_COMPUTE_CPP_VALIDATE_H
-
-#include "arm_compute/core/Validate.h"
-
-namespace arm_compute
-{
-/** Return an error if the data type of the passed tensor info is FP16 and FP16 support is not compiled in.
- *
- * @param[in] function    Function in which the error occurred.
- * @param[in] file        Name of the file where the error occurred.
- * @param[in] line        Line on which the error occurred.
- * @param[in] tensor_info Tensor info to validate.
- *
- * @return Status
- */
-inline Status error_on_unsupported_cpu_fp16(const char *function, const char *file, const int line,
-                                            const ITensorInfo *tensor_info)
-{
-    ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor_info == nullptr, function, file, line);
-#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-    ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(tensor_info->data_type() == DataType::F16,
-                                        function, file, line, "This CPU architecture does not support F16 data type, you need v8.2 or above");
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-    return Status {};
-}
-
-/** Return an error if the data type of the passed tensor info is BFLOAT16 and BFLOAT16 support is not compiled in.
- *
- * @param[in] function    Function in which the error occurred.
- * @param[in] file        Name of the file where the error occurred.
- * @param[in] line        Line on which the error occurred.
- * @param[in] tensor_info Tensor info to validate.
- *
- * @return Status
- */
-inline Status error_on_unsupported_cpu_bf16(const char *function, const char *file, const int line,
-                                            const ITensorInfo *tensor_info)
-{
-    ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor_info == nullptr, function, file, line);
-#if !(defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16))
-    ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(tensor_info->data_type() == DataType::BFLOAT16,
-                                        function, file, line, "This CPU architecture does not support BFloat16 data type, you need v8.6 or above");
-#endif /* !(defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16)) */
-    return Status {};
-}
-
-/** Return an error if the data type of the passed tensor is FP16 and FP16 support is not compiled in.
- *
- * @param[in] function Function in which the error occurred.
- * @param[in] file     Name of the file where the error occurred.
- * @param[in] line     Line on which the error occurred.
- * @param[in] tensor   Tensor to validate.
- *
- * @return Status
- */
-inline Status error_on_unsupported_cpu_fp16(const char *function, const char *file, const int line,
-                                            const ITensor *tensor)
-{
-    ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor == nullptr, function, file, line);
-    ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_unsupported_cpu_fp16(function, file, line, tensor->info()));
-    return Status{};
-}
-
-/** Return an error if the data type of the passed tensor is BFLOAT16 and BFLOAT16 support is not compiled in.
- *
- * @param[in] function Function in which the error occurred.
- * @param[in] file     Name of the file where the error occurred.
- * @param[in] line     Line on which the error occurred.
- * @param[in] tensor   Tensor to validate.
- *
- * @return Status
- */
-inline Status error_on_unsupported_cpu_bf16(const char *function, const char *file, const int line,
-                                            const ITensor *tensor)
-{
-    ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor == nullptr, function, file, line);
-    ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_unsupported_cpu_bf16(function, file, line, tensor->info()));
-    return Status{};
-}
-
-#define ARM_COMPUTE_ERROR_ON_CPU_F16_UNSUPPORTED(tensor) \
-    ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_unsupported_cpu_fp16(__func__, __FILE__, __LINE__, tensor))
-
-#define ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(tensor) \
-    ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_unsupported_cpu_fp16(__func__, __FILE__, __LINE__, tensor))
-
-#define ARM_COMPUTE_ERROR_ON_CPU_BF16_UNSUPPORTED(tensor) \
-    ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_unsupported_cpu_bf16(__func__, __FILE__, __LINE__, tensor))
-
-#define ARM_COMPUTE_RETURN_ERROR_ON_CPU_BF16_UNSUPPORTED(tensor) \
-    ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_unsupported_cpu_bf16(__func__, __FILE__, __LINE__, tensor))
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPP_VALIDATE_H */
diff --git a/arm_compute/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.h b/arm_compute/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.h
index 3fa83a6d6d..dd91595ea6 100644
--- a/arm_compute/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.h
+++ b/arm_compute/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,9 +24,7 @@
 #ifndef ARM_COMPUTE_CPPBOXWITHNONMAXIMASUPPRESSIONLIMITKERNEL_H
 #define ARM_COMPUTE_CPPBOXWITHNONMAXIMASUPPRESSIONLIMITKERNEL_H
 
-#include "arm_compute/core/IArray.h"
-#include "arm_compute/core/IHOG.h"
-#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/CPP/ICPPKernel.h"
 #include "arm_compute/core/Types.h"
 
 namespace arm_compute
@@ -62,11 +60,19 @@ public:
      * @param[out] classes          The classes output tensor of size [N]. Data types supported: Same as @p scores_in
      * @param[out] batch_splits_out (Optional) The batch splits output tensor [batch_size]. Data types supported: Same as @p scores_in
      * @param[out] keeps            (Optional) The keeps output tensor of size [N]. Data types supported: Same as@p scores_in
-     * @param[out] keeps_size       (Optional) Number of filtered indices per class tensor of size [num_classes]. Data types supported: Same as @p scores_in
+     * @param[out] keeps_size       (Optional) Number of filtered indices per class tensor of size [num_classes]. Data types supported: U32
      * @param[in]  info             (Optional) BoxNMSLimitInfo information.
      */
-    void configure(const ITensor *scores_in, const ITensor *boxes_in, const ITensor *batch_splits_in, ITensor *scores_out, ITensor *boxes_out, ITensor *classes,
-                   ITensor *batch_splits_out = nullptr, ITensor *keeps = nullptr, ITensor *keeps_size = nullptr, const BoxNMSLimitInfo info = BoxNMSLimitInfo());
+    void configure(const ITensor        *scores_in,
+                   const ITensor        *boxes_in,
+                   const ITensor        *batch_splits_in,
+                   ITensor              *scores_out,
+                   ITensor              *boxes_out,
+                   ITensor              *classes,
+                   ITensor              *batch_splits_out = nullptr,
+                   ITensor              *keeps            = nullptr,
+                   ITensor              *keeps_size       = nullptr,
+                   const BoxNMSLimitInfo info             = BoxNMSLimitInfo());
 
     // Inherited methods overridden:
     void run(const Window &window, const ThreadInfo &info) override;
@@ -76,9 +82,9 @@ public:
     void run_nmslimit();
 
 private:
-    const ITensor *_scores_in;
-    const ITensor *_boxes_in;
-    const ITensor *_batch_splits_in;
+    const ITensor  *_scores_in;
+    const ITensor  *_boxes_in;
+    const ITensor  *_batch_splits_in;
     ITensor        *_scores_out;
     ITensor        *_boxes_out;
     ITensor        *_classes;
diff --git a/arm_compute/core/CPP/kernels/CPPCornerCandidatesKernel.h b/arm_compute/core/CPP/kernels/CPPCornerCandidatesKernel.h
deleted file mode 100644
index eeb6a65525..0000000000
--- a/arm_compute/core/CPP/kernels/CPPCornerCandidatesKernel.h
+++ /dev/null
@@ -1,80 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPPCORNERCANDIDATESKERNEL_H
-#define ARM_COMPUTE_CPPCORNERCANDIDATESKERNEL_H
-
-#include "arm_compute/core/IArray.h"
-#include "arm_compute/core/NEON/INEKernel.h"
-
-#include "support/Mutex.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-class ITensor;
-/** Interface for CPP Images. */
-using IImage = ITensor;
-
-/** CPP kernel to perform corner candidates
- */
-class CPPCornerCandidatesKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "CPPCornerCandidatesKernel";
-    }
-    /** Default constructor */
-    CPPCornerCandidatesKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CPPCornerCandidatesKernel(const CPPCornerCandidatesKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CPPCornerCandidatesKernel &operator=(const CPPCornerCandidatesKernel &) = delete;
-    /** Prevent instances of this class from being moved (As this class contains non movable objects) */
-    CPPCornerCandidatesKernel(CPPCornerCandidatesKernel &&) = delete;
-    /** Prevent instances of this class from being moved (As this class contains non movable objects) */
-    CPPCornerCandidatesKernel &operator=(CPPCornerCandidatesKernel &&) = delete;
-    /** Default destructor */
-    ~CPPCornerCandidatesKernel() = default;
-
-    /** Setup the kernel parameters
-     *
-     * @param[in]  input                 Source image (harris score). Format supported F32
-     * @param[out] output                Destination array of InternalKeypoint
-     * @param[out] num_corner_candidates Number of corner candidates
-     */
-    void configure(const IImage *input, InternalKeypoint *output, int32_t *num_corner_candidates);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-
-private:
-    int32_t           *_num_corner_candidates;   /**< Number of corner candidates */
-    arm_compute::Mutex _corner_candidates_mutex; /**< Mutex to preventing race conditions */
-    const IImage      *_input;                   /**< Source image - Harris score */
-    InternalKeypoint *_output;                   /**< Array of NEInternalKeypoint */
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPPCORNERCANDIDATESKERNEL_H */
diff --git a/arm_compute/core/CPP/kernels/CPPDetectionWindowNonMaximaSuppressionKernel.h b/arm_compute/core/CPP/kernels/CPPDetectionWindowNonMaximaSuppressionKernel.h
deleted file mode 100644
index cf8e4f00b9..0000000000
--- a/arm_compute/core/CPP/kernels/CPPDetectionWindowNonMaximaSuppressionKernel.h
+++ /dev/null
@@ -1,75 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPPDETECTIONWINDOWNONMAXIMASUPPRESSIONKERNEL_H
-#define ARM_COMPUTE_CPPDETECTIONWINDOWNONMAXIMASUPPRESSIONKERNEL_H
-
-#include "arm_compute/core/IArray.h"
-#include "arm_compute/core/IHOG.h"
-#include "arm_compute/core/NEON/INEKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-/** CPP kernel to perform in-place computation of euclidean distance on IDetectionWindowArray
- *
- * @note This kernel is meant to be used alongside HOG or other object detection algorithms to perform a non-maxima suppression on a
- *       IDetectionWindowArray
- */
-class CPPDetectionWindowNonMaximaSuppressionKernel : public ICPPKernel
-{
-public:
-    const char *name() const override
-    {
-        return "CPPDetectionWindowNonMaximaSuppressionKernel";
-    }
-    /** Default constructor */
-    CPPDetectionWindowNonMaximaSuppressionKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CPPDetectionWindowNonMaximaSuppressionKernel(const CPPDetectionWindowNonMaximaSuppressionKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CPPDetectionWindowNonMaximaSuppressionKernel &operator=(const CPPDetectionWindowNonMaximaSuppressionKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CPPDetectionWindowNonMaximaSuppressionKernel(CPPDetectionWindowNonMaximaSuppressionKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CPPDetectionWindowNonMaximaSuppressionKernel &operator=(CPPDetectionWindowNonMaximaSuppressionKernel &&) = default;
-    /** Initialise the kernel's input, output and the euclidean minimum distance
-     *
-     * @attention: If @ref IDetectionWindowArray is passed to the kernel, the map() and unmap() methods @ref IDetectionWindowArray must be called respectively before and after
-     *             the run() method of @ref CPPDetectionWindowNonMaximaSuppressionKernel
-     *
-     * @param[in, out] input_output Input/Output array of @ref DetectionWindow
-     * @param[in]      min_distance Radial Euclidean distance for non-maxima suppression
-     */
-    void configure(IDetectionWindowArray *input_output, float min_distance);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-    bool is_parallelisable() const override;
-
-private:
-    IDetectionWindowArray *_input_output;
-    float                  _min_distance;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPPDETECTIONWINDOWNONMAXIMASUPPRESSIONKERNEL_H */
diff --git a/arm_compute/core/CPP/kernels/CPPNonMaximumSuppressionKernel.h b/arm_compute/core/CPP/kernels/CPPNonMaximumSuppressionKernel.h
index cb416af070..d1f7f8670f 100644
--- a/arm_compute/core/CPP/kernels/CPPNonMaximumSuppressionKernel.h
+++ b/arm_compute/core/CPP/kernels/CPPNonMaximumSuppressionKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,9 +24,8 @@
 #ifndef ARM_COMPUTE_CPP_NONMAXIMUMSUPPRESSIONKERNEL_LAYER_H
 #define ARM_COMPUTE_CPP_NONMAXIMUMSUPPRESSIONKERNEL_LAYER_H
 
-#include "arm_compute/runtime/CPP/ICPPSimpleFunction.h"
-
 #include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CPP/ICPPSimpleFunction.h"
 
 namespace arm_compute
 {
@@ -65,7 +64,12 @@ public:
      * @param[in]  iou_threshold   The threshold used in non maximum suppression.
      *
      */
-    void configure(const ITensor *input_bboxes, const ITensor *input_scores, ITensor *output_indices, unsigned int max_output_size, const float score_threshold, const float iou_threshold);
+    void configure(const ITensor *input_bboxes,
+                   const ITensor *input_scores,
+                   ITensor       *output_indices,
+                   unsigned int   max_output_size,
+                   const float    score_threshold,
+                   const float    iou_threshold);
 
     /** Static function to check if given arguments will lead to a valid configuration of @ref CPPNonMaximumSuppressionKernel
      *
@@ -77,8 +81,12 @@ public:
      * @param[in]  iou_threshold   The threshold used in non maximum suppression.
      *
      */
-    static Status validate(const ITensorInfo *input_bboxes, const ITensorInfo *input_scores, const ITensorInfo *output_indices, unsigned int max_output_size,
-                           const float score_threshold, const float iou_threshold);
+    static Status validate(const ITensorInfo *input_bboxes,
+                           const ITensorInfo *input_scores,
+                           const ITensorInfo *output_indices,
+                           unsigned int       max_output_size,
+                           const float        score_threshold,
+                           const float        iou_threshold);
 
     // Inherited methods overridden:
     void run(const Window &window, const ThreadInfo &info) override;
diff --git a/arm_compute/core/CPP/kernels/CPPPermuteKernel.h b/arm_compute/core/CPP/kernels/CPPPermuteKernel.h
index e75152f4ea..d141c2fb70 100644
--- a/arm_compute/core/CPP/kernels/CPPPermuteKernel.h
+++ b/arm_compute/core/CPP/kernels/CPPPermuteKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -56,15 +56,15 @@ public:
 
     /** Set the input and output of the kernel.
      *
-     * @param[in]  input  The input tensor to permute. Data types supported: U8/S8/QASYMM8/QASYMM8_SIGNED/U16/S16/F16/U32/S32/F32
-     * @param[out] output The output tensor. Data types supported: Same as @p input
+     * @param[in]  input  The input tensor to permute. Data types supported: All.
+     * @param[out] output The output tensor. Data types supported: same as @p input
      * @param[in]  perm   Permutation vector
      */
     void configure(const ITensor *input, ITensor *output, const PermutationVector &perm);
     /** Static function to check if given info will lead to a valid configuration of @ref CPPPermuteKernel
      *
-     * @param[in] input  The input tensor to permute. Data types supported: U8/S8/QASYMM8/QASYMM8_SIGNED/U16/S16/F16/U32/S32/F32
-     * @param[in] output The output tensor. Data types supported: Same as @p input
+     * @param[in] input  The input tensor to permute. Data types supported: All.
+     * @param[in] output The output tensor. Data types supported: same as @p input
      * @param[in] perm   Permutation vector
      *
      * @return a status
diff --git a/arm_compute/core/CPP/kernels/CPPSortEuclideanDistanceKernel.h b/arm_compute/core/CPP/kernels/CPPSortEuclideanDistanceKernel.h
deleted file mode 100644
index d127ef8d8a..0000000000
--- a/arm_compute/core/CPP/kernels/CPPSortEuclideanDistanceKernel.h
+++ /dev/null
@@ -1,73 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPPSORTEUCLIDEANDISTANCEKERNEL_H
-#define ARM_COMPUTE_CPPSORTEUCLIDEANDISTANCEKERNEL_H
-
-#include "arm_compute/core/CPP/ICPPKernel.h"
-#include "arm_compute/core/IArray.h"
-
-#include <cstdint>
-#include <mutex>
-
-namespace arm_compute
-{
-/** CPP kernel to perform sorting and euclidean distance */
-class CPPSortEuclideanDistanceKernel : public ICPPKernel
-{
-public:
-    const char *name() const override
-    {
-        return "CPPSortEuclideanDistanceKernel";
-    }
-    /** Default constructor */
-    CPPSortEuclideanDistanceKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CPPSortEuclideanDistanceKernel(const CPPSortEuclideanDistanceKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CPPSortEuclideanDistanceKernel &operator=(const CPPSortEuclideanDistanceKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CPPSortEuclideanDistanceKernel(CPPSortEuclideanDistanceKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CPPSortEuclideanDistanceKernel &operator=(CPPSortEuclideanDistanceKernel &&) = default;
-    /** Initialise the kernel's source, destination and border mode.
-     *
-     * @param[in,out] in_out                Input internal keypoints. Marked as out as the kernel writes 0 in the strength member.
-     * @param[out]    output                Output keypoints.
-     * @param[in]     num_corner_candidates Pointer to the number of corner candidates in the input array
-     * @param[in]     min_distance          Radial Euclidean distance to use
-     */
-    void configure(InternalKeypoint *in_out, IKeyPointArray *output, const int32_t *num_corner_candidates, float min_distance);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-    bool is_parallelisable() const override;
-
-private:
-    const int32_t    *_num_corner_candidates; /**< Number of corner candidates */
-    float             _min_distance;          /**< Radial Euclidean distance */
-    InternalKeypoint *_in_out;                /**< Source array of InternalKeypoint */
-    IKeyPointArray   *_output;                /**< Destination array of IKeyPointArray */
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPPSORTEUCLIDEANDISTANCEKERNEL_H */
diff --git a/arm_compute/core/CPP/kernels/CPPTopKVKernel.h b/arm_compute/core/CPP/kernels/CPPTopKVKernel.h
index 4b9bfdd3c9..7326a10e2f 100644
--- a/arm_compute/core/CPP/kernels/CPPTopKVKernel.h
+++ b/arm_compute/core/CPP/kernels/CPPTopKVKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -54,7 +54,7 @@ public:
     /** Set the input and output of the kernel.
      *
      * @param[in]  predictions A batch_size x classes tensor. Data types supported: F16/S32/F32/QASYMM8/QASYMM8_SIGNED
-     * @param[in]  targets     A batch_size 1D tensor of class ids. Data types supported: S32
+     * @param[in]  targets     A batch_size 1D tensor of class ids. Data types supported: U32
      * @param[out] output      Computed precision at @p k as a bool 1D tensor. Data types supported: U8
      * @param[in]  k           Number of top elements to look at for computing precision.
      */
@@ -63,13 +63,14 @@ public:
     /** Static function to check if given info will lead to a valid configuration of @ref CPPTopKVKernel
      *
      * @param[in] predictions A batch_size x classes tensor info. Data types supported: F16/S32/F32/QASYMM8/QASYMM8_SIGNED
-     * @param[in] targets     A batch_size 1D tensor info of class ids. Data types supported: S32
+     * @param[in] targets     A batch_size 1D tensor info of class ids. Data types supported: U32
      * @param[in] output      Computed precision at @p k as a bool 1D tensor info. Data types supported: U8
      * @param[in] k           Number of top elements to look at for computing precision.
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *predictions, const ITensorInfo *targets, ITensorInfo *output, const unsigned int k);
+    static Status
+    validate(const ITensorInfo *predictions, const ITensorInfo *targets, ITensorInfo *output, const unsigned int k);
 
     // Inherited methods overridden:
     void run(const Window &window, const ThreadInfo &info) override;
diff --git a/arm_compute/core/CPP/kernels/CPPUpsampleKernel.h b/arm_compute/core/CPP/kernels/CPPUpsampleKernel.h
index 9fbc9b697c..dd7e07c390 100644
--- a/arm_compute/core/CPP/kernels/CPPUpsampleKernel.h
+++ b/arm_compute/core/CPP/kernels/CPPUpsampleKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -55,8 +55,8 @@ public:
 
     /** Set the input and output of the kernel.
      *
-     * @param[in]  input  The input tensor to upsample. Data types supported: F32/F16/QASYMM8/QASYMM8_SIGNED
-     * @param[out] output The output tensor. Data types supported: Same as @p input
+     * @param[in]  input  The input tensor to upsample. Data types supported: All.
+     * @param[out] output The output tensor. Data types supported: same as @p input.
      * @param[in]  info   Padding info.
      */
     void configure(const ITensor *input, ITensor *output, const PadStrideInfo &info);
diff --git a/arm_compute/core/Coordinates.h b/arm_compute/core/Coordinates.h
index 78ca5250ab..d1240bb10a 100644
--- a/arm_compute/core/Coordinates.h
+++ b/arm_compute/core/Coordinates.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -42,8 +42,7 @@ public:
      * @param[in] coords Values to initialize the dimensions.
      */
     template <typename... Ts>
-    constexpr Coordinates(Ts... coords)
-        : Dimensions{ coords... }
+    constexpr Coordinates(Ts... coords) : Dimensions{coords...}
     {
     }
     /** Allow instances of this class to be copy constructed */
@@ -57,5 +56,5 @@ public:
     /** Default destructor */
     ~Coordinates() = default;
 };
-}
+} // namespace arm_compute
 #endif /*ARM_COMPUTE_COORDINATES_H*/
diff --git a/arm_compute/core/CoreTypes.h b/arm_compute/core/CoreTypes.h
new file mode 100644
index 0000000000..1a9db1937c
--- /dev/null
+++ b/arm_compute/core/CoreTypes.h
@@ -0,0 +1,352 @@
+/*
+ * Copyright (c) 2016-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_ARM_COMPUTE_CORE_CORETYPES
+#define ACL_ARM_COMPUTE_CORE_CORETYPES
+
+#include "arm_compute/core/Strides.h"
+
+#include "support/Half.h"
+
+/** CoreTypes.h groups together essential small types that are used across functions */
+
+namespace arm_compute
+{
+/** 16-bit floating point type */
+using half = half_float::half;
+/** Permutation vector */
+using PermutationVector = Strides;
+
+/** Available channels */
+enum class Channel
+{
+    UNKNOWN, /**< Unknown channel format */
+    C0,      /**< First channel (used by formats with unknown channel types). */
+    C1,      /**< Second channel (used by formats with unknown channel types). */
+    C2,      /**< Third channel (used by formats with unknown channel types). */
+    C3,      /**< Fourth channel (used by formats with unknown channel types). */
+    R,       /**< Red channel. */
+    G,       /**< Green channel. */
+    B,       /**< Blue channel. */
+    A,       /**< Alpha channel. */
+    Y,       /**< Luma channel. */
+    U,       /**< Cb/U channel. */
+    V        /**< Cr/V/Value channel. */
+};
+
+/** Image colour formats */
+enum class Format
+{
+    UNKNOWN,  /**< Unknown image format */
+    U8,       /**< 1 channel, 1 U8 per channel */
+    S16,      /**< 1 channel, 1 S16 per channel */
+    U16,      /**< 1 channel, 1 U16 per channel */
+    S32,      /**< 1 channel, 1 S32 per channel */
+    U32,      /**< 1 channel, 1 U32 per channel */
+    S64,      /**< 1 channel, 1 S64 per channel */
+    U64,      /**< 1 channel, 1 U64 per channel */
+    BFLOAT16, /**< 16-bit brain floating-point number */
+    F16,      /**< 1 channel, 1 F16 per channel */
+    F32,      /**< 1 channel, 1 F32 per channel */
+    UV88,     /**< 2 channels, 1 U8 per channel */
+    RGB888,   /**< 3 channels, 1 U8 per channel */
+    RGBA8888, /**< 4 channels, 1 U8 per channel */
+    YUV444,   /**< A 3 plane of 8 bit 4:4:4 sampled Y, U, V planes */
+    YUYV422,  /**< A single plane of 32-bit macro pixel of Y0, U0, Y1, V0 bytes */
+    NV12,     /**< A 2 plane YUV format of Luma (Y) and interleaved UV data at 4:2:0 sampling */
+    NV21,     /**< A 2 plane YUV format of Luma (Y) and interleaved VU data at 4:2:0 sampling */
+    IYUV,     /**< A 3 plane of 8-bit 4:2:0 sampled Y, U, V planes */
+    UYVY422   /**< A single plane of 32-bit macro pixel of U0, Y0, V0, Y1 bytes */
+};
+
+/** Available data types */
+enum class DataType
+{
+    UNKNOWN,            /**< Unknown data type */
+    U8,                 /**< unsigned 8-bit number */
+    S8,                 /**< signed 8-bit number */
+    QSYMM8,             /**< quantized, symmetric fixed-point 8-bit number */
+    QASYMM8,            /**< quantized, asymmetric fixed-point 8-bit number unsigned */
+    QASYMM8_SIGNED,     /**< quantized, asymmetric fixed-point 8-bit number signed */
+    QSYMM8_PER_CHANNEL, /**< quantized, symmetric per channel fixed-point 8-bit number */
+    U16,                /**< unsigned 16-bit number */
+    S16,                /**< signed 16-bit number */
+    QSYMM16,            /**< quantized, symmetric fixed-point 16-bit number */
+    QASYMM16,           /**< quantized, asymmetric fixed-point 16-bit number */
+    U32,                /**< unsigned 32-bit number */
+    S32,                /**< signed 32-bit number */
+    U64,                /**< unsigned 64-bit number */
+    S64,                /**< signed 64-bit number */
+    BFLOAT16,           /**< 16-bit brain floating-point number */
+    F16,                /**< 16-bit floating-point number */
+    F32,                /**< 32-bit floating-point number */
+    F64,                /**< 64-bit floating-point number */
+    SIZET               /**< size_t */
+};
+
+/** [DataLayout enum definition] **/
+
+/** Supported tensor data layouts */
+enum class DataLayout
+{
+    UNKNOWN, /**< Unknown data layout */
+    NCHW,    /**< Num samples, channels, height, width */
+    NHWC,    /**< Num samples, height, width, channels */
+    NCDHW,   /**< Num samples, channels, depth, height, width */
+    NDHWC    /**< Num samples, depth, height, width, channels */
+};
+/** [DataLayout enum definition] **/
+
+/** Supported tensor data layout dimensions */
+enum class DataLayoutDimension
+{
+    CHANNEL, /**< channel */
+    HEIGHT,  /**< height */
+    WIDTH,   /**< width */
+    DEPTH,   /**< depth */
+    BATCHES  /**< batches */
+};
+
+/** Dimension rounding type when down-scaling on CNNs
+ * @note Used in pooling and convolution layer
+ */
+enum class DimensionRoundingType
+{
+    FLOOR, /**< Floor rounding */
+    CEIL   /**< Ceil rounding */
+};
+
+class PadStrideInfo
+{
+public:
+    /** Constructor
+     *
+     * @param[in] stride_x (Optional) Stride, in elements, across x. Defaults to 1.
+     * @param[in] stride_y (Optional) Stride, in elements, across y. Defaults to 1.
+     * @param[in] pad_x    (Optional) Padding, in elements, across x. Defaults to 0.
+     * @param[in] pad_y    (Optional) Padding, in elements, across y. Defaults to 0.
+     * @param[in] round    (Optional) Dimensions rounding. Defaults to @ref DimensionRoundingType::FLOOR.
+     */
+    PadStrideInfo(unsigned int          stride_x = 1,
+                  unsigned int          stride_y = 1,
+                  unsigned int          pad_x    = 0,
+                  unsigned int          pad_y    = 0,
+                  DimensionRoundingType round    = DimensionRoundingType::FLOOR)
+        : _stride(std::make_pair(stride_x, stride_y)),
+          _pad_left(pad_x),
+          _pad_top(pad_y),
+          _pad_right(pad_x),
+          _pad_bottom(pad_y),
+          _round_type(round)
+    {
+    }
+    /** Constructor
+     *
+     * @param[in] stride_x   Stride, in elements, across x.
+     * @param[in] stride_y   Stride, in elements, across y.
+     * @param[in] pad_left   Padding across x on the left, in elements.
+     * @param[in] pad_right  Padding across x on the right, in elements.
+     * @param[in] pad_top    Padding across y on the top, in elements.
+     * @param[in] pad_bottom Padding across y on the bottom, in elements.
+     * @param[in] round      Dimensions rounding.
+     */
+    PadStrideInfo(unsigned int          stride_x,
+                  unsigned int          stride_y,
+                  unsigned int          pad_left,
+                  unsigned int          pad_right,
+                  unsigned int          pad_top,
+                  unsigned int          pad_bottom,
+                  DimensionRoundingType round)
+        : _stride(std::make_pair(stride_x, stride_y)),
+          _pad_left(pad_left),
+          _pad_top(pad_top),
+          _pad_right(pad_right),
+          _pad_bottom(pad_bottom),
+          _round_type(round)
+    {
+    }
+    /** Get the stride.
+     *
+     * @return a pair: stride x, stride y.
+     */
+    std::pair<unsigned int, unsigned int> stride() const
+    {
+        return _stride;
+    }
+    /** Check whether the padding is symmetric.
+     *
+     * @return True if the padding is symmetric.
+     */
+    bool padding_is_symmetric() const
+    {
+        return (_pad_left == _pad_right) && (_pad_top == _pad_bottom);
+    }
+    /** Get the padding.
+     *
+     * @note This should only be used when the padding is symmetric.
+     *
+     * @return a pair: padding left/right, padding top/bottom
+     */
+    std::pair<unsigned int, unsigned int> pad() const
+    {
+        //this accessor should be used only when padding is symmetric
+        ARM_COMPUTE_ERROR_ON(!padding_is_symmetric());
+        return std::make_pair(_pad_left, _pad_top);
+    }
+
+    /** Get the left padding */
+    unsigned int pad_left() const
+    {
+        return _pad_left;
+    }
+    /** Get the right padding */
+    unsigned int pad_right() const
+    {
+        return _pad_right;
+    }
+    /** Get the top padding */
+    unsigned int pad_top() const
+    {
+        return _pad_top;
+    }
+    /** Get the bottom padding */
+    unsigned int pad_bottom() const
+    {
+        return _pad_bottom;
+    }
+
+    /** Get the rounding type */
+    DimensionRoundingType round() const
+    {
+        return _round_type;
+    }
+
+    /** Check whether this has any padding */
+    bool has_padding() const
+    {
+        return (_pad_left != 0 || _pad_top != 0 || _pad_right != 0 || _pad_bottom != 0);
+    }
+
+private:
+    std::pair<unsigned int, unsigned int> _stride;
+    unsigned int                          _pad_left;
+    unsigned int                          _pad_top;
+    unsigned int                          _pad_right;
+    unsigned int                          _pad_bottom;
+
+    DimensionRoundingType _round_type;
+};
+
+/** Memory layouts for the weights tensor.
+ *
+ * * UNSPECIFIED is used to select kernels that do not run in
+ *    variable weights mode.
+ *
+ * * ANY is used to query the kernel database to retrieve any of the
+ *   kernels that run in variable weights mode. Once a kernel is
+ *   found, the specific format expected by the kernel can be
+ *   retrieved by the user for reordering the weights tensor
+ *   accordingly.
+ *
+ * The other values OHWIo{interleave_by}i{block_by} describe the
+ * memory layout of a 4D tensor with layout OHWI that has been
+ * transformed into a 4D tensor with dimensions O'HWI' where:
+ *
+ * O' = first multiple of {interleave_by} s.t. O<=O'
+ * I' = first multiple of {block_by} s.t. I<=I'
+ *
+ * The total size of the dst tensor is O' x H x W x I'
+ *
+ * The access function of the tensor with layout
+ * OHWIo{interleave_by}i{block_by} and size O'HWI' is a 6-parameter
+ * access function, where the 6 parameters are computed as follows:
+ *
+ * x5 = floor(o/{interleave_by}) RANGE [0, O'/{interleave_by} -1] SIZE: O'/{interleave_by}
+ *
+ * x4 = h                        RANGE [0, H-1]                   SIZE: H
+ * x3 = w                        RANGE [0, W-1]                   SIZE: W
+ * x2 = floor(i/{block_by})      RANGE [0, I'/{block_by} -1]      SIZE: I'/{block_by}
+ * x1 = o%{interleave_by}        RANGE [0, {interleave_by} -1]    SIZE: {interleave_by}
+ * x0 = i%{block_by}             RANGE [0, {block_by} -1]         SIZE: {block_by}
+ *                                                          TOTAL SIZE: O' * H * W * I'
+ *
+ *        4D                       6D
+ * -----------------   -----------------------------------
+ * value(o, h, w, i) =   x5 * H * W * I' * {interleave_by}
+ *                     + x4 * W * I' * {interleave_by}
+ *                     + x3 * I' * {interleave_by}
+ *                     + x2 * {interleave_by} * {block_by}
+ *                     + x1 * {block_by}
+ *                     + x0
+ *
+ * Notice that in arm_gemm the 4D tensor of dimension O'HWI' created
+ * for the OHWIo{interleave_by}i{block_by} format is in reality seen
+ * as a 2D tensor, where the number of rows is O'/{interleave_by}
+ * and the number of columns is {interleave_by} * H * W * I'.
+ *
+ * The postfix *_bf16 is for the memory layout needed for the
+ * fast-mode kernels, in which the weights are passed in bfloat16
+ * format.
+ */
+enum class WeightFormat
+{
+    UNSPECIFIED    = 0x1,
+    ANY            = 0x2,
+    OHWI           = 0x100100,
+    OHWIo2         = 0x100200,
+    OHWIo4         = 0x100400,
+    OHWIo8         = 0x100800,
+    OHWIo16        = 0x101000,
+    OHWIo32        = 0x102000,
+    OHWIo64        = 0x104000,
+    OHWIo128       = 0x108000,
+    OHWIo4i2       = 0x200400,
+    OHWIo4i2_bf16  = 0x200410,
+    OHWIo8i2       = 0x200800,
+    OHWIo8i2_bf16  = 0x200810,
+    OHWIo16i2      = 0x201000,
+    OHWIo16i2_bf16 = 0x201010,
+    OHWIo32i2      = 0x202000,
+    OHWIo32i2_bf16 = 0x202010,
+    OHWIo64i2      = 0x204000,
+    OHWIo64i2_bf16 = 0x204010,
+    OHWIo4i4       = 0x400400,
+    OHWIo4i4_bf16  = 0x400410,
+    OHWIo8i4       = 0x400800,
+    OHWIo8i4_bf16  = 0x400810,
+    OHWIo16i4      = 0x401000,
+    OHWIo16i4_bf16 = 0x401010,
+    OHWIo32i4      = 0x402000,
+    OHWIo32i4_bf16 = 0x402010,
+    OHWIo64i4      = 0x404000,
+    OHWIo64i4_bf16 = 0x404010,
+    OHWIo2i8       = 0x800200,
+    OHWIo4i8       = 0x800400,
+    OHWIo8i8       = 0x800800,
+    OHWIo16i8      = 0x801000,
+    OHWIo32i8      = 0x802000,
+    OHWIo64i8      = 0x804000
+};
+
+} // namespace arm_compute
+#endif /* ACL_ARM_COMPUTE_CORE_CORETYPES */
diff --git a/arm_compute/core/Dimensions.h b/arm_compute/core/Dimensions.h
index fbaef3a8f0..bb8692d70a 100644
--- a/arm_compute/core/Dimensions.h
+++ b/arm_compute/core/Dimensions.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -29,6 +29,7 @@
 #include <algorithm>
 #include <array>
 #include <functional>
+#include <limits>
 #include <numeric>
 
 namespace arm_compute
@@ -49,8 +50,7 @@ public:
      * @param[in] dims Values to initialize the dimensions.
      */
     template <typename... Ts>
-    explicit Dimensions(Ts... dims)
-        : _id{ { static_cast<T>(dims)... } }, _num_dimensions{ sizeof...(dims) }
+    explicit Dimensions(Ts... dims) : _id{{static_cast<T>(dims)...}}, _num_dimensions{sizeof...(dims)}
     {
     }
 
@@ -68,14 +68,19 @@ public:
 
     /** Accessor to set the value of one of the dimensions.
      *
-     * @param[in] dimension Dimension for which the value is set.
-     * @param[in] value     Value to be set for the dimension.
+     * @param[in] dimension         Dimension for which the value is set.
+     * @param[in] value             Value to be set for the dimension.
+     * @param[in] increase_dim_unit (Optional) Set to true if new unit dimensions increase the number of dimensions (e.g. for Coordinates), false otherwise (e.g. for TensorShapes)
      */
-    void set(size_t dimension, T value)
+    void set(size_t dimension, T value, bool increase_dim_unit = true)
     {
         ARM_COMPUTE_ERROR_ON(dimension >= num_max_dimensions);
-        _id[dimension]  = value;
-        _num_dimensions = std::max(_num_dimensions, dimension + 1);
+        _id[dimension] = value;
+        // Only skip growing the number of dimensions when increase_dim_unit is false and the new value is 1
+        if (increase_dim_unit || value != 1)
+        {
+            _num_dimensions = std::max(_num_dimensions, dimension + 1);
+        }
     }
     /** Alias to access the size of the first dimension */
     T x() const
@@ -92,6 +97,21 @@ public:
     {
         return _id[2];
     }
+    /** Increments the given dimension by a step size; the value is left unchanged if the increment would overflow
+     *
+     * @note Precondition: dim < _num_dimensions
+     *
+     * @param[in] dim  Dimension to increment.
+     * @param[in] step Step to increment @p dim by.
+     */
+    void increment(size_t dim, T step = 1)
+    {
+        ARM_COMPUTE_ERROR_ON(dim >= _num_dimensions);
+        if ((std::numeric_limits<T>::max() - _id[dim]) >= step)
+        {
+            _id[dim] += step;
+        }
+    }
     /** Generic accessor to get the size of any dimension
      *
      * @note Precondition: dimension < Dimensions::num_max_dimensions
@@ -141,7 +161,7 @@ public:
 
         const size_t last = std::min(_num_dimensions, first + n);
 
-        if(last > (first + 1))
+        if (last > (first + 1))
         {
             // Collapse dimensions into the first
             _id[first] = std::accumulate(&_id[first], &_id[last], 1, std::multiplies<T>());
@@ -175,7 +195,7 @@ public:
     void remove(size_t idx)
     {
         ARM_COMPUTE_ERROR_ON(_num_dimensions < 1);
-        if(idx >= _num_dimensions)
+        if (idx >= _num_dimensions)
         {
             return;
         }
@@ -241,7 +261,7 @@ protected:
     ~Dimensions() = default;
 
     std::array<T, num_max_dimensions> _id;
-    size_t _num_dimensions{ 0 };
+    size_t                            _num_dimensions{0};
 };
 
 /** Check that given dimensions are equal.
@@ -268,5 +288,5 @@ inline bool operator!=(const Dimensions<T> &lhs, const Dimensions<T> &rhs)
 {
     return !(lhs == rhs);
 }
-}
+} // namespace arm_compute
 #endif /*ARM_COMPUTE_DIMENSIONS_H*/
diff --git a/arm_compute/core/Error.h b/arm_compute/core/Error.h
index dd3e8889bc..7a7033805a 100644
--- a/arm_compute/core/Error.h
+++ b/arm_compute/core/Error.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019, 2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -53,8 +53,7 @@ class Status
 {
 public:
     /** Default Constructor **/
-    Status()
-        : _code(ErrorCode::OK), _error_description(" ")
+    Status() : _code(ErrorCode::OK), _error_description(" ")
     {
     }
     /** Default Constructor
@@ -101,7 +100,7 @@ public:
     /** Throws a runtime exception in case it contains a valid error status */
     void throw_if_error() const
     {
-        if(!bool(*this))
+        if (!bool(*this))
         {
             internal_throw_on_error();
         }
@@ -119,7 +118,7 @@ private:
 /** Creates an error containing the error message
  *
  * @param[in] error_code Error code
- * @param[in] msg        Message to display before aborting.
+ * @param[in] msg        Message to display before abandoning.
  *
  * @return status containing the error
  */
@@ -131,7 +130,7 @@ Status create_error(ErrorCode error_code, std::string msg);
  * @param[in] func       Function in which the error occurred.
  * @param[in] file       File in which the error occurred.
  * @param[in] line       Line in which the error occurred.
- * @param[in] msg        Message to display before aborting.
+ * @param[in] msg        Message to display before abandoning.
  *
  * @return status containing the error
  */
@@ -141,7 +140,7 @@ Status create_error_msg(ErrorCode error_code, const char *func, const char *file
  * @param[in] err Error status
  */
 [[noreturn]] void throw_error(Status err);
-}
+} // namespace arm_compute
 /** To avoid unused variables warnings
  *
  * This is useful if for example a variable is only used
@@ -156,7 +155,8 @@ Status create_error_msg(ErrorCode error_code, const char *func, const char *file
  * @param[in] error_code Error code.
  * @param[in] msg        Message to encapsulate.
  */
-#define ARM_COMPUTE_CREATE_ERROR(error_code, msg) arm_compute::create_error_msg(error_code, __func__, __FILE__, __LINE__, msg)
+#define ARM_COMPUTE_CREATE_ERROR(error_code, msg) \
+    arm_compute::create_error_msg(error_code, __func__, __FILE__, __LINE__, msg)
 
 /** Creates an error on location with a given message
  *
@@ -164,9 +164,10 @@ Status create_error_msg(ErrorCode error_code, const char *func, const char *file
  * @param[in] func       Function in which the error occurred.
  * @param[in] file       File in which the error occurred.
  * @param[in] line       Line in which the error occurred.
- * @param[in] msg        Message to display before aborting.
+ * @param[in] msg        Message to display before abandoning.
  */
-#define ARM_COMPUTE_CREATE_ERROR_LOC(error_code, func, file, line, msg) arm_compute::create_error_msg(error_code, func, file, line, msg)
+#define ARM_COMPUTE_CREATE_ERROR_LOC(error_code, func, file, line, msg) \
+    arm_compute::create_error_msg(error_code, func, file, line, msg)
 
 /** Creates an error on location with a given message. Accepts a message format
  *  and a variable list of arguments matching the format description.
@@ -178,14 +179,14 @@ Status create_error_msg(ErrorCode error_code, const char *func, const char *file
  * @param[in] msg        Error description message format.
  * @param[in] ...        List of arguments matching the format description.
  */
-#define ARM_COMPUTE_CREATE_ERROR_LOC_VAR(error_code, func, file, line, msg, ...)          \
-    do                                                                                    \
-    {                                                                                     \
-        std::array<char, 512> out{ 0 };                                                   \
-        int offset = snprintf(out.data(), out.size(), "in %s %s:%d: ", func, file, line); \
-        snprintf(out.data() + offset, out.size() - offset, msg, __VA_ARGS__);             \
-        arm_compute::create_error(error_code, std::string(out.data()));                   \
-    } while(false)
+#define ARM_COMPUTE_CREATE_ERROR_LOC_VAR(error_code, func, file, line, msg, ...)                            \
+    do                                                                                                      \
+    {                                                                                                       \
+        std::array<char, 512> out{0};                                                                       \
+        int                   offset = snprintf(out.data(), out.size(), "in %s %s:%d: ", func, file, line); \
+        snprintf(out.data() + offset, out.size() - offset, msg, __VA_ARGS__);                               \
+        arm_compute::create_error(error_code, std::string(out.data()));                                     \
+    } while (false)
 
 /** An error is returned with the given description.
  *
@@ -195,7 +196,7 @@ Status create_error_msg(ErrorCode error_code, const char *func, const char *file
     do                                                                                       \
     {                                                                                        \
         return ARM_COMPUTE_CREATE_ERROR(arm_compute::ErrorCode::RUNTIME_ERROR, __VA_ARGS__); \
-    } while(false)
+    } while (false)
 
 /** Checks if a status contains an error and returns it
  *
@@ -204,18 +205,18 @@ Status create_error_msg(ErrorCode error_code, const char *func, const char *file
 #define ARM_COMPUTE_RETURN_ON_ERROR(status) \
     do                                      \
     {                                       \
-        if(!bool(status))                   \
+        const auto s = status;              \
+        if (!bool(s))                       \
         {                                   \
-            return status;                  \
+            return s;                       \
         }                                   \
-    } while(false)
+    } while (false)
 
 /** Checks if an error value is valid if not throws an exception with the error
  *
  * @param[in] error Error value to check.
  */
-#define ARM_COMPUTE_THROW_ON_ERROR(error) \
-    error.throw_if_error();
+#define ARM_COMPUTE_THROW_ON_ERROR(error) error.throw_if_error();
 
 /** If the condition is true, an error is returned. Accepts a message format
  *  and a variable list of arguments matching the format description.
@@ -227,28 +228,29 @@ Status create_error_msg(ErrorCode error_code, const char *func, const char *file
 #define ARM_COMPUTE_RETURN_ERROR_ON_MSG_VAR(cond, msg, ...)                                                   \
     do                                                                                                        \
     {                                                                                                         \
-        if(cond)                                                                                              \
+        if (cond)                                                                                             \
         {                                                                                                     \
-            std::array<char, 512> out{ 0 };                                                                   \
+            std::array<char, 512> out{0};                                                                     \
             int offset = snprintf(out.data(), out.size(), "in %s %s:%d: ", __func__, __FILE__, __LINE__);     \
             snprintf(out.data() + offset, out.size() - offset, msg, __VA_ARGS__);                             \
             return arm_compute::create_error(arm_compute::ErrorCode::RUNTIME_ERROR, std::string(out.data())); \
         }                                                                                                     \
-    } while(false)
+    } while (false)
 
 /** If the condition is true, an error is returned
  *
  * @param[in] cond Condition to evaluate.
  * @param[in] msg  Error description message
  */
-#define ARM_COMPUTE_RETURN_ERROR_ON_MSG(cond, msg)                                                                          \
-    do                                                                                                                      \
-    {                                                                                                                       \
-        if(cond)                                                                                                            \
-        {                                                                                                                   \
-            return arm_compute::create_error_msg(arm_compute::ErrorCode::RUNTIME_ERROR, __func__, __FILE__, __LINE__, msg); \
-        }                                                                                                                   \
-    } while(false)
+#define ARM_COMPUTE_RETURN_ERROR_ON_MSG(cond, msg)                                                                    \
+    do                                                                                                                \
+    {                                                                                                                 \
+        if (cond)                                                                                                     \
+        {                                                                                                             \
+            return arm_compute::create_error_msg(arm_compute::ErrorCode::RUNTIME_ERROR, __func__, __FILE__, __LINE__, \
+                                                 msg);                                                                \
+        }                                                                                                             \
+    } while (false)
 
 /** If the condition is true, an error is thrown. Accepts a message format
  *  and a variable list of arguments matching the format description.
@@ -260,17 +262,17 @@ Status create_error_msg(ErrorCode error_code, const char *func, const char *file
  * @param[in] msg  Error description message format.
  * @param[in] ...  List of arguments matching the format description.
  */
-#define ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG_VAR(cond, func, file, line, msg, ...)                \
-    do                                                                                           \
-    {                                                                                            \
-        if(cond)                                                                                 \
-        {                                                                                        \
-            std::array<char, 512> out{ 0 };                                                      \
-            int offset = snprintf(out.data(), out.size(), "in %s %s:%d: ", func, file, line);    \
-            snprintf(out.data() + offset, out.size() - offset, msg, __VA_ARGS__);                \
-            return arm_compute::create_error(ErrorCode::RUNTIME_ERROR, std::string(out.data())); \
-        }                                                                                        \
-    } while(false)
+#define ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG_VAR(cond, func, file, line, msg, ...)                               \
+    do                                                                                                          \
+    {                                                                                                           \
+        if (cond)                                                                                               \
+        {                                                                                                       \
+            std::array<char, 512> out{0};                                                                       \
+            int                   offset = snprintf(out.data(), out.size(), "in %s %s:%d: ", func, file, line); \
+            snprintf(out.data() + offset, out.size() - offset, msg, __VA_ARGS__);                               \
+            return arm_compute::create_error(ErrorCode::RUNTIME_ERROR, std::string(out.data()));                \
+        }                                                                                                       \
+    } while (false)
 
 /** If the condition is true, an error is thrown.
  *
@@ -283,18 +285,17 @@ Status create_error_msg(ErrorCode error_code, const char *func, const char *file
 #define ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(cond, func, file, line, msg)                           \
     do                                                                                             \
     {                                                                                              \
-        if(cond)                                                                                   \
+        if (cond)                                                                                  \
         {                                                                                          \
             return arm_compute::create_error_msg(ErrorCode::RUNTIME_ERROR, func, file, line, msg); \
         }                                                                                          \
-    } while(false)
+    } while (false)
 
 /** If the condition is true, an error is returned
  *
  * @param[in] cond Condition to evaluate
  */
-#define ARM_COMPUTE_RETURN_ERROR_ON(cond) \
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(cond, #cond)
+#define ARM_COMPUTE_RETURN_ERROR_ON(cond) ARM_COMPUTE_RETURN_ERROR_ON_MSG(cond, #cond)
 
 /** If the condition is true, an error is returned
  *
@@ -313,11 +314,12 @@ Status create_error_msg(ErrorCode error_code, const char *func, const char *file
  * @param[in] line Line in which the error occurred.
  * @param[in] msg  Message to display.
  */
-#define ARM_COMPUTE_THROW_ERROR(func, file, line, msg)                                                                         \
-    do                                                                                                                         \
-    {                                                                                                                          \
-        arm_compute::throw_error(arm_compute::create_error_msg(arm_compute::ErrorCode::RUNTIME_ERROR, func, file, line, msg)); \
-    } while(false)
+#define ARM_COMPUTE_THROW_ERROR(func, file, line, msg)                                                    \
+    do                                                                                                    \
+    {                                                                                                     \
+        arm_compute::throw_error(                                                                         \
+            arm_compute::create_error_msg(arm_compute::ErrorCode::RUNTIME_ERROR, func, file, line, msg)); \
+    } while (false)
 
 /** Print the given message then throw an std::runtime_error. Accepts a message format
  *  and a variable list of arguments matching the format description.
@@ -331,11 +333,11 @@ Status create_error_msg(ErrorCode error_code, const char *func, const char *file
 #define ARM_COMPUTE_THROW_ERROR_VAR(func, file, line, msg, ...)                                                        \
     do                                                                                                                 \
     {                                                                                                                  \
-        std::array<char, 512> out{ 0 };                                                                                \
-        int offset = snprintf(out.data(), out.size(), "in %s %s:%d: ", func, file, line);                              \
+        std::array<char, 512> out{0};                                                                                  \
+        int                   offset = snprintf(out.data(), out.size(), "in %s %s:%d: ", func, file, line);            \
         snprintf(out.data() + offset, out.size() - offset, msg, __VA_ARGS__);                                          \
         arm_compute::throw_error(arm_compute::Status(arm_compute::ErrorCode::RUNTIME_ERROR, std::string(out.data()))); \
-    } while(false)
+    } while (false)
 
 /** Print the given message then throw an std::runtime_error. Accepts a message format
  *  and a variable list of arguments matching the format description.
@@ -360,7 +362,8 @@ Status create_error_msg(ErrorCode error_code, const char *func, const char *file
  * @param[in] msg  Error description message format.
  * @param[in] ...  List of arguments matching the format description.
  */
-#define ARM_COMPUTE_ERROR_LOC_VAR(func, file, line, msg, ...) ARM_COMPUTE_THROW_ERROR_VAR(func, file, line, msg, __VA_ARGS__) // NOLINT
+#define ARM_COMPUTE_ERROR_LOC_VAR(func, file, line, msg, ...) \
+    ARM_COMPUTE_THROW_ERROR_VAR(func, file, line, msg, __VA_ARGS__) // NOLINT
 
 /** Print the given message then throw an std::runtime_error.
  *
@@ -379,11 +382,11 @@ Status create_error_msg(ErrorCode error_code, const char *func, const char *file
 #define ARM_COMPUTE_EXIT_ON_MSG(cond, msg) \
     do                                     \
     {                                      \
-        if(cond)                           \
+        if (cond)                          \
         {                                  \
             ARM_COMPUTE_ERROR(msg);        \
         }                                  \
-    } while(false)
+    } while (false)
 
 /** If the condition is true, the given message is printed and program exits. Accepts a message format
  *  and a variable list of arguments matching the format description.
@@ -395,27 +398,25 @@ Status create_error_msg(ErrorCode error_code, const char *func, const char *file
 #define ARM_COMPUTE_EXIT_ON_MSG_VAR(cond, msg, ...)  \
     do                                               \
     {                                                \
-        if(cond)                                     \
+        if (cond)                                    \
         {                                            \
             ARM_COMPUTE_ERROR_VAR(msg, __VA_ARGS__); \
         }                                            \
-    } while(false)
+    } while (false)
 
 #ifdef ARM_COMPUTE_ASSERTS_ENABLED
 /** Checks if a status value is valid if not throws an exception with the error
  *
  * @param[in] status Status value to check.
  */
-#define ARM_COMPUTE_ERROR_THROW_ON(status) \
-    status.throw_if_error()
+#define ARM_COMPUTE_ERROR_THROW_ON(status) status.throw_if_error()
 
 /** If the condition is true, the given message is printed and an exception is thrown
  *
  * @param[in] cond Condition to evaluate.
  * @param[in] msg  Message to display.
  */
-#define ARM_COMPUTE_ERROR_ON_MSG(cond, msg) \
-    ARM_COMPUTE_EXIT_ON_MSG(cond, msg)
+#define ARM_COMPUTE_ERROR_ON_MSG(cond, msg) ARM_COMPUTE_EXIT_ON_MSG(cond, msg)
 
 /** If the condition is true, the given message is printed and an exception is thrown. Accepts a message format
  *  and a variable list of arguments matching the format description.
@@ -424,8 +425,7 @@ Status create_error_msg(ErrorCode error_code, const char *func, const char *file
  * @param[in] msg  Error description message format.
  * @param[in] ...  List of arguments matching the format description.
  */
-#define ARM_COMPUTE_ERROR_ON_MSG_VAR(cond, msg, ...) \
-    ARM_COMPUTE_EXIT_ON_MSG_VAR(cond, msg, __VA_ARGS__)
+#define ARM_COMPUTE_ERROR_ON_MSG_VAR(cond, msg, ...) ARM_COMPUTE_EXIT_ON_MSG_VAR(cond, msg, __VA_ARGS__)
 
 /** If the condition is true, the given message is printed and an exception is thrown.
  *
@@ -438,11 +438,11 @@ Status create_error_msg(ErrorCode error_code, const char *func, const char *file
 #define ARM_COMPUTE_ERROR_ON_LOC_MSG(cond, func, file, line, ...)     \
     do                                                                \
     {                                                                 \
-        if(cond)                                                      \
+        if (cond)                                                     \
         {                                                             \
             ARM_COMPUTE_ERROR_LOC_VAR(func, file, line, __VA_ARGS__); \
         }                                                             \
-    } while(false)
+    } while (false)
 
 /** If the condition is true, the given message is printed and an exception is thrown, otherwise value is returned
  *
@@ -463,8 +463,7 @@ Status create_error_msg(ErrorCode error_code, const char *func, const char *file
  *
  * @param[in] cond Condition to evaluate.
  */
-#define ARM_COMPUTE_ERROR_ON(cond) \
-    ARM_COMPUTE_ERROR_ON_MSG(cond, #cond)
+#define ARM_COMPUTE_ERROR_ON(cond) ARM_COMPUTE_ERROR_ON_MSG(cond, #cond)
 
 /** If the condition is true then an error message is printed and an exception thrown
  *
diff --git a/arm_compute/core/GLES_COMPUTE/GCCoreRuntimeContext.h b/arm_compute/core/GLES_COMPUTE/GCCoreRuntimeContext.h
deleted file mode 100644
index 9706c9b3a6..0000000000
--- a/arm_compute/core/GLES_COMPUTE/GCCoreRuntimeContext.h
+++ /dev/null
@@ -1,63 +0,0 @@
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_GCCORERUNTIME_CONTEXT_H
-#define ARM_COMPUTE_GCCORERUNTIME_CONTEXT_H
-
-#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h"
-
-namespace arm_compute
-{
-// Forward declarations
-class GCKernelLibrary;
-
-/** Core runtime context for OpenGL ES */
-class GCCoreRuntimeContext final
-{
-public:
-    /** Legacy constructor */
-    GCCoreRuntimeContext();
-
-    /** Constructor */
-    GCCoreRuntimeContext(GCKernelLibrary *kernel_lib);
-    /** Destructor */
-    ~GCCoreRuntimeContext() = default;
-    /** Default copy constructor */
-    GCCoreRuntimeContext(const GCCoreRuntimeContext &) = default;
-    /** Default move constructor */
-    GCCoreRuntimeContext(GCCoreRuntimeContext &&) = default;
-    /** Default copy assignment */
-    GCCoreRuntimeContext &operator=(const GCCoreRuntimeContext &) = default;
-    /** Default move assignment operator */
-    GCCoreRuntimeContext &operator=(GCCoreRuntimeContext &&) = default;
-    /** Kernel Library accessor
-     *
-     * @return The kernel library instance used by the core context
-     */
-    GCKernelLibrary *kernel_library() const;
-
-private:
-    GCKernelLibrary *_kernel_lib{ nullptr };
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_GCCORERUNTIME_CONTEXT_H */
diff --git a/arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h b/arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h
deleted file mode 100644
index 0f6daf786b..0000000000
--- a/arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h
+++ /dev/null
@@ -1,271 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_GCKERNELLIBRARY_H
-#define ARM_COMPUTE_GCKERNELLIBRARY_H
-
-#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h"
-#include "arm_compute/core/Utils.h"
-
-#include <map>
-#include <set>
-#include <string>
-#include <utility>
-#include <vector>
-
-namespace arm_compute
-{
-/** GCProgram class */
-class GCProgram final
-{
-public:
-    /** Default constructor. */
-    GCProgram();
-    /** Construct program from source file.
-     *
-     * @param[in] name   Program name.
-     * @param[in] source Program source.
-     */
-    GCProgram(std::string name, std::string source);
-    /** Default Copy Constructor. */
-    GCProgram(const GCProgram &) = default;
-    /** Default Move Constructor. */
-    GCProgram(GCProgram &&) = default;
-    /** Default copy assignment operator */
-    GCProgram &operator=(const GCProgram &) = default;
-    /** Default move assignment operator */
-    GCProgram &operator=(GCProgram &&) = default;
-    /** Returns program name.
-     *
-     * @return Program's name.
-     */
-    std::string name() const
-    {
-        return _name;
-    }
-    /** Link program.
-     *
-     * @param[in] shader Shader used to link program.
-     *
-     * @return linked program id .
-     */
-    GLuint link_program(GLuint shader);
-    /** Compile shader.
-     *
-     * @param[in] build_options Shader build options.
-     *
-     * @return GLES shader object.
-     */
-    GLuint compile_shader(const std::string &build_options);
-
-private:
-    std::string _name;   /**< Program name. */
-    std::string _source; /**< Source code for the program. */
-};
-
-/** GCKernel class */
-class GCKernel final
-{
-public:
-    /** Default Constructor. */
-    GCKernel();
-    /** Default Copy Constructor. */
-    GCKernel(const GCKernel &) = default;
-    /** Default Move Constructor. */
-    GCKernel(GCKernel &&) = default;
-    /** Default copy assignment operator */
-    GCKernel &operator=(const GCKernel &) = default;
-    /** Default move assignment operator */
-    GCKernel &operator=(GCKernel &&) = default;
-    /** Constructor.
-     *
-     * @param[in] name    Kernel name.
-     * @param[in] program Built program.
-     */
-    GCKernel(std::string name, GLuint program);
-    /** Destructor.
-     */
-    ~GCKernel();
-    /** Returns kernel name.
-     *
-     * @return Kernel's name.
-     */
-    std::string name() const
-    {
-        return _name;
-    }
-    /** Get program id.
-     *
-     * @return program id.
-     */
-    GLuint get_program() const
-    {
-        return _program;
-    }
-    /** Use current program.
-     *
-     * @return program id.
-     */
-    void use();
-    /** Unuse current program.
-     *
-     * @return program id.
-     */
-    void unuse();
-    /** Set argument value at index of shader params.
-     *
-     * @param[in] idx   Index in shader params.
-     * @param[in] value Argument value to be set.
-     */
-    template <class T>
-    void set_argument(unsigned int idx, T value)
-    {
-        if(idx >= _shader_arguments.size())
-        {
-            _shader_arguments.resize(idx + 1, 0);
-        }
-
-        unsigned int *p        = reinterpret_cast<unsigned int *>(&value);
-        _shader_arguments[idx] = *p;
-    }
-    /** Clear shader arguments.
-     *
-     */
-    void clear_arguments()
-    {
-        _shader_arguments.clear();
-    }
-    /** Set shader params binding point.
-     *
-     * @param[in] binding Shader params binding point.
-     */
-    void set_shader_params_binding_point(unsigned int binding)
-    {
-        _shader_params_binding_point = binding;
-    }
-    /** Update shader params.
-     *
-     */
-    void update_shader_params();
-    /** Clean up program and ubo.
-     *
-     */
-    void cleanup();
-
-private:
-    std::string                  _name;                                 /**< Kernel name */
-    GLuint                       _program;                              /**< Linked program id */
-    std::vector<unsigned int>    _shader_arguments;                     /**< Store all the values of the shader arguments */
-    GLuint                       _shader_params_ubo_name;               /**< Uniform buffer object name for shader parameters */
-    GLuint                       _shader_params_binding_point;          /**< The binding point of the uniform block for shader parameters */
-    GLuint                       _shader_params_index;                  /**< The index of the uniform block */
-    GLint                        _shader_params_size;                   /**< The uniform block data size in the shader */
-    static constexpr const char *_shader_params_name = "shader_params"; /**< The uniform block name in the shader */
-};
-
-/** GCKernelLibrary class */
-class GCKernelLibrary final
-{
-    using StringSet = std::set<std::string>;
-
-public:
-    /** Default Constructor. */
-    GCKernelLibrary();
-    /** Default Destructor */
-    ~GCKernelLibrary();
-    /** Prevent instances of this class from being copied */
-    GCKernelLibrary(const GCKernelLibrary &) = delete;
-    /** Prevent instances of this class from being copied */
-    const GCKernelLibrary &operator=(const GCKernelLibrary &) = delete;
-    /** Get the static instance of @ref GCKernelLibrary.
-     * This method has been deprecated and will be removed in the next release.
-     * @return The static instance of GCKernelLibrary.
-     */
-    static GCKernelLibrary &get();
-    /** Initialises the kernel library.
-     *
-     * @param[in] shader_path (Optional) Path of the directory from which shader sources are loaded.
-     * @param[in] dpy         (Optional) EGLdisplay set by external application.
-     * @param[in] ctx         (Optional) EGLContext set by external application.
-     */
-    void init(std::string shader_path = "./", EGLDisplay dpy = EGL_NO_DISPLAY, EGLContext ctx = EGL_NO_CONTEXT);
-    /** Sets the path that the shaders reside in.
-     *
-     * @param[in] shader_path Path of the shader.
-     */
-    void set_shader_path(const std::string &shader_path);
-    /** Sets display and context to create kernel.
-     *
-     * @param[in] dpy EGLdisplay set by external application.
-     * @param[in] ctx EGLContext set by external application.
-     */
-    void set_context(EGLDisplay dpy, EGLContext ctx);
-    /** Creates a kernel from the kernel library.
-     *
-     * @param[in] shader_name       Shader name.
-     * @param[in] build_options_set Shader build options as a set.
-     *
-     * @return The created kernel.
-     */
-    GCKernel create_kernel(const std::string &shader_name, const StringSet &build_options_set = {}) const;
-    /** Serializes and saves programs to a binary. */
-    void save_binary();
-    /** Load serialized binary with all the programs. */
-    void load_binary();
-    /** Setup a dummy fbo to workaround an issue on Galaxy S8. */
-    void setup_dummy_fbo();
-
-private:
-    /** Preprocess GLES shader
-     *
-     * @param[in] shader_source Source code of the shader to preprocess.
-     *
-     * @return Preprocessed GLES shader object.
-     */
-    std::string preprocess_shader(const std::string &shader_source) const;
-    /** Load program and its dependencies.
-     *
-     * @param[in] program_name Name of the program to load.
-     */
-    const GCProgram &load_program(const std::string &program_name) const;
-    /** Concatenates contents of a set into a single string.
-     *
-     * @param[in] s Input set to concatenate.
-     *
-     * @return Concatenated string.
-     */
-    std::string stringify_set(const StringSet &s) const;
-
-    EGLDisplay  _display;                                                /**< Underlying EGL Display. */
-    EGLContext  _context;                                                /**< Underlying EGL Context. */
-    GLuint      _frame_buffer;                                           /**< Dummy fbo */
-    GLuint      _tex_rt;                                                 /**< Dummy texture for render target */
-    std::string _shader_path;                                            /**< Path to the shaders folder. */
-    mutable std::map<std::string, const GCProgram>  _programs_map;       /**< Map with all already loaded program data. */
-    mutable std::map<std::string, const GCKernel>   _built_programs_map; /**< Map with all already built program data. */
-    static const std::map<std::string, std::string> _shader_program_map; /**< Map that associates kernel names with programs. */
-    static const std::map<std::string, std::string> _program_source_map; /**< Contains sources for all programs.
-                                                                              Used for compile-time shader inclusion. */
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_GCKERNELLIBRARY_H */
diff --git a/arm_compute/core/GLES_COMPUTE/GCKernels.h b/arm_compute/core/GLES_COMPUTE/GCKernels.h
deleted file mode 100644
index a1537ec152..0000000000
--- a/arm_compute/core/GLES_COMPUTE/GCKernels.h
+++ /dev/null
@@ -1,54 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_GCKERNELS_H
-#define ARM_COMPUTE_GCKERNELS_H
-
-/* Header regrouping all the GLES compute kernels */
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCAbsoluteDifferenceKernel.h"
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCActivationLayerKernel.h"
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCArithmeticAdditionKernel.h"
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCBatchNormalizationLayerKernel.h"
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCCol2ImKernel.h"
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCDepthConcatenateLayerKernel.h"
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCDepthwiseConvolutionLayer3x3Kernel.h"
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.h"
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCDropoutLayerKernel.h"
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCFillBorderKernel.h"
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCGEMMInterleave4x4Kernel.h"
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCGEMMMatrixAccumulateBiasesKernel.h"
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCGEMMMatrixAdditionKernel.h"
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCGEMMMatrixMultiplyKernel.h"
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCGEMMTranspose1xWKernel.h"
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCIm2ColKernel.h"
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCNormalizationLayerKernel.h"
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCNormalizePlanarYUVLayerKernel.h"
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCPixelWiseMultiplicationKernel.h"
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCPoolingLayerKernel.h"
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCScaleKernel.h"
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCSoftmaxLayerKernel.h"
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCTensorShiftKernel.h"
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCTransposeKernel.h"
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCWeightsReshapeKernel.h"
-
-#endif /* ARM_COMPUTE_GCKERNELS_H */
diff --git a/arm_compute/core/GLES_COMPUTE/IGCKernel.h b/arm_compute/core/GLES_COMPUTE/IGCKernel.h
deleted file mode 100644
index 7b2aad7cec..0000000000
--- a/arm_compute/core/GLES_COMPUTE/IGCKernel.h
+++ /dev/null
@@ -1,165 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_IGCKERNEL_H
-#define ARM_COMPUTE_IGCKERNEL_H
-
-#include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h"
-#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h"
-#include "arm_compute/core/GPUTarget.h"
-
-#include "arm_compute/core/IKernel.h"
-
-namespace arm_compute
-{
-class IGCTensor;
-class Window;
-
-/** Common interface for all the GLES kernels */
-class IGCKernel : public IKernel
-{
-public:
-    /** Constructor */
-    IGCKernel();
-    /** Returns a reference to the GLES kernel of this object.
-     *
-     * @return A reference to the GLES kernel of this object.
-     */
-    GCKernel &kernel();
-
-    /** Add the passed 1D tensor's parameters to the object's kernel's arguments starting from the index idx.
-     *
-     * @param[in] idx           Index at which to start adding the tensor's arguments.Input and output tensor will have sperated index, multiple indices start from 1, single index have to be set to 0.
-     * @param[in] tensor        Tensor to set as an argument of the object's kernel.
-     * @param[in] binding_point Tensor's binding point in this kernel.
-     * @param[in] window        Window the kernel will be executed on.
-     */
-    void add_1D_tensor_argument(unsigned int &idx, const IGCTensor *tensor, const unsigned int binding_point, const Window &window);
-
-    /** Add the passed 2D tensor's parameters to the object's kernel's arguments starting from the index idx.
-     *
-     * @param[in] idx           Index at which to start adding the tensor's arguments.Input and output tensor will have sperated index, multiple indices start from 1, single index have to be set to 0.
-     * @param[in] tensor        Tensor to set as an argument of the object's kernel.
-     * @param[in] binding_point Tensor's binding point in this kernel.
-     * @param[in] window        Window the kernel will be executed on.
-     */
-    void add_2D_tensor_argument(unsigned int &idx, const IGCTensor *tensor, const unsigned int binding_point, const Window &window);
-
-    /** Add the passed 3D tensor's parameters to the object's kernel's arguments starting from the index idx.
-     *
-     * @param[in] idx           Index at which to start adding the tensor's arguments.Input and output tensor will have sperated index, multiple indices start from 1, single index have to be set to 0.
-     * @param[in] tensor        Tensor to set as an argument of the object's kernel.
-     * @param[in] binding_point Tensor's binding point in this kernel.
-     * @param[in] window        Window the kernel will be executed on.
-     */
-    void add_3D_tensor_argument(unsigned int &idx, const IGCTensor *tensor, const unsigned int binding_point, const Window &window);
-
-    /** Returns the number of arguments enqueued per 1D tensor object.
-     *
-     * @return The number of arguments enqueues per 1D tensor object.
-     */
-    unsigned int num_arguments_per_1D_tensor() const;
-    /** Returns the number of arguments enqueued per 2D tensor object.
-     *
-     * @return The number of arguments enqueues per 2D tensor object.
-     */
-    unsigned int num_arguments_per_2D_tensor() const;
-    /** Returns the number of arguments enqueued per 3D tensor object.
-     *
-     * @return The number of arguments enqueues per 3D tensor object.
-     */
-    unsigned int num_arguments_per_3D_tensor() const;
-    /** Enqueue the OpenGL ES shader to process the given window
-     *
-     * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
-     */
-    virtual void run(const Window &window) = 0;
-
-    /** Set the Local-Workgroup-Size hint
-     *
-     * @note This method should be called after the configuration of the kernel
-     *
-     * @param[in] lws_hint Local-Workgroup-Size to use
-     */
-    void set_lws_hint(gles::NDRange &lws_hint)
-    {
-        _lws_hint = lws_hint;
-    }
-
-    /** Set the targeted GPU architecture
-     *
-     * @param[in] target The targeted GPU architecture
-     */
-    void set_target(GPUTarget target)
-    {
-        _target = target;
-    }
-
-    /** Get the targeted GPU architecture
-     *
-     * @return The targeted GPU architecture.
-     */
-    GPUTarget get_target() const
-    {
-        return _target;
-    }
-
-private:
-    /** Add the passed tensor's parameters to the object's kernel's arguments starting from the index idx.
-     *
-     * @param[in] idx           Index at which to start adding the tensor's arguments.Input and output tensor will have sperated index, multiple indices start from 1, single index have to be set to 0.
-     * @param[in] tensor        Tensor to set as an argument of the object's kernel.
-     * @param[in] binding_point Tensor's binding point in this kernel.
-     * @param[in] window        Window the kernel will be executed on.
-     */
-    template <unsigned int dimension_size>
-    void add_tensor_argument(unsigned int &idx, const IGCTensor *tensor, const unsigned int binding_point, const Window &window);
-
-    /** Returns the number of arguments enqueued per tensor object.
-     *
-     * @return The number of arguments enqueued per tensor object.
-     */
-    template <unsigned int dimension_size>
-    unsigned int           num_arguments_per_tensor() const;
-
-protected:
-    GCKernel      _kernel;   /**< GLES kernel to run */
-    gles::NDRange _lws_hint; /**< Local workgroup size hint for the GLES kernel */
-    GPUTarget     _target;   /**< The targeted GPU */
-};
-
-/** Add the kernel to the command queue with the given window.
- *
- * @note Depending on the size of the window, this might translate into several jobs being enqueued.
- *
- * @note If kernel->kernel() is empty then the function will return without adding anything to the queue.
- *
- * @param[in] kernel Kernel to enqueue
- * @param[in] window Window the kernel has to process.
- * @param[in] lws    Local workgroup size requested, by default (1, 1, 1)
- *
- * @note If any dimension of the lws is greater than the global workgroup size then no lws will be passed.
- */
-void enqueue(IGCKernel &kernel, const Window &window, const gles::NDRange &lws = gles::NDRange(1U, 1U, 1U));
-}
-#endif /*ARM_COMPUTE_IGCKERNEL_H */
diff --git a/arm_compute/core/GLES_COMPUTE/IGCSimple2DKernel.h b/arm_compute/core/GLES_COMPUTE/IGCSimple2DKernel.h
deleted file mode 100644
index ae8fd40888..0000000000
--- a/arm_compute/core/GLES_COMPUTE/IGCSimple2DKernel.h
+++ /dev/null
@@ -1,41 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_IGCSIMPLE2DKERNEL_H
-#define ARM_COMPUTE_IGCSIMPLE2DKERNEL_H
-
-#include "arm_compute/core/GLES_COMPUTE/IGCSimpleKernel.h"
-
-namespace arm_compute
-{
-class IGCTensor;
-
-/** Interface for simple OpenGL ES kernels having 1 tensor input and 1 tensor output. This interface can be used when the work-item processes a 2D tile */
-class IGCSimple2DKernel : public IGCSimpleKernel
-{
-public:
-    // Inherited methods overridden:
-    void run(const Window &window) override;
-};
-}
-#endif /*ARM_COMPUTE_IGCSIMPLE2DKERNEL_H */
diff --git a/arm_compute/core/GLES_COMPUTE/IGCSimple3DKernel.h b/arm_compute/core/GLES_COMPUTE/IGCSimple3DKernel.h
deleted file mode 100644
index 40a21ee147..0000000000
--- a/arm_compute/core/GLES_COMPUTE/IGCSimple3DKernel.h
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_IGCSIMPLE3DKERNEL_H
-#define ARM_COMPUTE_IGCSIMPLE3DKERNEL_H
-
-#include "arm_compute/core/GLES_COMPUTE/IGCSimple2DKernel.h"
-
-namespace arm_compute
-{
-class IGCTensor;
-
-/** Interface for simple GLES kernels having 1 tensor input and 1 tensor output.
- *  Both input tensor and output tensor must have at least 3 dimensions.
- */
-class IGCSimple3DKernel : public IGCSimple2DKernel
-{
-public:
-    // Inherited methods overridden:
-    void run(const Window &window) override;
-};
-}
-#endif /*ARM_COMPUTE_IGCSIMPLE3DKERNEL_H */
diff --git a/arm_compute/core/GLES_COMPUTE/IGCSimpleKernel.h b/arm_compute/core/GLES_COMPUTE/IGCSimpleKernel.h
deleted file mode 100644
index c0f561ab5d..0000000000
--- a/arm_compute/core/GLES_COMPUTE/IGCSimpleKernel.h
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_IGCSIMPLEKERNEL_H
-#define ARM_COMPUTE_IGCSIMPLEKERNEL_H
-
-#include "arm_compute/core/GLES_COMPUTE/IGCKernel.h"
-#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
-#include "arm_compute/core/Helpers.h"
-
-namespace arm_compute
-{
-/** Interface for simple OpenGL ES kernels having 1 tensor input and 1 tensor output */
-class IGCSimpleKernel : public IGCKernel
-{
-public:
-    /** Constructor. */
-    IGCSimpleKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    IGCSimpleKernel(const IGCSimpleKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    IGCSimpleKernel &operator=(const IGCSimpleKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    IGCSimpleKernel(IGCSimpleKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    IGCSimpleKernel &operator=(IGCSimpleKernel &&) = default;
-    /** Default destructor */
-    ~IGCSimpleKernel() = default;
-
-    /** Configure the kernel
-     *
-     * @param[in]  input                             Source tensor.
-     * @param[out] output                            Destination tensor.
-     * @param[in]  num_elems_processed_per_iteration Number of processed elements per iteration.
-     * @param[in]  border_undefined                  (Optional) True if the border mode is undefined. False if it's replicate or constant.
-     * @param[in]  border_size                       (Optional) Size of the border.
-     */
-    void configure(const IGCTensor *input, IGCTensor *output, unsigned int num_elems_processed_per_iteration, bool border_undefined = false, const BorderSize &border_size = BorderSize());
-
-protected:
-    const IGCTensor *_input;
-    IGCTensor       *_output;
-};
-}
-
-#endif /*ARM_COMPUTE_IGCSIMPLEKERNEL_H */
diff --git a/arm_compute/core/GLES_COMPUTE/IGCTensor.h b/arm_compute/core/GLES_COMPUTE/IGCTensor.h
deleted file mode 100644
index c382095846..0000000000
--- a/arm_compute/core/GLES_COMPUTE/IGCTensor.h
+++ /dev/null
@@ -1,113 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_IGCTENSOR_H
-#define ARM_COMPUTE_IGCTENSOR_H
-
-#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h"
-#include "arm_compute/core/ITensor.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-/** Interface for GLES Compute tensor */
-class IGCTensor : public ITensor
-{
-public:
-    /** Default constructor. */
-    IGCTensor();
-
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    IGCTensor(const IGCTensor &) = delete;
-
-    /** Prevent instances of this class from being copy assigned (As this class contains pointers) */
-    IGCTensor &operator=(const IGCTensor &) = delete;
-
-    /** Allow instances of this class to be moved */
-    IGCTensor(IGCTensor &&) = default;
-
-    /** Allow instances of this class to be moved */
-    IGCTensor &operator=(IGCTensor &&) = default;
-
-    /** Virtual destructor */
-    virtual ~IGCTensor() = default;
-
-    /** Map on an allocated buffer.
-     *
-     * @param[in] blocking (Optional) If true, then the mapping will be ready to use by the time
-     *                     this method returns, else it is the caller's responsibility
-     *                     to flush the queue and wait for the mapping operation to have completed before using the returned mapping pointer.
-     */
-    void map(bool blocking = true);
-    /** Unmap an allocated and mapped buffer.
-     */
-    void unmap();
-    /** Clear the contents of the tensor synchronously.
-     */
-    void clear();
-
-    // Inherited methods overridden:
-    uint8_t *buffer() const override;
-    /** Interface to be implemented by the child class to return the tensor's gles compute buffer id.
-      *
-      * @return A SSBO buffer id.
-     */
-    virtual GLuint gc_buffer() const = 0;
-
-    /** Flag indicating whether the tensor has been left aligned by a kernel and therefore needs shifting.
-     *
-     * @return True if the tensor is left aligned.
-     */
-    bool needs_shifting() const;
-    /** Set the flag indicating whether or not a tensor needs shifting.
-     *
-     * @param[in] needs_shifting Indicates if the tensor is left aligned or not.
-     *
-     */
-    void set_needs_shifting(bool needs_shifting);
-
-protected:
-    /** Method to be implemented by the child class to map the SSBO.
-     *
-     * @param[in] blocking If true, then the mapping will be ready to use by the time
-     *                     this method returns, else it is the caller's responsibility
-     *                     to flush the queue and wait for the mapping operation to have completed before using the returned mapping pointer.
-     */
-    virtual uint8_t *do_map(bool blocking) = 0;
-    /** Method to be implemented by the child class to unmap the SSBO.
-     *
-     * @note This method simply enqueues the unmap operation, it is the caller's responsibility to flush the queue and make sure the unmap is finished before
-     *       the memory is accessed by the device.
-     */
-    virtual void do_unmap() = 0;
-
-private:
-    uint8_t *_mapping;
-    bool     _needs_shifting;
-};
-
-/** Interface for GLES Compute image */
-using IGCImage = IGCTensor;
-}
-#endif /*ARM_COMPUTE_IGCTENSOR_H */
diff --git a/arm_compute/core/GLES_COMPUTE/OpenGLES.h b/arm_compute/core/GLES_COMPUTE/OpenGLES.h
deleted file mode 100644
index 445443602d..0000000000
--- a/arm_compute/core/GLES_COMPUTE/OpenGLES.h
+++ /dev/null
@@ -1,165 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_OPENGLES_H
-#define ARM_COMPUTE_OPENGLES_H
-
-#include "arm_compute/core/Log.h"
-
-#include <EGL/egl.h>
-#include <EGL/eglext.h>
-#include <EGL/eglplatform.h>
-#include <GLES3/gl31.h>
-#include <GLES3/gl3ext.h>
-#include <cstddef>
-
-#ifdef ARM_COMPUTE_DEBUG_ENABLED
-#define ARM_COMPUTE_GL_CHECK(x)                                                                      \
-    x;                                                                                               \
-    {                                                                                                \
-        GLenum error = glGetError();                                                                 \
-        if(error != GL_NO_ERROR)                                                                     \
-            ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("glGetError() = %i (0x%.8x)\n", error, error); \
-    }
-#else /* ARM_COMPUTE_DEBUG_ENABLED */
-#define ARM_COMPUTE_GL_CHECK(x) x
-#endif /* ARM_COMPUTE_DEBUG_ENABLED */
-
-namespace arm_compute
-{
-namespace gles
-{
-/** Class interface for specifying NDRange values. */
-class NDRange
-{
-private:
-    size_t _sizes[3];
-    size_t _dimensions;
-
-public:
-    /** Default constructor - resulting range has zero dimensions. */
-    NDRange()
-        : _dimensions(0)
-    {
-        _sizes[0] = 0;
-        _sizes[1] = 0;
-        _sizes[2] = 0;
-    }
-
-    /** Constructs one-dimensional range.
-     *
-     * @param[in] size0 Size of the first dimension.
-     */
-    NDRange(size_t size0)
-        : _dimensions(1)
-    {
-        _sizes[0] = size0;
-        _sizes[1] = 1;
-        _sizes[2] = 1;
-    }
-
-    /** Constructs two-dimensional range.
-     *
-     * @param[in] size0 Size of the first dimension.
-     * @param[in] size1 Size of the second dimension.
-     */
-    NDRange(size_t size0, size_t size1)
-        : _dimensions(2)
-    {
-        _sizes[0] = size0;
-        _sizes[1] = size1;
-        _sizes[2] = 1;
-    }
-
-    /** Constructs three-dimensional range.
-     *
-     * @param[in] size0 Size of the first dimension.
-     * @param[in] size1 Size of the second dimension.
-     * @param[in] size2 Size of the third dimension.
-     */
-    NDRange(size_t size0, size_t size1, size_t size2)
-        : _dimensions(3)
-    {
-        _sizes[0] = size0;
-        _sizes[1] = size1;
-        _sizes[2] = size2;
-    }
-
-    /** Conversion operator to const size_t *.
-     *
-     *  @returns A pointer to the size of the first dimension.
-     */
-    operator const size_t *() const
-    {
-        return _sizes;
-    }
-
-    /** Queries the number of dimensions in the range.
-     *
-     * @returns The number of dimensions.
-    */
-    size_t dimensions() const
-    {
-        return _dimensions;
-    }
-
-    /** Returns the size of the object in bytes based on the runtime number of dimensions
-     *
-     * @returns The size of the object in bytes.
-     */
-    size_t size() const
-    {
-        return _dimensions * sizeof(size_t);
-    }
-
-    /** Returns the sizes array for each dimensions.
-     *
-     * @returns The sizes array
-     */
-    size_t *get()
-    {
-        return _sizes;
-    }
-
-    /** Returns the sizes array for each dimensions.
-     *
-     * @returns The sizes array
-     */
-    const size_t *get() const
-    {
-        return _sizes;
-    }
-};
-
-static const NDRange NullRange;
-static const NDRange Range_128_1 = NDRange(128, 1);
-} // namespace gles
-
-/** Check if the OpenGL ES 3.1 API is available at runtime.
- *
- *  @returns true if the OpenGL ES 3.1 API is available.
- */
-bool opengles31_is_available();
-} // namespace arm_compute
-
-#endif /* ARM_COMPUTE_OPENGLES_H */
diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCAbsoluteDifferenceKernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCAbsoluteDifferenceKernel.h
deleted file mode 100644
index d55f98fa66..0000000000
--- a/arm_compute/core/GLES_COMPUTE/kernels/GCAbsoluteDifferenceKernel.h
+++ /dev/null
@@ -1,71 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_GCABSOLUTEDIFFERENCEKERNEL_H
-#define ARM_COMPUTE_GCABSOLUTEDIFFERENCEKERNEL_H
-
-#include "arm_compute/core/GLES_COMPUTE/IGCKernel.h"
-
-namespace arm_compute
-{
-class IGCTensor;
-
-/** Interface for the absolute difference kernel.
- *
- * Absolute difference is computed by:
- * @f[ output(x,y) = | input1(x,y) - input2(x,y) | @f]
- */
-class GCAbsoluteDifferenceKernel : public IGCKernel
-{
-public:
-    /** Default constructor. */
-    GCAbsoluteDifferenceKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    GCAbsoluteDifferenceKernel(const GCAbsoluteDifferenceKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    GCAbsoluteDifferenceKernel &operator=(const GCAbsoluteDifferenceKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    GCAbsoluteDifferenceKernel(GCAbsoluteDifferenceKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    GCAbsoluteDifferenceKernel &operator=(GCAbsoluteDifferenceKernel &&) = default;
-    /** Default destructor */
-    ~GCAbsoluteDifferenceKernel() = default;
-
-    /** Set the inputs and output images.
-     *
-     * @param[in]  input1 Source tensor. Data types supported: U8
-     * @param[in]  input2 Source tensor. Data types supported: U8
-     * @param[out] output Destination tensor. Data types supported: U8
-     */
-    void configure(const IGCTensor *input1, const IGCTensor *input2, IGCTensor *output);
-
-    // Inherited methods overridden:
-    void run(const Window &window) override;
-
-private:
-    const IGCTensor *_input1; /**< Source tensor 1. */
-    const IGCTensor *_input2; /**< Source tensor 2. */
-    IGCTensor       *_output; /**< Destination tensor. */
-};
-}
-#endif /* ARM_COMPUTE_GCABSOLUTEDIFFERENCEKERNEL_H */
diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCActivationLayerKernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCActivationLayerKernel.h
deleted file mode 100644
index 65e018a50a..0000000000
--- a/arm_compute/core/GLES_COMPUTE/kernels/GCActivationLayerKernel.h
+++ /dev/null
@@ -1,74 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_GCACTIVATIONLAYERKERNEL_H
-#define ARM_COMPUTE_GCACTIVATIONLAYERKERNEL_H
-
-#include "arm_compute/core/GLES_COMPUTE/IGCKernel.h"
-
-namespace arm_compute
-{
-// Forward declarations
-class IGCTensor;
-class GCCoreRuntimeContext;
-
-/** Interface for the activation layer kernel. */
-class GCActivationLayerKernel : public IGCKernel
-{
-public:
-    /** Default constructor
-     *
-     * @param[in, out] ctx Core context to use
-     */
-    explicit GCActivationLayerKernel(GCCoreRuntimeContext *ctx = nullptr);
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    GCActivationLayerKernel(const GCActivationLayerKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    GCActivationLayerKernel &operator=(const GCActivationLayerKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    GCActivationLayerKernel(GCActivationLayerKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    GCActivationLayerKernel &operator=(GCActivationLayerKernel &&) = default;
-    /** Default destructor */
-    ~GCActivationLayerKernel() = default;
-    /** Set the input and output tensor.
-     *
-     * @note If the output tensor is a nullptr, the activation function will be performed in-place
-     *
-     * @param[in, out] input    Source tensor. In case of @p output tensor = nullptr, this tensor will store the result
-     *                          of the activation function. Data types supported: F16/F32.
-     * @param[out]     output   Destination tensor. Data type should match the input data type.
-     * @param[in]      act_info Activation layer information.
-     */
-    void configure(IGCTensor *input, IGCTensor *output, ActivationLayerInfo act_info);
-
-    // Inherited methods overridden:
-    void run(const Window &window) override;
-
-private:
-    IGCTensor            *_input;
-    IGCTensor            *_output;
-    GCCoreRuntimeContext *_ctx;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_GCACTIVATIONLAYERKERNEL_H */
diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCArithmeticAdditionKernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCArithmeticAdditionKernel.h
deleted file mode 100644
index 7e8159c638..0000000000
--- a/arm_compute/core/GLES_COMPUTE/kernels/GCArithmeticAdditionKernel.h
+++ /dev/null
@@ -1,82 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_GCARITHMETICADDITIONKERNEL_H
-#define ARM_COMPUTE_GCARITHMETICADDITIONKERNEL_H
-
-#include "arm_compute/core/GLES_COMPUTE/IGCKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-class IGCTensor;
-
-/** Interface for the arithmetic addition kernel
- *
- * Arithmetic addition is computed by:
- * @f[ output(x,y) = input1(x,y) + input2(x,y) @f]
- */
-class GCArithmeticAdditionKernel : public IGCKernel
-{
-public:
-    /** Default constructor */
-    GCArithmeticAdditionKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    GCArithmeticAdditionKernel(const GCArithmeticAdditionKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    GCArithmeticAdditionKernel &operator=(const GCArithmeticAdditionKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    GCArithmeticAdditionKernel(GCArithmeticAdditionKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    GCArithmeticAdditionKernel &operator=(GCArithmeticAdditionKernel &&) = default;
-    /** Default destructor */
-    ~GCArithmeticAdditionKernel() = default;
-    /** Initialise the kernel's inputs, output and convertion policy.
-     *
-     * @param[in]  input1 First tensor input. Data types supported: F16.
-     * @param[in]  input2 Second tensor input. Data types supported: F16.
-     * @param[out] output Output tensor. Data types supported: F16.
-     * @param[in]  policy Policy to use to handle overflow.
-     */
-    void configure(const IGCTensor *input1, const IGCTensor *input2, IGCTensor *output, ConvertPolicy policy);
-    /** Static function to check if given info will lead to a valid configuration of @ref GCArithmeticAdditionKernel
-     *
-     * @param[in]  input1 First tensor input info. Data types supported: F16.
-     * @param[in]  input2 Second tensor input info. Data types supported: F16.
-     * @param[out] output Output tensor info. Data types supported: F16.
-     * @param[in]  policy Policy to use to handle overflow.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy);
-
-    // Inherited methods overridden:
-    void run(const Window &window) override;
-
-private:
-    const IGCTensor *_input1; /**< Source tensor 1 */
-    const IGCTensor *_input2; /**< Source tensor 2 */
-    IGCTensor       *_output; /**< Destination tensor */
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_GCARITHMETICADDITIONKERNEL_H */
diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCBatchNormalizationLayerKernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCBatchNormalizationLayerKernel.h
deleted file mode 100644
index eb7a99c59e..0000000000
--- a/arm_compute/core/GLES_COMPUTE/kernels/GCBatchNormalizationLayerKernel.h
+++ /dev/null
@@ -1,98 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_GCBATCHNORMALIZATIONLAYERKERNEL_H
-#define ARM_COMPUTE_GCBATCHNORMALIZATIONLAYERKERNEL_H
-
-#include "arm_compute/core/GLES_COMPUTE/IGCKernel.h"
-
-namespace arm_compute
-{
-class IGCTensor;
-
-/** Interface for the BatchNormalization layer kernel.
- */
-class GCBatchNormalizationLayerKernel : public IGCKernel
-{
-public:
-    /** Constructor */
-    GCBatchNormalizationLayerKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    GCBatchNormalizationLayerKernel(const GCBatchNormalizationLayerKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    GCBatchNormalizationLayerKernel &operator=(const GCBatchNormalizationLayerKernel &) = delete;
-    /** Default Move Constructor. */
-    GCBatchNormalizationLayerKernel(GCBatchNormalizationLayerKernel &&) = default;
-    /** Default move assignment operator */
-    GCBatchNormalizationLayerKernel &operator=(GCBatchNormalizationLayerKernel &&) = default;
-    /** Default destructor */
-    ~GCBatchNormalizationLayerKernel() = default;
-
-    /** Set the input and output tensors.
-     *
-     * @param[in]  input    Source tensor. 3 lower dimensions represent a single input with dimensions [width, height, FM].
-     *                      The rest are optional and used for representing batches. Data types supported: F16/F32.
-     * @param[out] output   Destination tensor. Output will have the same number of dimensions as input. Data type supported: same as @p input
-     * @param[in]  mean     Mean values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
-     * @param[in]  var      Variance values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
-     * @param[in]  beta     (Optional) Beta values tensor info. 1 dimension with size equal to the feature maps [FM]. If not provided, default value for beta is 0. Data types supported: Same as @p input
-     * @param[in]  gamma    (Optional) Gamma values tensor info. 1 dimension with size equal to the feature maps [FM]. If not provided, default value for gamma is 1. Data types supported: Same as @p input
-     * @param[in]  epsilon  (optional) Small value to avoid division with zero.
-     * @param[in]  act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported.
-     */
-    void configure(const IGCTensor *input, IGCTensor *output, const IGCTensor *mean, const IGCTensor *var, const IGCTensor *beta = nullptr, const IGCTensor *gamma = nullptr, float epsilon = 0.001f,
-                   ActivationLayerInfo act_info = ActivationLayerInfo());
-    /** Static function to check if given info will lead to a valid configuration of @ref GCBatchNormalizationLayerKernel
-     *
-     * @param[in] input    Source tensor info. In case of @p output tensor info = nullptr, this tensor will store the result.
-     *                     3 lower dimensions represent a single input with dimensions [width, height, FM].
-     *                     The rest are optional and used for representing batches. Data types supported: F16/F32.
-     * @param[in] output   Destination tensor info. Output will have the same number of dimensions as input. Data type supported: same as @p input
-     * @param[in] mean     Mean values tensor info. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
-     * @param[in] var      Variance values tensor info. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
-     * @param[in] beta     (Optional) Beta values tensor info. 1 dimension with size equal to the feature maps [FM]. If not provided, default value for beta is 0. Data types supported: Same as @p input
-     * @param[in] gamma    (Optional) Gamma values tensor info. 1 dimension with size equal to the feature maps [FM]. If not provided, default value for gamma is 1. Data types supported: Same as @p input
-     * @param[in] epsilon  (Optional) Small value to avoid division with zero. Default value is 0.001f.
-     * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output,
-                           const ITensorInfo *mean, const ITensorInfo *var,
-                           const ITensorInfo *beta = nullptr, const ITensorInfo *gamma = nullptr,
-                           float epsilon = 0.001f, ActivationLayerInfo act_info = ActivationLayerInfo());
-
-    // Inherited methods overridden:
-    void run(const Window &window) override;
-
-private:
-    const IGCTensor *_input;
-    IGCTensor       *_output;
-    const IGCTensor *_mean;
-    const IGCTensor *_var;
-    const IGCTensor *_beta;
-    const IGCTensor *_gamma;
-    float            _epsilon;
-};
-}
-#endif /*ARM_COMPUTE_GCBATCHNORMALIZATIONLAYERKERNEL_H */
diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCCol2ImKernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCCol2ImKernel.h
deleted file mode 100644
index d96fb56771..0000000000
--- a/arm_compute/core/GLES_COMPUTE/kernels/GCCol2ImKernel.h
+++ /dev/null
@@ -1,92 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef ARM_COMPUTE_GCCOL2IMKERNEL_H
-#define ARM_COMPUTE_GCCOL2IMKERNEL_H
-
-#include "arm_compute/core/GLES_COMPUTE/IGCKernel.h"
-
-namespace arm_compute
-{
-class IGCTensor;
-
-/** Interface for the col2im reshaping kernel.
- *
- * Rearranges each matrix column into image blocks. It's the inverse operation of @ref GCIm2ColKernel.
- *
- * For example, a vector of 9 elements can be reshaped to a block(image) of 3x3:
- *
- * @f[
- * \left( \begin{array}{ccccccccc}
- * a0 & a1 & a2 & a3 & a4 & a5 & a6 & a7 & a8 \\
- * \end{array} \right)
- * \rightarrow
- * \left( \begin{array}{ccc}
- * a0 & a1 & a2 \\
- * a3 & a4 & a5 \\
- * a6 & a7 & a8 \\
- * \end{array} \right)
- * @f]
- */
-class GCCol2ImKernel : public IGCKernel
-{
-public:
-    /** Default constructor */
-    GCCol2ImKernel();
-
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    GCCol2ImKernel(const GCCol2ImKernel &) = delete;
-
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    GCCol2ImKernel &operator=(const GCCol2ImKernel &) = delete;
-
-    /** Allow instances of this class to be moved */
-    GCCol2ImKernel(GCCol2ImKernel &&) = default;
-
-    /** Allow instances of this class to be moved */
-    GCCol2ImKernel &operator=(GCCol2ImKernel &&) = default;
-
-    /** Default destructor */
-    ~GCCol2ImKernel() = default;
-
-    /** Set the input and output of the kernel.
-     *
-     * @param[in]  input          The input tensor to convert. Data types supported: F16/F32
-     * @param[out] output         The output tensor. 3 lower dimensions represent a single output [width, height, OFM],
-     *                            while the rest represent batch of outputs. Data types supported: Same as @p input
-     * @param[in]  convolved_dims Output convolved dimensions.
-     */
-    void configure(const IGCTensor *input, IGCTensor *output, std::pair<unsigned int, unsigned int> convolved_dims);
-
-    // Inherited methods overridden:
-    void run(const Window &window) override;
-
-private:
-    const IGCTensor *_input;
-    IGCTensor       *_output;
-    std::pair<unsigned int, unsigned int> _convolved_dims;
-};
-}
-
-#endif /*ARM_COMPUTE_GCCOL2IMKERNEL_H */
diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCDepthConcatenateLayerKernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCDepthConcatenateLayerKernel.h
deleted file mode 100644
index 9c7754947a..0000000000
--- a/arm_compute/core/GLES_COMPUTE/kernels/GCDepthConcatenateLayerKernel.h
+++ /dev/null
@@ -1,74 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef ARM_COMPUTE_GCDEPTHCONCATENATEKERNEL_H
-#define ARM_COMPUTE_GCDEPTHCONCATENATEKERNEL_H
-
-#include "arm_compute/core/GLES_COMPUTE/IGCKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-class IGCTensor;
-
-/** Interface for the depth concatenate kernel.
- *  The input tensor will be concatenated into the output tensor.
- */
-class GCDepthConcatenateLayerKernel : public IGCKernel
-{
-public:
-    /** Default constructor */
-    GCDepthConcatenateLayerKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    GCDepthConcatenateLayerKernel(const GCDepthConcatenateLayerKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    GCDepthConcatenateLayerKernel &operator=(const GCDepthConcatenateLayerKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    GCDepthConcatenateLayerKernel(GCDepthConcatenateLayerKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    GCDepthConcatenateLayerKernel &operator=(GCDepthConcatenateLayerKernel &&) = default;
-    /** Default destructor */
-    ~GCDepthConcatenateLayerKernel() = default;
-    /** Initialise the kernel's inputs and output
-     *
-     * @param[in]     input        Input tensor. Data types supported: F16/F32.
-     * @param[in]     depth_offset The offset on the Z axis.
-     * @param[in,out] output       Output tensor. Data types supported: Same as @p input.
-     *
-     * @note: The output tensor's low two dimensions can't be smaller than the input one's.
-     * @note: The gaps between the two lowest dimensions of input and output need to be divisible by 2.
-     *
-     */
-    void configure(const IGCTensor *input, unsigned int depth_offset, IGCTensor *output);
-
-    // Inherited methods overridden:
-    void run(const Window &window) override;
-
-private:
-    const IGCTensor *_input;
-    IGCTensor       *_output;
-    int              _depth_offset;
-};
-}
-#endif /* ARM_COMPUTE_GCDEPTHCONCATENATEKERNEL_H */
diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCDepthwiseConvolutionLayer3x3Kernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCDepthwiseConvolutionLayer3x3Kernel.h
deleted file mode 100644
index 8faa54a205..0000000000
--- a/arm_compute/core/GLES_COMPUTE/kernels/GCDepthwiseConvolutionLayer3x3Kernel.h
+++ /dev/null
@@ -1,77 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_GCDEPTHWISECONVOLUTIONKERNEL3x3_H
-#define ARM_COMPUTE_GCDEPTHWISECONVOLUTIONKERNEL3x3_H
-
-#include "arm_compute/core/GLES_COMPUTE/IGCKernel.h"
-
-namespace arm_compute
-{
-class IGCTensor;
-
-/** Interface for the kernel to run a 3x3 depthwise convolution on a tensor.
- */
-class GCDepthwiseConvolutionLayer3x3Kernel : public IGCKernel
-{
-public:
-    /** Default constructor */
-    GCDepthwiseConvolutionLayer3x3Kernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    GCDepthwiseConvolutionLayer3x3Kernel(const GCDepthwiseConvolutionLayer3x3Kernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    GCDepthwiseConvolutionLayer3x3Kernel &operator=(const GCDepthwiseConvolutionLayer3x3Kernel &) = delete;
-    /** Default Move Constructor. */
-    GCDepthwiseConvolutionLayer3x3Kernel(GCDepthwiseConvolutionLayer3x3Kernel &&) = default;
-    /** Default move assignment operator */
-    GCDepthwiseConvolutionLayer3x3Kernel &operator=(GCDepthwiseConvolutionLayer3x3Kernel &&) = default;
-    /** Initialize the function's source, destination, conv and border_size.
-     *
-     * @param[in]  input            Source tensor. DataType supported: F16.
-     * @param[in]  weights          Weights tensor. A 3D tensor with dimensions [3, 3, IFM]. Data type supported: Same as @p input.
-     * @param[in]  biases           (Optional) Biases tensor. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed.
-     *                              Data type supported: Same as @p input.
-     * @param[out] output           Destination tensor. Data type supported: Same as @p input.
-     * @param[in]  conv_info        Padding and stride information to use for the convolution.
-     * @param[in]  depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
-     */
-    void configure(const IGCTensor *input, const IGCTensor *weights, const IGCTensor *biases, IGCTensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1);
-
-    // Inherited methods overridden:
-    void run(const Window &window) override;
-    BorderSize border_size() const override;
-
-private:
-    BorderSize       _border_size;
-    const IGCTensor *_input;
-    IGCTensor       *_output;
-    const IGCTensor *_weights;
-    const IGCTensor *_biases;
-    unsigned int     _conv_stride_x;
-    unsigned int     _conv_stride_y;
-    unsigned int     _conv_pad_left;
-    unsigned int     _conv_pad_top;
-    gles::NDRange    _lws;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_GCDEPTHWISECONVOLUTIONKERNEL3x3_H */
diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.h
deleted file mode 100644
index 43f94f8662..0000000000
--- a/arm_compute/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.h
+++ /dev/null
@@ -1,92 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_GCDIRECTCONVOLUTIONLAYERKERNEL_H
-#define ARM_COMPUTE_GCDIRECTCONVOLUTIONLAYERKERNEL_H
-
-#include "arm_compute/core/GLES_COMPUTE/IGCKernel.h"
-#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h"
-
-namespace arm_compute
-{
-class IGCTensor;
-
-/** Interface for the direct convolution kernel.
- */
-template <unsigned int kernel_size>
-class GCDirectConvolutionLayerKernel : public IGCKernel
-{
-public:
-    /** Default constructor */
-    GCDirectConvolutionLayerKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    GCDirectConvolutionLayerKernel(const GCDirectConvolutionLayerKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    GCDirectConvolutionLayerKernel &operator=(const GCDirectConvolutionLayerKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    GCDirectConvolutionLayerKernel(GCDirectConvolutionLayerKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    GCDirectConvolutionLayerKernel &operator=(GCDirectConvolutionLayerKernel &&) = default;
-    /** Default destructor */
-    ~GCDirectConvolutionLayerKernel() = default;
-    /** Set the input and output of the kernel.
-     *
-     * @param[in]  input     The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM],
-     *                       while every optional dimension from 4 and above represent a batch of inputs. Data types supported: F16/F32
-     * @param[in]  weights   Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported:Same as @p input.
-     * @param[in]  bias      Biases tensor. Shared bias supported. Biases are 1D tensor with dimensions [OFM]. Data type supported:Same as @p input.
-     * @param[out] output    The output tensor. First 2 lower dimensions represent a transform of each 3D input,
-     *                       while every dimension above represents a batch. Data types supported: Same as @p input
-     * @param[in]  conv_info Contains padding and stride information described in @ref PadStrideInfo.
-     * @param[in]  act_info  (Optional) Activation layer information in case of a fused activation.
-     */
-    void configure(const IGCTensor *input, const IGCTensor *weights, const IGCTensor *bias, IGCTensor *output,
-                   const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info = ActivationLayerInfo());
-
-    // Inherited methods overridden:
-    BorderSize border_size() const override;
-
-    // Inherited methods overridden:
-    void run(const Window &window) override;
-
-private:
-    const IGCTensor *_input;
-    const IGCTensor *_bias;
-    const IGCTensor *_weights;
-    IGCTensor       *_output;
-    BorderSize       _border_size;
-    int              _conv_stride_x;
-    int              _conv_stride_y;
-    int              _conv_pad_x;
-    int              _conv_pad_y;
-    gles::NDRange    _lws;
-};
-
-/** Interface for the 1x1 direct convolution kernel */
-using GCDirectConvolutionLayer1x1Kernel = GCDirectConvolutionLayerKernel<1>;
-/** Interface for the 3x3 direct convolution kernel */
-using GCDirectConvolutionLayer3x3Kernel = GCDirectConvolutionLayerKernel<3>;
-/** Interface for the 5x5 direct convolution kernel */
-using GCDirectConvolutionLayer5x5Kernel = GCDirectConvolutionLayerKernel<5>;
-}
-#endif /*ARM_COMPUTE_GCDIRECTCONVOLUTIONLAYERKERNEL_H */
diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCDropoutLayerKernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCDropoutLayerKernel.h
deleted file mode 100644
index e3dda67a8a..0000000000
--- a/arm_compute/core/GLES_COMPUTE/kernels/GCDropoutLayerKernel.h
+++ /dev/null
@@ -1,79 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef ARM_COMPUTE_GCDROPOUTLAYERKERNEL_H
-#define ARM_COMPUTE_GCDROPOUTLAYERKERNEL_H
-
-#include "arm_compute/core/GLES_COMPUTE/IGCKernel.h"
-
-namespace arm_compute
-{
-class IGCTensor;
-
-/** Interface for the dropout layer kernel.
- *
- * Dropout is used to improve over-fit on neural networks.
- *
- */
-class GCDropoutLayerKernel : public IGCKernel
-{
-public:
-    /** Default constructor */
-    GCDropoutLayerKernel();
-
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    GCDropoutLayerKernel(const GCDropoutLayerKernel &) = delete;
-
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    GCDropoutLayerKernel &operator=(const GCDropoutLayerKernel &) = delete;
-
-    /** Allow instances of this class to be moved */
-    GCDropoutLayerKernel(GCDropoutLayerKernel &&) = default;
-
-    /** Allow instances of this class to be moved */
-    GCDropoutLayerKernel &operator=(GCDropoutLayerKernel &&) = default;
-
-    /** Set the input and output of the kernel.
-     *
-     * @param[in]  input   The input tensor for this op. Data types supported: F16/F32
-     * @param[out] mask    The mask tensor. Data types supported: Same as @p input
-     * @param[out] output  The output tensor. Data types supported: Same as @p input
-     * @param[in]  ratio   Dropout ratio
-     * @param[in]  forward Forward or backward propagation
-     *
-     */
-    void configure(const IGCTensor *input, IGCTensor *mask, IGCTensor *output, float ratio, bool forward);
-
-    // Inherited methods overridden:
-    void run(const Window &window) override;
-
-private:
-    const IGCTensor *_input;
-    IGCTensor       *_mask;
-    IGCTensor       *_output;
-    unsigned int     _num_elems_processed_per_iteration;
-};
-}
-
-#endif /*ARM_COMPUTE_GCDROPOUTLAYERKERNEL_H */
diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCFillBorderKernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCFillBorderKernel.h
deleted file mode 100644
index 4dd7aa0ec1..0000000000
--- a/arm_compute/core/GLES_COMPUTE/kernels/GCFillBorderKernel.h
+++ /dev/null
@@ -1,77 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_GCFILLBORDERKERNEL_H
-#define ARM_COMPUTE_GCFILLBORDERKERNEL_H
-
-#include "arm_compute/core/GLES_COMPUTE/IGCKernel.h"
-#include "arm_compute/core/PixelValue.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-class IGCTensor;
-
-/** Interface for filling the border of a kernel */
-class GCFillBorderKernel : public IGCKernel
-{
-public:
-    /** Default constructor */
-    GCFillBorderKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    GCFillBorderKernel(const GCFillBorderKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    GCFillBorderKernel &operator=(const GCFillBorderKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    GCFillBorderKernel(GCFillBorderKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    GCFillBorderKernel &operator=(GCFillBorderKernel &&) = default;
-    /** Default destructor */
-    ~GCFillBorderKernel() = default;
-
-    /** Initialise the kernel's input, output and border mode.
-     *
-     * @param[in,out] tensor                Tensor to process Data types supported: F16/F32.
-     * @param[in]     border_size           Size of the border to fill in elements.
-     * @param[in]     border_mode           Border mode to use for the convolution.
-     * @param[in]     constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
-     */
-    void configure(const IGCTensor *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value = PixelValue());
-
-    /** Function to set the constant value on fill border kernel depending on type.
-     *
-     * @param[in] idx                   Index of the kernel argument to set.
-     * @param[in] constant_border_value Constant value to use for borders if border_mode is set to CONSTANT.
-     */
-    template <class T>
-    void set_constant_border(unsigned int idx, const PixelValue &constant_border_value);
-
-    // Inherited methods overridden:
-    void run(const Window &window) override;
-    bool is_parallelisable() const override;
-
-private:
-    const IGCTensor *_tensor;
-};
-}
-#endif /*ARM_COMPUTE_GCFILLBORDERKERNEL_H */
diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCGEMMInterleave4x4Kernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCGEMMInterleave4x4Kernel.h
deleted file mode 100644
index cbc60da443..0000000000
--- a/arm_compute/core/GLES_COMPUTE/kernels/GCGEMMInterleave4x4Kernel.h
+++ /dev/null
@@ -1,80 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_GCGEMMINTERLEAVE4X4KERNEL_H
-#define ARM_COMPUTE_GCGEMMINTERLEAVE4X4KERNEL_H
-
-#include "arm_compute/core/GLES_COMPUTE/IGCKernel.h"
-
-namespace arm_compute
-{
-class IGCTensor;
-
-/** OpenGL ES kernel which interleaves the elements of a matrix A in chunk of 4x4
- *
- * This function puts the values in a 4x4 block of Matrix A on the same row (Interleaved values)
- *
- * @f[
- * \left( \begin{array}{cccc}
- * a00 & a01 & a02 & a03 \\
- * a10 & a11 & a12 & a13 \\
- * a20 & a21 & a22 & a23 \\
- * a30 & a31 & a32 & a33 \\
- * \end{array} \right)
- * \rightarrow
- * \left( \begin{array}{ccccccccccccccccc}
- * a00 & a10 & a20 & a30 & a01 & a11 & a21 & a31 & a02 & a12 & a22 & a32 & a03 & a13 & a23 & a33 \\
- * \end{array} \right)
- * @f]
- *
- * After this operation, the output matrix will have the following shape: [ height * 4, ceil(width / 4.0f) ]
- */
-class GCGEMMInterleave4x4Kernel : public IGCKernel
-{
-public:
-    /** Default constructor */
-    GCGEMMInterleave4x4Kernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    GCGEMMInterleave4x4Kernel(const GCGEMMInterleave4x4Kernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    GCGEMMInterleave4x4Kernel &operator=(const GCGEMMInterleave4x4Kernel &) = delete;
-    /** Allow instances of this class to be moved */
-    GCGEMMInterleave4x4Kernel(GCGEMMInterleave4x4Kernel &&) = default;
-    /** Allow instances of this class to be moved */
-    GCGEMMInterleave4x4Kernel &operator=(GCGEMMInterleave4x4Kernel &&) = default;
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  input  Input tensor. Data types supported: F16, F32
-     * @param[out] output Output tensor. Data type supported: same as @p input
-     */
-    void configure(const IGCTensor *input, IGCTensor *output);
-
-    // Inherited methods overridden
-    void run(const Window &window) override;
-
-private:
-    const IGCTensor *_input;
-    IGCTensor       *_output;
-};
-}
-#endif /* ARM_COMPUTE_GCGEMMINTERLEAVE4X4KERNEL_H */
diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCGEMMMatrixAccumulateBiasesKernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCGEMMMatrixAccumulateBiasesKernel.h
deleted file mode 100644
index 95f991ee73..0000000000
--- a/arm_compute/core/GLES_COMPUTE/kernels/GCGEMMMatrixAccumulateBiasesKernel.h
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_GCGEMMMATRIXACCUMULATEBIASESKERNEL_H
-#define ARM_COMPUTE_GCGEMMMATRIXACCUMULATEBIASESKERNEL_H
-
-#include "arm_compute/core/GLES_COMPUTE/IGCKernel.h"
-
-namespace arm_compute
-{
-/** Interface to add a bias to each row of the input tensor
- *
- */
-class GCGEMMMatrixAccumulateBiasesKernel : public IGCKernel
-{
-public:
-    /** Default constructor */
-    GCGEMMMatrixAccumulateBiasesKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    GCGEMMMatrixAccumulateBiasesKernel(const GCGEMMMatrixAccumulateBiasesKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    GCGEMMMatrixAccumulateBiasesKernel &operator=(const GCGEMMMatrixAccumulateBiasesKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    GCGEMMMatrixAccumulateBiasesKernel(GCGEMMMatrixAccumulateBiasesKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    GCGEMMMatrixAccumulateBiasesKernel &operator=(GCGEMMMatrixAccumulateBiasesKernel &&) = default;
-    /** Set the accumulate buffer and the biases of the kernel.
-     *
-     * @param[in, out] accum  The accumulate tensor to convert. Data types supported: F16/F32
-     * @param[in]      biases The shared biases tensor to append. It must be 1D tensor. Data types supported: Same as @p input
-     */
-    void configure(IGCTensor *accum, const IGCTensor *biases);
-
-    // Inherited methods overridden:
-    void run(const Window &window) override;
-
-private:
-    IGCTensor       *_accum;
-    const IGCTensor *_biases;
-    gles::NDRange    _lws;
-};
-}
-
-#endif /*ARM_COMPUTE_GCGEMMMATRIXACCUMULATEBIASESKERNEL_H */
diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCGEMMMatrixAdditionKernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCGEMMMatrixAdditionKernel.h
deleted file mode 100644
index e4157a1327..0000000000
--- a/arm_compute/core/GLES_COMPUTE/kernels/GCGEMMMatrixAdditionKernel.h
+++ /dev/null
@@ -1,70 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_GCGEMMMATRIXADDITIONKERNEL_H
-#define ARM_COMPUTE_GCGEMMMATRIXADDITIONKERNEL_H
-
-#include "arm_compute/core/GLES_COMPUTE/IGCKernel.h"
-
-namespace arm_compute
-{
-class IGCTensor;
-
-/** OpenGL ES kernel to perform the in-place matrix addition between 2 matrices, taking into account that the second matrix might be weighted by a scalar value beta.
- *  The matrices must have the same dimensions
- *
- * @note This kernel is computed if and only if beta != 0.0.
- */
-class GCGEMMMatrixAdditionKernel : public IGCKernel
-{
-public:
-    /** Default constructor */
-    GCGEMMMatrixAdditionKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    GCGEMMMatrixAdditionKernel(const GCGEMMMatrixAdditionKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    GCGEMMMatrixAdditionKernel &operator=(const GCGEMMMatrixAdditionKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    GCGEMMMatrixAdditionKernel(GCGEMMMatrixAdditionKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    GCGEMMMatrixAdditionKernel &operator=(GCGEMMMatrixAdditionKernel &&) = default;
-    /** Initialise the kernel's input, output and beta value
-     *
-     * @note The input and output tensors must have the same dimensions
-     *
-     * @param[in]      input  Input tensor (Matrix C). Data types supported: F32
-     * @param[in, out] output Output tensor. If this kernel is used to finalize the GEMM result (alpha * AB + beta * C), output must contain the result obtained by @ref GCGEMMMatrixMultiplyKernel. Data type supported: same as @p input
-     * @param[in]      beta   Weight of matrix C
-     */
-    void configure(const IGCTensor *input, IGCTensor *output, float beta);
-
-    // Inherited methods overridden:
-    void run(const Window &window) override;
-
-private:
-    const IGCTensor *_input;
-    IGCTensor       *_output;
-};
-}
-
-#endif /* ARM_COMPUTE_GCGEMMMATRIXADDITIONKERNEL_H */
diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCGEMMMatrixMultiplyKernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCGEMMMatrixMultiplyKernel.h
deleted file mode 100644
index 4dcae2e536..0000000000
--- a/arm_compute/core/GLES_COMPUTE/kernels/GCGEMMMatrixMultiplyKernel.h
+++ /dev/null
@@ -1,92 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_GCGEMMMATRIXMULTIPLYKERNEL_H
-#define ARM_COMPUTE_GCGEMMMATRIXMULTIPLYKERNEL_H
-
-#include "arm_compute/core/GLES_COMPUTE/IGCKernel.h"
-#include "arm_compute/core/GPUTarget.h"
-
-namespace arm_compute
-{
-class IGCTensor;
-
-/** GLES Compute kernel to multiply two input matrices "A" and "B" or to multiply a vector "A" by a matrix "B". All elements of the output matrix/vector will be multiplied by alpha
- *
- * @attention The second input tensor must have at least 2 dimensions (matrix)
- *
- */
-class GCGEMMMatrixMultiplyKernel : public IGCKernel
-{
-public:
-    /** Default constructor */
-    GCGEMMMatrixMultiplyKernel();
-
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    GCGEMMMatrixMultiplyKernel(const GCGEMMMatrixMultiplyKernel &) = delete;
-
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    GCGEMMMatrixMultiplyKernel &operator=(const GCGEMMMatrixMultiplyKernel &) = delete;
-
-    /** Allow instances of this class to be moved */
-    GCGEMMMatrixMultiplyKernel(GCGEMMMatrixMultiplyKernel &&) = default;
-
-    /** Allow instances of this class to be moved */
-    GCGEMMMatrixMultiplyKernel &operator=(GCGEMMMatrixMultiplyKernel &&) = default;
-
-    /** Initialise the kernel's input, output and alpha
-     *
-     * @param[in]  input0                    Input tensor containing the interleaved Matrix A or the vector A. Data types supported: F16/F32
-     * @param[in]  input1                    Input tensor containing the transposed Matrix B if the first input tensor A is not a vector.
-     *                                       If the output tensor is a vector, input1 must contain the matrix B not reshaped. Data type supported: same as @p input0
-     * @param[out] output                    Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0
-     * @param[in]  alpha                     Weight of the matrix product
-     * @param[in]  is_interleaved_transposed (Optional) True if input0 and input1 have been reshaped respectively using @ref GCGEMMInterleave4x4Kernel and @ref GCGEMMTranspose1xWKernel
-     * @param[in]  reshape_info              (Optional) GEMM reshape info. If is_interleaved_transposed = true, this object must contain the information to understand how the matrix A and matrix B have been reshaped
-     */
-    void configure(const IGCTensor *input0, const IGCTensor *input1, IGCTensor *output, float alpha, bool is_interleaved_transposed = true, const GEMMReshapeInfo &reshape_info = GEMMReshapeInfo());
-    /** Static function to check if given info will lead to a valid configuration of @ref GCGEMMMatrixMultiplyKernel
-     *
-     * @param[in] input0                    Input tensor containing the Matrix A. Data types supported: F16/F32
-     * @param[in] input1                    Input tensor containing the Matrix B. Data type supported: same as @p input0
-     * @param[in] output                    Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0
-     * @param[in] alpha                     Weight of the matrix product
-     * @param[in] is_interleaved_transposed True if input0 and input1 have been reshaped respectively using @ref GCGEMMInterleave4x4Kernel and @ref GCGEMMTranspose1xWKernel
-     * @param[in] reshape_info              GEMM reshape info. If is_interleaved_transposed = true, this object must contain the information to understand how the matrix A and matrix B have been reshaped
-     * @param[in] gpu_target                GPU Target
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, float alpha, bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info,
-                           GPUTarget gpu_target);
-
-    // Inherited methods overridden:
-    void run(const Window &window) override;
-
-private:
-    const IGCTensor *_input0;
-    const IGCTensor *_input1;
-    IGCTensor       *_output;
-};
-}
-#endif /* ARM_COMPUTE_GCGEMMMATRIXMULTIPLYKERNEL_H */
diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCGEMMTranspose1xWKernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCGEMMTranspose1xWKernel.h
deleted file mode 100644
index 29a4c8d209..0000000000
--- a/arm_compute/core/GLES_COMPUTE/kernels/GCGEMMTranspose1xWKernel.h
+++ /dev/null
@@ -1,67 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_GCGEMMTRANSPOSE1XWKERNEL_H
-#define ARM_COMPUTE_GCGEMMTRANSPOSE1XWKERNEL_H
-
-#include "arm_compute/core/GLES_COMPUTE/IGCSimple2DKernel.h"
-
-namespace arm_compute
-{
-class IGCTensor;
-
-/** OpenGLES kernel which transposes the elements of a matrix in chunks of 1xW, where W is equal to (16 / element size of the tensor)
- *
- * Following an example of how the transposition1xW works when the input data type is F32
- *
- * @f[
- * \left( \begin{array}{cccc}
- * a00 & a01 & a02 & a03 \\
- * a10 & a11 & a12 & a13 \\
- * a20 & a21 & a22 & a23 \\
- * a30 & a31 & a32 & a33 \\
- * \end{array} \right)
- * \rightarrow
- * \left( \begin{array}{ccccccccccccccccc}
- * a00 & a01 & a02 & a03 & a10 & a11 & a12 & a13 & a20 & a21 & a22 & a23 & a30 & a31 & a32 & a33 \\
- * \end{array} \right)
- * @f]
- *
- * @note The output matrix will have the following shape: [ height * W, ceil(width / W) ], where W = (16 / element size of the tensor)
- *
- */
-class GCGEMMTranspose1xWKernel : public IGCSimple2DKernel
-{
-public:
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  input  Input tensor. Data types supported: F16, F32
-     * @param[out] output Output tensor. Data type supported: same as @p input
-     */
-    void configure(const IGCTensor *input, IGCTensor *output);
-
-    // Inherited methods overridden:
-    void run(const Window &window) override;
-};
-}
-#endif /* ARM_COMPUTE_GCGEMMTRANSPOSE1XWKERNEL_H */
diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCIm2ColKernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCIm2ColKernel.h
deleted file mode 100644
index 7d1a53c4c3..0000000000
--- a/arm_compute/core/GLES_COMPUTE/kernels/GCIm2ColKernel.h
+++ /dev/null
@@ -1,128 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef ARM_COMPUTE_GCIM2COLKERNEL_H
-#define ARM_COMPUTE_GCIM2COLKERNEL_H
-
-#include "arm_compute/core/GLES_COMPUTE/IGCKernel.h"
-
-namespace arm_compute
-{
-class IGCTensor;
-class Size2D;
-
-/** Interface for the im2col reshape kernel.
- *
- * Rearranges image blocks into columns. It is used to strip out each convolution block to a single column.
- * It is used to transform a convolution to a plain matrix multiplication.
- *
- * For example taking into account the image below and assuming 3x3 image blocks with stride of 1 we have:
- * @f[
- * \left( \begin{array}{cccc}
- * a00 & a01 & a02 & a03 \\
- * a10 & a11 & a12 & a13 \\
- * a20 & a21 & a22 & a23 \\
- * a30 & a31 & a32 & a33 \\
- * \end{array} \right)
- * =
- * \left( \begin{array}{ccccccccc}
- * a00 & a01 & a02 & a10 & a11 & a12 & a20 & a21 & a22 \\
- * a01 & a02 & a03 & a11 & a12 & a13 & a21 & a22 & a23 \\
- * a10 & a11 & a12 & a20 & a21 & a22 & a30 & a31 & a32 \\
- * a11 & a12 & a13 & a21 & a22 & a23 & a31 & a32 & a33 \\
- * \end{array} \right)
- * @f]
- */
-class GCIm2ColKernel : public IGCKernel
-{
-public:
-    /** Default constructor */
-    GCIm2ColKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    GCIm2ColKernel(const GCIm2ColKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    GCIm2ColKernel &operator=(const GCIm2ColKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    GCIm2ColKernel(GCIm2ColKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    GCIm2ColKernel &operator=(GCIm2ColKernel &&) = default;
-    /** Set the input and output of the kernel.
-     *
-     * @param[in]  input       The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM],
-     *                         while every optional dimension from 4 and above represent a batch of inputs. Data types supported: F16/F32
-     * @param[out] output      The output tensor. First 2 lower dimensions represent a transform of each 3D input,
-     *                         while every dimension above represents a batch. Data types supported: Same as @p input
-     * @param[in]  kernel_dims The kernel dimensions (width and height).
-     * @param[in]  conv_info   Contains padding and stride information described in @ref PadStrideInfo.
-     * @param[in]  has_bias    In case biases are provided expands the matrix with 1.
-     * @param[in]  dilation    (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
-     */
-    void configure(const IGCTensor *input, IGCTensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation = Size2D(1U, 1U));
-
-    // Inherited methods overridden:
-    void run(const Window &window) override;
-
-    /** Static function to check if given info will lead to a valid configuration of @ref CLIm2ColKernel
-     *
-     * @param[in] input       The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM],
-     *                        while every optional dimension from 4 and above represent a batch of inputs. Data types supported: F16/F32
-     * @param[in] output      The output tensor. First 2 lower dimensions represent a transform of each 3D input,
-     *                        while every dimension above represents a batch. Data types supported: Same as @p input
-     * @param[in] kernel_dims The kernel dimensions (width and height).
-     * @param[in] conv_info   Contains padding and stride information described in @ref PadStrideInfo.
-     * @param[in] has_bias    In case biases are provided expands the matrix with 1.
-     * @param[in] dilation    (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation = Size2D(1U, 1U));
-
-private:
-    /** Run the reshape kernel optimised for the special case (stride is 1, padding is 0 and kernel's low 3 dimensions are same as input)
-     *
-     * @param[in]     window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
-     * @param[in,out] queue  Command queue on which to enqueue the kernel.
-     */
-    void run_reduced(const Window &window);
-    /** run the generic convolution layer input reshape kernel
-     *
-     * @param[in]     window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
-     * @param[in,out] queue  Command queue on which to enqueue the kernel.
-     */
-    void run_generic(const Window &window);
-
-    /** Common signature for the kernel to run */
-    using Im2ColFunction = void (GCIm2ColKernel::*)(const Window &);
-
-private:
-    const IGCTensor *_input;
-    IGCTensor       *_output;
-    std::pair<unsigned int, unsigned int> _convolved_dims;
-    std::pair<unsigned int, unsigned int> _kernel_dims;
-    unsigned int   _num_elems_processed_per_iteration;
-    Im2ColFunction _run_func;
-};
-}
-
-#endif /*ARM_COMPUTE_GCIM2COLKERNEL_H */
diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCNormalizationLayerKernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCNormalizationLayerKernel.h
deleted file mode 100644
index dd00caecfb..0000000000
--- a/arm_compute/core/GLES_COMPUTE/kernels/GCNormalizationLayerKernel.h
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_GCNORMALIZATIONLAYERKERNEL_H
-#define ARM_COMPUTE_GCNORMALIZATIONLAYERKERNEL_H
-
-#include "arm_compute/core/GLES_COMPUTE/IGCKernel.h"
-
-namespace arm_compute
-{
-class IGCTensor;
-
-/** Interface for the normalization layer kernel.
- */
-class GCNormalizationLayerKernel : public IGCKernel
-{
-public:
-    /** Constructor */
-    GCNormalizationLayerKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    GCNormalizationLayerKernel(const GCNormalizationLayerKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    GCNormalizationLayerKernel &operator=(const GCNormalizationLayerKernel &) = delete;
-    /** Default Move Constructor. */
-    GCNormalizationLayerKernel(GCNormalizationLayerKernel &&) = default;
-    /** Default move assignment operator */
-    GCNormalizationLayerKernel &operator=(GCNormalizationLayerKernel &&) = default;
-    /** Default destrutor */
-    ~GCNormalizationLayerKernel() = default;
-    /** Set the input and output tensors.
-     *
-     * @param[in]  input         Source tensor. 3 lower dims represent a single input with dimensions [width, height, IFM],
-     *                           and an optional 4th dimension for batch of inputs. Data types supported: F32.
-     * @param[in]  squared_input Source with each element has been squared. 3 lower dims represent a single input with dimensions [width, height, IFM],
-     *                           Data types should match the input type.
-     * @param[out] output        Destination tensor. Output will have the same number of dimensions as input. Data types should match the input type.
-     * @param[in]  norm_info     Normalization layer information like the normalization type, normalization size and other parameters.
-     */
-    void configure(const IGCTensor *input, const IGCTensor *squared_input, IGCTensor *output, NormalizationLayerInfo norm_info);
-
-    // Inherited methods overridden:
-    void run(const Window &window) override;
-    BorderSize border_size() const override;
-
-private:
-    const IGCTensor *_input;
-    const IGCTensor *_squared_input;
-    IGCTensor       *_output;
-    BorderSize       _border_size;
-};
-}
-#endif /*ARM_COMPUTE_GCNORMALIZATIONLAYERKERNEL_H */
diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCNormalizePlanarYUVLayerKernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCNormalizePlanarYUVLayerKernel.h
deleted file mode 100644
index 5156da8b2c..0000000000
--- a/arm_compute/core/GLES_COMPUTE/kernels/GCNormalizePlanarYUVLayerKernel.h
+++ /dev/null
@@ -1,84 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTEH
-#define ARM_COMPUTE_GCNORMALIZEPLANARYUVLAYERKERNEL_H
-
-#include "arm_compute/core/GLES_COMPUTE/IGCKernel.h"
-
-namespace arm_compute
-{
-class IGCTensor;
-
-/** Interface for the NormalizePlanarYUV layer kernel.
- */
-class GCNormalizePlanarYUVLayerKernel : public IGCKernel
-{
-public:
-    /** Constructor */
-    GCNormalizePlanarYUVLayerKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    GCNormalizePlanarYUVLayerKernel(const GCNormalizePlanarYUVLayerKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    GCNormalizePlanarYUVLayerKernel &operator=(const GCNormalizePlanarYUVLayerKernel &) = delete;
-    /** Default Move Constructor. */
-    GCNormalizePlanarYUVLayerKernel(GCNormalizePlanarYUVLayerKernel &&) = default;
-    /** Default move assignment operator */
-    GCNormalizePlanarYUVLayerKernel &operator=(GCNormalizePlanarYUVLayerKernel &&) = default;
-    /** Default destructor */
-    ~GCNormalizePlanarYUVLayerKernel() = default;
-
-    /** Set the input and output tensors.
-     *
-     * @param[in]  input  Source tensor. 3 lower dimensions represent a single input with dimensions [width, height, channels].
-     *                    Data types supported: F16.
-     * @param[out] output Destination tensor. Data type supported: same as @p input
-     * @param[in]  mean   Mean values tensor. 1 dimension with size equal to the number of input channels. Data types supported: same as @p input
-     * @param[in]  std    Standard deviation values tensor. 1 dimension with size equal to the feature maps [FM].
-     *                    Data types supported: same as @p input
-     */
-    void configure(const IGCTensor *input, IGCTensor *output, const IGCTensor *mean, const IGCTensor *std);
-    /** Static function to check if given info will lead to a valid configuration of @ref GCNormalizePlanarYUVLayerKernel
-     *
-     * @param[in]  input  Source tensor info. 3 lower dimensions represent a single input with dimensions [width, height, channels].
-     *                    Data types supported: F16.
-     * @param[out] output Destination tensor info. Data type supported: same as @p input
-     * @param[in]  mean   Mean values tensor info. 1 dimension with size equal to the number of input channels. Data types supported: same as @p input
-     * @param[in]  std    Standard deviation values tensor info. 1 dimension with size equal to the number of input channels.
-     *                    Data types supported: same as @p input
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *mean, const ITensorInfo *std);
-
-    // Inherited methods overridden:
-    void run(const Window &window) override;
-
-private:
-    const IGCTensor *_input;
-    IGCTensor       *_output;
-    const IGCTensor *_mean;
-    const IGCTensor *_std;
-};
-}
-#endif /*ARM_COMPUTE_GCNORMALIZEPLANARYUVLAYERKERNEL_H */
diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCPixelWiseMultiplicationKernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCPixelWiseMultiplicationKernel.h
deleted file mode 100644
index 0c4b656175..0000000000
--- a/arm_compute/core/GLES_COMPUTE/kernels/GCPixelWiseMultiplicationKernel.h
+++ /dev/null
@@ -1,70 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_GCPIXELWISEMULTIPLICATIONKERNEL_H
-#define ARM_COMPUTE_GCPIXELWISEMULTIPLICATIONKERNEL_H
-
-#include "arm_compute/core/GLES_COMPUTE/IGCKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-class IGCTensor;
-
-/** Interface for the pixelwise multiplication kernel.
- *
- */
-class GCPixelWiseMultiplicationKernel : public IGCKernel
-{
-public:
-    /** Default constructor.*/
-    GCPixelWiseMultiplicationKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    GCPixelWiseMultiplicationKernel(const GCPixelWiseMultiplicationKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    GCPixelWiseMultiplicationKernel &operator=(const GCPixelWiseMultiplicationKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    GCPixelWiseMultiplicationKernel(GCPixelWiseMultiplicationKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    GCPixelWiseMultiplicationKernel &operator=(GCPixelWiseMultiplicationKernel &&) = default;
-    /** Initialise the kernel's input, output and border mode.
-     *
-     * @param[in]  input1 An input tensor. Data types supported: F32.
-     * @param[in]  input2 An input tensor. Data types supported: same as @p input1.
-     * @param[out] output The output tensor, Data types supported: same as @p input1.
-     * @param[in]  scale  Scale to apply after multiplication.
-     *                             Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15.
-     */
-    void configure(const IGCTensor *input1, const IGCTensor *input2, IGCTensor *output, float scale);
-
-    // Inherited methods overridden:
-    void run(const Window &window) override;
-
-private:
-    const IGCTensor *_input1;
-    const IGCTensor *_input2;
-    IGCTensor       *_output;
-};
-}
-
-#endif /*ARM_COMPUTE_GCPIXELWISEMULTIPLICATIONKERNEL_H */
diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCPoolingLayerKernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCPoolingLayerKernel.h
deleted file mode 100644
index 7a2fb84f34..0000000000
--- a/arm_compute/core/GLES_COMPUTE/kernels/GCPoolingLayerKernel.h
+++ /dev/null
@@ -1,85 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_GCPOOLINGLAYERKERNEL_H
-#define ARM_COMPUTE_GCPOOLINGLAYERKERNEL_H
-
-#include "arm_compute/core/GLES_COMPUTE/IGCKernel.h"
-
-#include "arm_compute/core/Error.h"
-
-namespace arm_compute
-{
-class IGCTensor;
-
-/** Interface for the pooling layer kernel */
-class GCPoolingLayerKernel : public IGCKernel
-{
-public:
-    /** Default constructor */
-    GCPoolingLayerKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    GCPoolingLayerKernel(const GCPoolingLayerKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    GCPoolingLayerKernel &operator=(const GCPoolingLayerKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    GCPoolingLayerKernel(GCPoolingLayerKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    GCPoolingLayerKernel &operator=(GCPoolingLayerKernel &&) = default;
-    /** Default destructor */
-    ~GCPoolingLayerKernel() = default;
-
-    /** Set the input and output tensors.
-     *
-     * @param[in]  input     Source tensor. Data types supported: F16/F32.
-     * @param[out] output    Destination tensor. Data types supported: Same as @p input.
-     * @param[in]  pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
-     * @param[out] indices   (optional) The indices of the maximal values. Data type supported: U32.
-     */
-    void configure(const IGCTensor *input, IGCTensor *output, const PoolingLayerInfo &pool_info, IGCTensor *indices = nullptr);
-
-    /** Static function to check if given info will lead to a valid configuration of @ref GCPoolingLayerKernel
-     *
-     * @param[in] input     Source tensor info. Data types supported: F16/F32.
-     * @param[in] output    Destination tensor info. Data types supported: Same as @p input.
-     * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
-     * @param[in] indices   (optional) The indices of the maximal values. Data type supported: U32.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, const ITensorInfo *indices = nullptr);
-
-    // Inherited methods overridden:
-    void run(const Window &window) override;
-    BorderSize border_size() const override;
-
-private:
-    const IGCTensor *_input;
-    IGCTensor       *_output;
-    IGCTensor       *_indices;
-    PoolingLayerInfo _pool_info;
-    BorderSize       _border_size;
-    unsigned int     _num_elems_processed_per_iteration;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_GCPOOLINGLAYERKERNEL_H */
diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCScaleKernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCScaleKernel.h
deleted file mode 100644
index 754f15cbd8..0000000000
--- a/arm_compute/core/GLES_COMPUTE/kernels/GCScaleKernel.h
+++ /dev/null
@@ -1,54 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_GCSCALEKERNEL_H
-#define ARM_COMPUTE_GCSCALEKERNEL_H
-
-#include "arm_compute/core/GLES_COMPUTE/IGCSimple3DKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-class IGCTensor;
-
-/** Interface for the scale kernel */
-class GCScaleKernel : public IGCSimple3DKernel
-{
-public:
-    /** Initialise the kernel's inputs, output and interpolation policy
-     *
-     * @param[in]  input            Source tensor. Data types supported: F16
-     * @param[out] output           Destination tensor. Data types supported: Same as @p input
-     *                              All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
-     * @param[in]  policy           Interpolation type to use
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     * @param[in]  sampling_policy  (Optional) Sampling policy used by the interpolation. Defaults to @ref SamplingPolicy::CENTER
-     */
-    void configure(const IGCTensor *input, IGCTensor *output, InterpolationPolicy policy, bool border_undefined, SamplingPolicy sampling_policy = SamplingPolicy::CENTER);
-
-    // Inherited methods overridden:
-    void run(const Window &window) override;
-    BorderSize border_size() const override;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_GCSCALEKERNEL_H */
diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCSoftmaxLayerKernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCSoftmaxLayerKernel.h
deleted file mode 100644
index 280efe11f8..0000000000
--- a/arm_compute/core/GLES_COMPUTE/kernels/GCSoftmaxLayerKernel.h
+++ /dev/null
@@ -1,109 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_GCSOFTMAXLAYERKERNEL_H
-#define ARM_COMPUTE_GCSOFTMAXLAYERKERNEL_H
-
-#include "arm_compute/core/GLES_COMPUTE/IGCSimple3DKernel.h"
-
-namespace arm_compute
-{
-class IGCTensor;
-
-/** Interface for the identifying the max value of 1D Logits */
-class GCLogits1DMaxKernel : public IGCSimple3DKernel
-{
-public:
-    /** Set the input and output tensors.
-     *
-     * @param[in]  input  Source tensor. Data types supported: F16/F32
-     * @param[out] output Destination tensor. Data types supported: same as @p input
-     */
-    void configure(const IGCTensor *input, IGCTensor *output);
-};
-
-/** Interface for shifting the logits values around the max value and exponentiating the result */
-class GCLogits1DShiftExpSumKernel : public IGCKernel
-{
-public:
-    /** Default constructor */
-    GCLogits1DShiftExpSumKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    GCLogits1DShiftExpSumKernel(const GCLogits1DShiftExpSumKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    GCLogits1DShiftExpSumKernel &operator=(const GCLogits1DShiftExpSumKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    GCLogits1DShiftExpSumKernel(GCLogits1DShiftExpSumKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    GCLogits1DShiftExpSumKernel &operator=(GCLogits1DShiftExpSumKernel &&) = default;
-    /** Set the input and output tensors.
-     *
-     * @param[in]  input  Source tensor. Data types supported: F16/F32
-     * @param[in]  max    Max values tensor. Data types supported: same as @p input
-     * @param[out] output Destination tensor. Data types supported: same as @p input
-     * @param[out] sum    Sum of 1D logits tensor. Data types supported: same as @p input
-     */
-    void configure(const IGCTensor *input, const IGCTensor *max, IGCTensor *output, IGCTensor *sum);
-
-    // Inherited methods overridden:
-    void run(const Window &window) override;
-
-private:
-    const IGCTensor *_input;
-    const IGCTensor *_max;
-    IGCTensor       *_output;
-    IGCTensor       *_sum;
-};
-
-/** Interface for calculating the final step of the Softmax Layer where each logit value is multiplied by the inverse of the sum of the logits. */
-class GCLogits1DNormKernel : public IGCKernel
-{
-public:
-    /** Default constructor */
-    GCLogits1DNormKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    GCLogits1DNormKernel(const GCLogits1DNormKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    GCLogits1DNormKernel &operator=(const GCLogits1DNormKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    GCLogits1DNormKernel(GCLogits1DNormKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    GCLogits1DNormKernel &operator=(GCLogits1DNormKernel &&) = default;
-    /** Set the input and output tensors.
-     *
-     * @param[in]  input  Source tensor. Data types supported: F16/F32
-     * @param[in]  sum    Sum tensor. Dimensions should be dim(input)-1. Data types supported: same as @p input
-     * @param[out] output Destination tensor. Data types supported: same as @p input
-     */
-    void configure(const IGCTensor *input, const IGCTensor *sum, IGCTensor *output);
-
-    // Inherited methods overridden:
-    void run(const Window &window) override;
-
-private:
-    const IGCTensor *_input;
-    const IGCTensor *_sum;
-    IGCTensor       *_output;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_GCSOFTMAXLAYERKERNEL_H */
diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCTensorShiftKernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCTensorShiftKernel.h
deleted file mode 100644
index 5243e54daf..0000000000
--- a/arm_compute/core/GLES_COMPUTE/kernels/GCTensorShiftKernel.h
+++ /dev/null
@@ -1,84 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_GCTENSORSHIFTKERNEL_H
-#define ARM_COMPUTE_GCTENSORSHIFTKERNEL_H
-
-#include "arm_compute/core/GLES_COMPUTE/IGCKernel.h"
-#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h"
-
-namespace arm_compute
-{
-class IGCTensor;
-/** Interface for the kernel to shift valid data on a tensor.
- *
- * For example shifting 3x3 valid data with padding of 1 to right:
- * @f[
- * \left( \begin{array}{ccccc}
- * 0   & 0   & 0   & 0 & 0 \\
- * a00 & a01 & a02 & 0 & 0 \\
- * a10 & a11 & a12 & 0 & 0 \\
- * a20 & a21 & a22 & 0 & 0 \\
- * 0   & 0   & 0   & 0 & 0 \\
- * \end{array} \right)
- * =
- * \left( \begin{array}{ccccc}
- * 0  & 0   & 0   & 0   & 0 \\
- * 0  & a00 & a01 & a02 & 0 \\
- * 0  & a10 & a11 & a12 & 0 \\
- * 0  & a20 & a21 & a22 & 0 \\
- * 0  & 0   & 0   & 0   & 0 \\
- * \end{array} \right)
- * @f]
- */
-class GCTensorShiftKernel : public IGCKernel
-{
-public:
-    /** Default constructor */
-    GCTensorShiftKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    GCTensorShiftKernel(const GCTensorShiftKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    GCTensorShiftKernel &operator=(const GCTensorShiftKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    GCTensorShiftKernel(GCTensorShiftKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    GCTensorShiftKernel &operator=(GCTensorShiftKernel &&) = default;
-    /** Default destructor */
-    ~GCTensorShiftKernel() = default;
-    /** Set the input of the kernel.
-     *
-     * @param[in,out] input Source tensor. Data types supported: F16/F32
-     */
-    void configure(IGCTensor *input);
-
-    // Inherited methods overridden:
-    void run(const Window &window) override;
-
-private:
-    IGCTensor    *_input;
-    gles::NDRange _lws;
-    int           _left_padding;
-};
-}
-#endif /*ARM_COMPUTE_GCTENSORSHIFTKERNEL_H */
diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCTransposeKernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCTransposeKernel.h
deleted file mode 100644
index a981ae6d1f..0000000000
--- a/arm_compute/core/GLES_COMPUTE/kernels/GCTransposeKernel.h
+++ /dev/null
@@ -1,52 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_GCTRANSPOSEKERNEL_H
-#define ARM_COMPUTE_GCTRANSPOSEKERNEL_H
-
-#include "arm_compute/core/GLES_COMPUTE/IGCSimple2DKernel.h"
-
-namespace arm_compute
-{
-class IGCTensor;
-
-/** OpenGL ES kernel which transposes the elements of a matrix.
- *
- * [width, height, batch] -> [height, width, batch]
- *
- */
-class GCTransposeKernel : public IGCSimple2DKernel
-{
-public:
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  input  Input tensor. Data types supported: F16/F32
-     * @param[out] output Output tensor. Data type supported: Same as @p input
-     */
-    void configure(const IGCTensor *input, IGCTensor *output);
-
-    // Inherited methods overridden:
-    void run(const Window &window) override;
-};
-}
-#endif /* ARM_COMPUTE_GCTRANSPOSEKERNEL_H */
diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCWeightsReshapeKernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCWeightsReshapeKernel.h
deleted file mode 100644
index 134346b8da..0000000000
--- a/arm_compute/core/GLES_COMPUTE/kernels/GCWeightsReshapeKernel.h
+++ /dev/null
@@ -1,90 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_GCWEIGHTSRESHAPEKERNEL_H
-#define ARM_COMPUTE_GCWEIGHTSRESHAPEKERNEL_H
-
-#include "arm_compute/core/GLES_COMPUTE/IGCKernel.h"
-
-namespace arm_compute
-{
-/** GLES Compute kernel to perform reshaping on the weights used by convolution and locally connected layer
- *
- * Rearranges each 3-dimensional kernel to a single row leading to a matrix with linearized kernels.
- * In combination with the @ref GCIm2ColKernel can transform a convolution to a matrix multiplication.
- *
- * For example assuming a 3D weight kernel of 3x3 dimensions and depth of 2 we have:
- * @f[
- * \left( \begin{array}{ccc}
- * a000 & a001 & a002 \\
- * a010 & a011 & a012 \\
- * a020 & a021 & a022 \\
- * \end{array} \right)
- * \left( \begin{array}{ccc}
- * a100 & a101 & a102 \\
- * a110 & a111 & a112 \\
- * a120 & a121 & a122 \\
- * \end{array} \right)
- * \rightarrow
- * \left( \begin{array}{ccccccccc}
- * a000 & a001 & a002 & a010 & a011 & a012 & a020 & a021 & a022 & a100 & a101 & a102 & a110 & a111 & a112 & a120 & a121 & a122 \\
- * \end{array} \right)
- * @f]
- */
-class GCWeightsReshapeKernel : public IGCKernel
-{
-public:
-    /** Constructor.*/
-    GCWeightsReshapeKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    GCWeightsReshapeKernel(const GCWeightsReshapeKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    GCWeightsReshapeKernel &operator=(const GCWeightsReshapeKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    GCWeightsReshapeKernel(GCWeightsReshapeKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    GCWeightsReshapeKernel &operator=(GCWeightsReshapeKernel &&) = default;
-    /** Default destructor */
-    ~GCWeightsReshapeKernel() = default;
-
-    /** Set the input and output of the kernel.
-     *
-     * @param[in]  input  The input tensor to convert. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] if shared,
-     *                    and 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM,  batches] if unshared. Data types supported: F16, F32
-     * @param[in]  biases The shared biases tensor to append.  Bias is 1D tensor with dimensions [OFM] if shared and 2D tensor with
-     *                    dimensions [OFM, batches] if unshared. Data types supported: Same as @p input
-     *                    @warning Appending biases to weights reshaped matrix is not supported for quantized asymmetric types.
-     * @param[out] output The output tensor. Should be a 2D Tensor. Data types supported: Same as @p input
-     */
-    void configure(const IGCTensor *input, const IGCTensor *biases, IGCTensor *output);
-
-    // Inherited methods overridden:
-    void run(const Window &window) override;
-
-private:
-    const IGCTensor *_input;
-    const IGCTensor *_biases;
-    IGCTensor       *_output;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_GCWEIGHTSRESHAPEKERNEL_H */
-\ No newline at end of file
diff --git a/arm_compute/core/GPUTarget.h b/arm_compute/core/GPUTarget.h
index 4959ee5e8a..b107a52d9f 100644
--- a/arm_compute/core/GPUTarget.h
+++ b/arm_compute/core/GPUTarget.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,10 +21,10 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_GPUTARGET_H
-#define ARM_COMPUTE_GPUTARGET_H
+#ifndef ACL_ARM_COMPUTE_CORE_GPUTARGET_H
+#define ACL_ARM_COMPUTE_CORE_GPUTARGET_H
 
-#include "arm_compute/core/Helpers.h"
+#include "support/Traits.h"
 
 #include <string>
 
@@ -33,25 +33,38 @@ namespace arm_compute
 /** Available GPU Targets */
 enum class GPUTarget
 {
-    UNKNOWN       = 0x101,
-    GPU_ARCH_MASK = 0xF00,
-    MIDGARD       = 0x100,
-    BIFROST       = 0x200,
-    VALHALL       = 0x300,
-    T600          = 0x110,
-    T700          = 0x120,
-    T800          = 0x130,
-    G71           = 0x210,
-    G72           = 0x220,
-    G51           = 0x230,
-    G51BIG        = 0x231,
-    G51LIT        = 0x232,
-    G52           = 0x240,
-    G52LIT        = 0x241,
-    G76           = 0x250,
-    G77           = 0x310,
-    TBOX          = 0x320,
-    TODX          = 0x330,
+    UNKNOWN             = 0x101,
+    GPU_ARCH_MASK       = 0xF00,
+    GPU_GENERATION_MASK = 0x0F0,
+    MIDGARD             = 0x100,
+    BIFROST             = 0x200,
+    VALHALL             = 0x300,
+    FIFTHGEN            = 0x400,
+    T600                = 0x110,
+    T700                = 0x120,
+    T800                = 0x130,
+    G71                 = 0x210,
+    G72                 = 0x220,
+    G51                 = 0x221,
+    G51BIG              = 0x222,
+    G51LIT              = 0x223,
+    G31                 = 0x224,
+    G76                 = 0x230,
+    G52                 = 0x231,
+    G52LIT              = 0x232,
+    G77                 = 0x310,
+    G57                 = 0x311,
+    G78                 = 0x320,
+    G68                 = 0x321,
+    G78AE               = 0x330,
+    G710                = 0x340,
+    G610                = 0x341,
+    G510                = 0x342,
+    G310                = 0x343,
+    G715                = 0x350,
+    G615                = 0x351,
+    G720                = 0x410,
+    G620                = 0x411
 };
 
 /** Enable bitwise operations on GPUTarget enumerations */
@@ -104,4 +117,4 @@ inline bool gpu_target_is_in(GPUTarget target_to_check, GPUTarget target)
     return target_to_check == target;
 }
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_GPUTARGET_H */
+#endif // ACL_ARM_COMPUTE_CORE_GPUTARGET_H
diff --git a/arm_compute/core/HOGInfo.h b/arm_compute/core/HOGInfo.h
deleted file mode 100644
index 3cc472b274..0000000000
--- a/arm_compute/core/HOGInfo.h
+++ /dev/null
@@ -1,152 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_HOGINFO_H
-#define ARM_COMPUTE_HOGINFO_H
-
-#include "arm_compute/core/Size2D.h"
-#include "arm_compute/core/Types.h"
-
-#include <cstddef>
-
-namespace arm_compute
-{
-/** Store the HOG's metadata */
-class HOGInfo
-{
-public:
-    /** Default constructor */
-    HOGInfo();
-    /** Default destructor */
-    virtual ~HOGInfo() = default;
-    /** Allow instances of this class to be copy constructed */
-    HOGInfo(const HOGInfo &) = default;
-    /** Allow instances of this class to be copied */
-    HOGInfo &operator=(const HOGInfo &) = default;
-    /** Allow instances of this class to be move constructed */
-    HOGInfo(HOGInfo &&) = default;
-    /** Allow instances of this class to be moved */
-    HOGInfo &operator=(HOGInfo &&) = default;
-    /** Constructor
-     *
-     * @param[in] cell_size             Cell size in pixels
-     * @param[in] block_size            Block size in pixels. Must be a multiple of cell_size.
-     * @param[in] detection_window_size Detection window size in pixels. Must be a multiple of block_size and block_stride.
-     * @param[in] block_stride          Distance in pixels between 2 consecutive blocks along the x and y direction. Must be a multiple of cell size
-     * @param[in] num_bins              Number of histogram bins for each cell
-     * @param[in] normalization_type    (Optional) Normalization type to use for each block
-     * @param[in] l2_hyst_threshold     (Optional) Threshold used for L2HYS_NORM normalization method
-     * @param[in] phase_type            (Optional) Type of @ref PhaseType
-     */
-    HOGInfo(const Size2D &cell_size, const Size2D &block_size, const Size2D &detection_window_size, const Size2D &block_stride, size_t num_bins,
-            HOGNormType normalization_type = HOGNormType::L2HYS_NORM, float l2_hyst_threshold = 0.2f, PhaseType phase_type = PhaseType::UNSIGNED);
-    /** Initialize the metadata structure with the given parameters
-     *
-     * @param[in] cell_size             Cell size in pixels
-     * @param[in] block_size            Block size in pixels. Must be a multiple of cell_size.
-     * @param[in] detection_window_size Detection window size in pixels. Must be a multiple of block_size and block_stride.
-     * @param[in] block_stride          Distance in pixels between 2 consecutive blocks along the x and y direction. Must be a multiple of cell size
-     * @param[in] num_bins              Number of histogram bins for each cell
-     * @param[in] normalization_type    (Optional) Normalization type to use for each block
-     * @param[in] l2_hyst_threshold     (Optional) Threshold used for L2HYS_NORM normalization method
-     * @param[in] phase_type            (Optional) Type of @ref PhaseType
-     */
-    void init(const Size2D &cell_size, const Size2D &block_size, const Size2D &detection_window_size, const Size2D &block_stride, size_t num_bins,
-              HOGNormType normalization_type = HOGNormType::L2HYS_NORM, float l2_hyst_threshold = 0.2f, PhaseType phase_type = PhaseType::UNSIGNED);
-    /** The cell size in pixels
-     *
-     * @return The cell size in pixels
-     */
-    const Size2D &cell_size() const;
-    /** The block size in pixels
-     *
-     * @return The block size in pixels
-     */
-    const Size2D &block_size() const;
-    /** The detection window size in pixels
-     *
-     * @return The detection window size in pixels
-     */
-    const Size2D &detection_window_size() const;
-    /** The block stride in pixels. The block stride is the distance between 2 consecutive blocks
-     *
-     * @return The block stride in pixels
-     */
-    const Size2D &block_stride() const;
-    /** The number of histogram bins for each cell
-     *
-     * @return The number of histogram bins for each cell
-     */
-    size_t num_bins() const;
-    /** The normalization type
-     *
-     * @return The normalization type
-     */
-    HOGNormType normalization_type() const;
-    /** Threshold used for L2HYS_NORM normalization type
-     *
-     * @return Threshold used for L2HYS_NORM normalization type
-     */
-    float l2_hyst_threshold() const;
-    /** The type of @ref PhaseType
-     *
-     * @return The type of @ref PhaseType
-     */
-    PhaseType phase_type() const;
-    /** The size of HOG descriptor
-     *
-     * @return The size of HOG descriptor
-     */
-    size_t descriptor_size() const;
-    /** Calculates the number of cells for each block
-     *
-     * @return The Size2D data object which stores the number of cells along the x and y directions
-     */
-    Size2D num_cells_per_block() const;
-
-    /** Calculates the number of cells per block stride
-     *
-     * @return The Size2D data object which stores the number of cells per block stride along the x and y directions
-     */
-    Size2D num_cells_per_block_stride() const;
-    /** Calculates the number of block positions for the given image size
-     *
-     * @param[in] image_size The input image size data object
-     *
-     * @return The Size2D data object which stores the number of block positions along the x and y directions
-     */
-    Size2D num_block_positions_per_image(const Size2D &image_size) const;
-
-private:
-    Size2D      _cell_size;
-    Size2D      _block_size;
-    Size2D      _detection_window_size;
-    Size2D      _block_stride;
-    size_t      _num_bins;
-    HOGNormType _normalization_type;
-    float       _l2_hyst_threshold;
-    PhaseType   _phase_type;
-    size_t      _descriptor_size;
-};
-}
-#endif /*ARM_COMPUTE_HOGINFO_H */
diff --git a/arm_compute/core/Helpers.h b/arm_compute/core/Helpers.h
index 09c672ecfa..960201510a 100644
--- a/arm_compute/core/Helpers.h
+++ b/arm_compute/core/Helpers.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,23 +24,17 @@
 #ifndef ARM_COMPUTE_HELPERS_H
 #define ARM_COMPUTE_HELPERS_H
 
-#include "arm_compute/core/Coordinates.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/IAccessWindow.h"
-#include "arm_compute/core/Steps.h"
-#include "arm_compute/core/Strides.h"
-#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
-#include "support/MemorySupport.h"
 
 #include <array>
 #include <cstddef>
 #include <cstdint>
-#include <memory>
 #include <tuple>
-#include <type_traits>
-#include <utility>
 
 namespace arm_compute
 {
@@ -48,307 +42,6 @@ class IKernel;
 class ITensor;
 class ITensorInfo;
 
-/** Disable bitwise operations by default */
-template <typename T>
-struct enable_bitwise_ops
-{
-    static constexpr bool value = false; /**< Disabled */
-};
-
-#ifndef DOXYGEN_SKIP_THIS
-template <typename T>
-typename std::enable_if<enable_bitwise_ops<T>::value, T>::type operator&(T lhs, T rhs)
-{
-    using underlying_type = typename std::underlying_type<T>::type;
-    return static_cast<T>(static_cast<underlying_type>(lhs) & static_cast<underlying_type>(rhs));
-}
-#endif /* DOXYGEN_SKIP_THIS */
-
-/** Helper function to create and return a unique_ptr pointed to a CL/GLES kernel object
- *  It also calls the kernel's configuration.
- *
- * @param[in] args All the arguments that need pass to kernel's configuration.
- *
- * @return A unique pointer pointed to a CL/GLES kernel object
- */
-template <typename Kernel, typename... T>
-std::unique_ptr<Kernel> create_configure_kernel(T &&... args)
-{
-    std::unique_ptr<Kernel> k = arm_compute::support::cpp14::make_unique<Kernel>();
-    k->configure(std::forward<T>(args)...);
-    return k;
-}
-
-/** Helper function to create and return a unique_ptr pointed to a CL/GLES kernel object
- *
- * @return A unique pointer pointed to a Kernel kernel object
- */
-template <typename Kernel>
-std::unique_ptr<Kernel> create_kernel()
-{
-    std::unique_ptr<Kernel> k = arm_compute::support::cpp14::make_unique<Kernel>();
-    return k;
-}
-
-namespace traits
-{
-/** Check if a type T is contained in a tuple Tuple of types */
-template <typename T, typename Tuple>
-struct is_contained;
-
-template <typename T>
-struct is_contained<T, std::tuple<>> : std::false_type
-{
-};
-
-template <typename T, typename... Ts>
-struct is_contained<T, std::tuple<T, Ts...>> : std::true_type
-{
-};
-
-template <typename T, typename U, typename... Ts>
-struct is_contained<T, std::tuple<U, Ts...>> : is_contained<T, std::tuple<Ts...>>
-{
-};
-}
-
-/** Computes bilinear interpolation using the pointer to the top-left pixel and the pixel's distance between
- * the real coordinates and the smallest following integer coordinates. Input must be in single channel format.
- *
- * @param[in] pixel_ptr Pointer to the top-left pixel value of a single channel input.
- * @param[in] stride    Stride to access the bottom-left and bottom-right pixel values
- * @param[in] dx        Pixel's distance between the X real coordinate and the smallest X following integer
- * @param[in] dy        Pixel's distance between the Y real coordinate and the smallest Y following integer
- *
- * @note dx and dy must be in the range [0, 1.0]
- *
- * @return The bilinear interpolated pixel value
- */
-template <typename T>
-inline T delta_bilinear_c1(const T *pixel_ptr, size_t stride, float dx, float dy)
-{
-    ARM_COMPUTE_ERROR_ON(pixel_ptr == nullptr);
-
-    const float dx1 = 1.0f - dx;
-    const float dy1 = 1.0f - dy;
-
-    const T a00 = *pixel_ptr;
-    const T a01 = *(pixel_ptr + 1);
-    const T a10 = *(pixel_ptr + stride);
-    const T a11 = *(pixel_ptr + stride + 1);
-
-    const float w1 = dx1 * dy1;
-    const float w2 = dx * dy1;
-    const float w3 = dx1 * dy;
-    const float w4 = dx * dy;
-
-    return static_cast<T>(a00 * w1 + a01 * w2 + a10 * w3 + a11 * w4);
-}
-
-/** Computes bilinear interpolation for quantized input and output, using the pointer to the top-left pixel and the pixel's distance between
- * the real coordinates and the smallest following integer coordinates. Input must be QASYMM8 and in single channel format.
- *
- * @param[in] pixel_ptr Pointer to the top-left pixel value of a single channel input.
- * @param[in] stride    Stride to access the bottom-left and bottom-right pixel values
- * @param[in] dx        Pixel's distance between the X real coordinate and the smallest X following integer
- * @param[in] dy        Pixel's distance between the Y real coordinate and the smallest Y following integer
- * @param[in] iq_info   Input QuantizationInfo
- * @param[in] oq_info   Output QuantizationInfo
- *
- * @note dx and dy must be in the range [0, 1.0]
- *
- * @return The bilinear interpolated pixel value
- */
-inline uint8_t delta_bilinear_c1_quantized(const uint8_t *pixel_ptr, size_t stride, float dx, float dy, UniformQuantizationInfo iq_info, UniformQuantizationInfo oq_info)
-{
-    ARM_COMPUTE_ERROR_ON(pixel_ptr == nullptr);
-
-    const float dx1 = 1.0f - dx;
-    const float dy1 = 1.0f - dy;
-
-    const float a00 = dequantize_qasymm8(*pixel_ptr, iq_info);
-    const float a01 = dequantize_qasymm8(*(pixel_ptr + 1), iq_info);
-    const float a10 = dequantize_qasymm8(*(pixel_ptr + stride), iq_info);
-    const float a11 = dequantize_qasymm8(*(pixel_ptr + stride + 1), iq_info);
-
-    const float w1  = dx1 * dy1;
-    const float w2  = dx * dy1;
-    const float w3  = dx1 * dy;
-    const float w4  = dx * dy;
-    float       res = a00 * w1 + a01 * w2 + a10 * w3 + a11 * w4;
-    return static_cast<uint8_t>(quantize_qasymm8(res, oq_info));
-}
-
-/** Computes bilinear interpolation for quantized input and output, using the pointer to the top-left pixel and the pixel's distance between
- * the real coordinates and the smallest following integer coordinates. Input must be QASYMM8_SIGNED and in single channel format.
- *
- * @param[in] pixel_ptr Pointer to the top-left pixel value of a single channel input.
- * @param[in] stride    Stride to access the bottom-left and bottom-right pixel values
- * @param[in] dx        Pixel's distance between the X real coordinate and the smallest X following integer
- * @param[in] dy        Pixel's distance between the Y real coordinate and the smallest Y following integer
- * @param[in] iq_info   Input QuantizationInfo
- * @param[in] oq_info   Output QuantizationInfo
- *
- * @note dx and dy must be in the range [0, 1.0]
- *
- * @return The bilinear interpolated pixel value
- */
-inline int8_t delta_bilinear_c1_quantized(const int8_t *pixel_ptr, size_t stride, float dx, float dy, UniformQuantizationInfo iq_info, UniformQuantizationInfo oq_info)
-{
-    ARM_COMPUTE_ERROR_ON(pixel_ptr == nullptr);
-
-    const float dx1 = 1.0f - dx;
-    const float dy1 = 1.0f - dy;
-
-    const float a00 = dequantize_qasymm8_signed(*pixel_ptr, iq_info);
-    const float a01 = dequantize_qasymm8_signed(*(pixel_ptr + 1), iq_info);
-    const float a10 = dequantize_qasymm8_signed(*(pixel_ptr + stride), iq_info);
-    const float a11 = dequantize_qasymm8_signed(*(pixel_ptr + stride + 1), iq_info);
-
-    const float w1  = dx1 * dy1;
-    const float w2  = dx * dy1;
-    const float w3  = dx1 * dy;
-    const float w4  = dx * dy;
-    float       res = a00 * w1 + a01 * w2 + a10 * w3 + a11 * w4;
-    return static_cast<int8_t>(quantize_qasymm8_signed(res, oq_info));
-}
-
-/** Computes linear interpolation using the pointer to the top pixel and the pixel's distance between
- * the real coordinates and the smallest following integer coordinates. Input must be in single channel format.
- *
- * @param[in] pixel_ptr Pointer to the top pixel value of a single channel input.
- * @param[in] stride    Stride to access the bottom pixel value
- * @param[in] dy        Pixel's distance between the Y real coordinate and the smallest Y following integer
- *
- * @note dy must be in the range [0, 1.0]
- *
- * @return The linear interpolated pixel value
- */
-template <typename T>
-inline T delta_linear_c1_y(const T *pixel_ptr, size_t stride, float dy)
-{
-    ARM_COMPUTE_ERROR_ON(pixel_ptr == nullptr);
-
-    const float dy1 = 1.0f - dy;
-
-    const T a00 = *pixel_ptr;
-    const T a10 = *(pixel_ptr + stride);
-
-    const float w1 = dy1;
-    const float w3 = dy;
-
-    return static_cast<T>(a00 * w1 + a10 * w3);
-}
-/** Computes linear interpolation using the pointer to the left pixel and the pixel's distance between
- * the real coordinates and the smallest following integer coordinates. Input must be in single channel format.
- *
- * @param[in] pixel_ptr Pointer to the left pixel value of a single channel input.
- * @param[in] dx        Pixel's distance between the X real coordinate and the smallest X following integer
- *
- * @note dx must be in the range [0, 1.0]
- *
- * @return The linear interpolated pixel value
- */
-template <typename T>
-inline T delta_linear_c1_x(const T *pixel_ptr, float dx)
-{
-    ARM_COMPUTE_ERROR_ON(pixel_ptr == nullptr);
-
-    const T a00 = *pixel_ptr;
-    const T a01 = *(pixel_ptr + 1);
-
-    const float dx1 = 1.0f - dx;
-
-    const float w1 = dx1;
-    const float w2 = dx;
-
-    return static_cast<T>(a00 * w1 + a01 * w2);
-}
-/** Return the pixel at (x,y) using bilinear interpolation.
- *
- * @warning Only works if the iterator was created with an IImage
- *
- * @param[in] first_pixel_ptr Pointer to the first pixel of a single channel input.
- * @param[in] stride          Stride in bytes of the image;
- * @param[in] x               X position of the wanted pixel
- * @param[in] y               Y position of the wanted pixel
- *
- * @return The pixel at (x, y) using bilinear interpolation.
- */
-template <typename T>
-inline T pixel_bilinear_c1(const T *first_pixel_ptr, size_t stride, float x, float y)
-{
-    ARM_COMPUTE_ERROR_ON(first_pixel_ptr == nullptr);
-
-    const int32_t xi = std::floor(x);
-    const int32_t yi = std::floor(y);
-
-    const float dx = x - xi;
-    const float dy = y - yi;
-
-    return delta_bilinear_c1(first_pixel_ptr + xi + yi * stride, stride, dx, dy);
-}
-
-/** Return the pixel at (x,y) using bilinear interpolation by clamping when out of borders. The image must be single channel input
- *
- * @warning Only works if the iterator was created with an IImage
- *
- * @param[in] first_pixel_ptr Pointer to the first pixel of a single channel image.
- * @param[in] stride          Stride in bytes of the image
- * @param[in] width           Width of the image
- * @param[in] height          Height of the image
- * @param[in] x               X position of the wanted pixel
- * @param[in] y               Y position of the wanted pixel
- *
- * @return The pixel at (x, y) using bilinear interpolation.
- */
-template <typename T>
-inline uint8_t pixel_bilinear_c1_clamp(const T *first_pixel_ptr, size_t stride, size_t width, size_t height, float x, float y)
-{
-    ARM_COMPUTE_ERROR_ON(first_pixel_ptr == nullptr);
-
-    x = std::max(-1.f, std::min(x, static_cast<float>(width)));
-    y = std::max(-1.f, std::min(y, static_cast<float>(height)));
-
-    const float xi = std::floor(x);
-    const float yi = std::floor(y);
-
-    const float dx = x - xi;
-    const float dy = y - yi;
-
-    if(dx == 0.0f)
-    {
-        if(dy == 0.0f)
-        {
-            return static_cast<T>(first_pixel_ptr[static_cast<int32_t>(xi) + static_cast<int32_t>(yi) * stride]);
-        }
-        return delta_linear_c1_y(first_pixel_ptr + static_cast<int32_t>(xi) + static_cast<int32_t>(yi) * stride, stride, dy);
-    }
-    if(dy == 0.0f)
-    {
-        return delta_linear_c1_x(first_pixel_ptr + static_cast<int32_t>(xi) + static_cast<int32_t>(yi) * stride, dx);
-    }
-    return delta_bilinear_c1(first_pixel_ptr + static_cast<int32_t>(xi) + static_cast<int32_t>(yi) * stride, stride, dx, dy);
-}
-
-/** Return the pixel at (x,y) using area interpolation by clamping when out of borders. The image must be single channel U8
- *
- * @note The interpolation area depends on the width and height ration of the input and output images
- * @note Currently average of the contributing pixels is calculated
- *
- * @param[in] first_pixel_ptr Pointer to the first pixel of a single channel U8 image.
- * @param[in] stride          Stride in bytes of the image
- * @param[in] width           Width of the image
- * @param[in] height          Height of the image
- * @param[in] wr              Width ratio among the input image width and output image width.
- * @param[in] hr              Height ratio among the input image height and output image height.
- * @param[in] x               X position of the wanted pixel
- * @param[in] y               Y position of the wanted pixel
- *
- * @return The pixel at (x, y) using area interpolation.
- */
-inline uint8_t pixel_area_c1u8_clamp(const uint8_t *first_pixel_ptr, size_t stride, size_t width, size_t height, float wr, float hr, int x, int y);
-
 /** Iterator updated by @ref execute_window_loop for each window element */
 class Iterator
 {
@@ -362,6 +55,16 @@ public:
      */
     Iterator(const ITensor *tensor, const Window &window);
 
+    /** Create a container iterator for the tensor with the specified number of dimensions, strides, buffer pointer and window.
+     *
+     * @param[in] num_dims The number of dimensions.
+     * @param[in] strides  The strides in bytes.
+     * @param[in] buffer   The data buffer.
+     * @param[in] offset   The offset in bytes from the beginning of the buffer to the first element of the tensor.
+     * @param[in] window   The window which will be used to iterate over the tensor.
+     */
+    Iterator(size_t num_dims, const Strides &strides, uint8_t *buffer, size_t offset, const Window &window);
+
     /** Increment the iterator along the specified dimension of the step value associated to the dimension.
      *
      * @warning It is the caller's responsibility to call increment(dimension+1) when reaching the end of a dimension, the iterator will not check for overflow.
@@ -376,7 +79,7 @@ public:
      *
      * @return The current position of the iterator in bytes relative to the first element.
      */
-    constexpr int offset() const;
+    constexpr size_t offset() const;
 
     /** Return a pointer to the current pixel.
      *
@@ -393,18 +96,27 @@ public:
     void reset(size_t dimension);
 
 private:
+    /** Initialize a container iterator for the tensor with the specified number of dimensions, strides, buffer pointer and window.
+     *
+     * @param[in] num_dims The number of dimensions.
+     * @param[in] strides  The strides in bytes.
+     * @param[in] buffer   The data buffer.
+     * @param[in] offset   The offset in bytes from the beginning of the buffer to the first element of the tensor.
+     * @param[in] window   The window which will be used to iterate over the tensor.
+     */
+    void initialize(size_t num_dims, const Strides &strides, uint8_t *buffer, size_t offset, const Window &window);
+
     uint8_t *_ptr;
 
     class Dimension
     {
     public:
-        constexpr Dimension()
-            : _dim_start(0), _stride(0)
+        constexpr Dimension() : _dim_start(0), _stride(0)
         {
         }
 
-        int _dim_start;
-        int _stride;
+        size_t _dim_start;
+        size_t _stride;
     };
 
     std::array<Dimension, Coordinates::num_max_dimensions> _dims;
@@ -419,180 +131,7 @@ private:
  * @param[in,out] iterators       Tensor iterators which will be updated by this function before calling lambda_function.
  */
 template <typename L, typename... Ts>
-inline void execute_window_loop(const Window &w, L &&lambda_function, Ts &&... iterators);
-
-/** Update window and padding size for each of the access patterns.
- *
- * First the window size is reduced based on all access patterns that are not
- * allowed to modify the padding of the underlying tensor. Then the padding of
- * the remaining tensors is increased to match the window.
- *
- * @param[in] win      Window that is used by the kernel.
- * @param[in] patterns Access patterns used to calculate the final window and padding.
- *
- * @return True if the window has been changed. Changes to the padding do not
- *         influence the returned value.
- */
-template <typename... Ts>
-bool update_window_and_padding(Window &win, Ts &&... patterns)
-{
-    bool window_changed = false;
-
-    utility::for_each([&](const IAccessWindow & w)
-    {
-        window_changed |= w.update_window_if_needed(win);
-    },
-    patterns...);
-
-    bool padding_changed = false;
-
-    utility::for_each([&](IAccessWindow & w)
-    {
-        padding_changed |= w.update_padding_if_needed(win);
-    },
-    patterns...);
-
-    return window_changed;
-}
-
-/** Calculate the maximum window for a given tensor shape and border setting
- *
- * @param[in] valid_region Valid region object defining the shape of the tensor space for which the window is created.
- * @param[in] steps        (Optional) Number of elements processed for each step.
- * @param[in] skip_border  (Optional) If true exclude the border region from the window.
- * @param[in] border_size  (Optional) Border size.
- *
- * @return The maximum window the kernel can be executed on.
- */
-Window calculate_max_window(const ValidRegion &valid_region, const Steps &steps = Steps(), bool skip_border = false, BorderSize border_size = BorderSize());
-
-/** Calculate the maximum window for a given tensor shape and border setting
- *
- * @param[in] info        Tensor info object defining the shape of the object for which the window is created.
- * @param[in] steps       (Optional) Number of elements processed for each step.
- * @param[in] skip_border (Optional) If true exclude the border region from the window.
- * @param[in] border_size (Optional) Border size.
- *
- * @return The maximum window the kernel can be executed on.
- */
-inline Window calculate_max_window(const ITensorInfo &info, const Steps &steps = Steps(), bool skip_border = false, BorderSize border_size = BorderSize())
-{
-    return calculate_max_window(info.valid_region(), steps, skip_border, border_size);
-}
-
-/** Calculate the maximum window used by a horizontal kernel for a given tensor shape and border setting
- *
- * @param[in] valid_region Valid region object defining the shape of the tensor space for which the window is created.
- * @param[in] steps        (Optional) Number of elements processed for each step.
- * @param[in] skip_border  (Optional) If true exclude the border region from the window.
- * @param[in] border_size  (Optional) Border size. The border region will be excluded from the window.
- *
- * @return The maximum window the kernel can be executed on.
- */
-Window calculate_max_window_horizontal(const ValidRegion &valid_region, const Steps &steps = Steps(), bool skip_border = false, BorderSize border_size = BorderSize());
-
-/** Calculate the maximum window used by a horizontal kernel for a given tensor shape and border setting
- *
- * @param[in] info        Tensor info object defining the shape of the object for which the window is created.
- * @param[in] steps       (Optional) Number of elements processed for each step.
- * @param[in] skip_border (Optional) If true exclude the border region from the window.
- * @param[in] border_size (Optional) Border size.
- *
- * @return The maximum window the kernel can be executed on.
- */
-inline Window calculate_max_window_horizontal(const ITensorInfo &info, const Steps &steps = Steps(), bool skip_border = false, BorderSize border_size = BorderSize())
-{
-    return calculate_max_window_horizontal(info.valid_region(), steps, skip_border, border_size);
-}
-
-/** Calculate the maximum window for a given tensor shape and border setting. The window will also includes the border.
- *
- * @param[in] valid_region Valid region object defining the shape of the tensor space for which the window is created.
- * @param[in] steps        (Optional) Number of elements processed for each step.
- * @param[in] border_size  (Optional) Border size. The border region will be included in the window.
- *
- * @return The maximum window the kernel can be executed on.
- */
-Window calculate_max_enlarged_window(const ValidRegion &valid_region, const Steps &steps = Steps(), BorderSize border_size = BorderSize());
-
-/** Calculate the maximum window for a given tensor shape and border setting. The window will also includes the border.
- *
- * @param[in] info        Tensor info object defining the shape of the object for which the window is created.
- * @param[in] steps       (Optional) Number of elements processed for each step.
- * @param[in] border_size (Optional) Border size. The border region will be included in the window.
- *
- * @return The maximum window the kernel can be executed on.
- */
-inline Window calculate_max_enlarged_window(const ITensorInfo &info, const Steps &steps = Steps(), BorderSize border_size = BorderSize())
-{
-    return calculate_max_enlarged_window(info.valid_region(), steps, border_size);
-}
-
-/** Intersect multiple valid regions.
- *
- * @param[in] regions Valid regions.
- *
- * @return Intersection of all regions.
- */
-template <typename... Ts>
-ValidRegion intersect_valid_regions(const Ts &... regions)
-{
-    auto intersect = [](const ValidRegion & r1, const ValidRegion & r2) -> ValidRegion
-    {
-        ValidRegion region;
-
-        for(size_t d = 0; d < std::min(r1.anchor.num_dimensions(), r2.anchor.num_dimensions()); ++d)
-        {
-            region.anchor.set(d, std::max(r1.anchor[d], r2.anchor[d]));
-        }
-
-        for(size_t d = 0; d < std::min(r1.shape.num_dimensions(), r2.shape.num_dimensions()); ++d)
-        {
-            region.shape.set(d, std::min(r1.shape[d], r2.shape[d]));
-        }
-
-        return region;
-    };
-
-    return utility::foldl(intersect, regions...);
-}
-
-/** Create a strides object based on the provided strides and the tensor dimensions.
- *
- * @param[in] info          Tensor info object providing the shape of the tensor for unspecified strides.
- * @param[in] stride_x      Stride to be used in X dimension (in bytes).
- * @param[in] fixed_strides Strides to be used in higher dimensions starting at Y (in bytes).
- *
- * @return Strides object based on the specified strides. Missing strides are
- *         calculated based on the tensor shape and the strides of lower dimensions.
- */
-template <typename T, typename... Ts>
-inline Strides compute_strides(const ITensorInfo &info, T stride_x, Ts &&... fixed_strides)
-{
-    const TensorShape &shape = info.tensor_shape();
-
-    // Create strides object
-    Strides strides(stride_x, fixed_strides...);
-
-    for(size_t i = 1 + sizeof...(Ts); i < info.num_dimensions(); ++i)
-    {
-        strides.set(i, shape[i - 1] * strides[i - 1]);
-    }
-
-    return strides;
-}
-
-/** Create a strides object based on the tensor dimensions.
- *
- * @param[in] info Tensor info object used to compute the strides.
- *
- * @return Strides object based on element size and tensor shape.
- */
-template <typename... Ts>
-inline Strides compute_strides(const ITensorInfo &info)
-{
-    return compute_strides(info, info.element_size());
-}
+inline void execute_window_loop(const Window &w, L &&lambda_function, Ts &&...iterators);
 
 /** Permutes given Dimensions according to a permutation vector
  *
@@ -605,7 +144,7 @@ template <typename T>
 inline void permute(Dimensions<T> &dimensions, const PermutationVector &perm)
 {
     auto dimensions_copy = utility::make_array<Dimensions<T>::num_max_dimensions>(dimensions.begin(), dimensions.end());
-    for(unsigned int i = 0; i < perm.num_dimensions(); ++i)
+    for (unsigned int i = 0; i < perm.num_dimensions(); ++i)
     {
         T dimension_val = (perm[i] < dimensions.num_dimensions()) ? dimensions_copy[perm[i]] : 0;
         dimensions.set(i, dimension_val);
@@ -622,86 +161,13 @@ inline void permute(Dimensions<T> &dimensions, const PermutationVector &perm)
 inline void permute(TensorShape &shape, const PermutationVector &perm)
 {
     TensorShape shape_copy = shape;
-    for(unsigned int i = 0; i < perm.num_dimensions(); ++i)
+    for (unsigned int i = 0; i < perm.num_dimensions(); ++i)
     {
         size_t dimension_val = (perm[i] < shape.num_dimensions()) ? shape_copy[perm[i]] : 1;
-        shape.set(i, dimension_val, false); // Avoid changes in _num_dimension
+        shape.set(i, dimension_val, false, false); // Avoid changes in _num_dimension
     }
 }
 
-/** Auto initialize the tensor info (shape, number of channels and data type) if the current assignment is empty.
- *
- * @param[in,out] info              Tensor info used to check and assign.
- * @param[in]     shape             New shape.
- * @param[in]     num_channels      New number of channels.
- * @param[in]     data_type         New data type
- * @param[in]     quantization_info (Optional) New quantization info
- *
- * @return True if the tensor info has been initialized
- */
-bool auto_init_if_empty(ITensorInfo       &info,
-                        const TensorShape &shape,
-                        int num_channels, DataType data_type,
-                        QuantizationInfo quantization_info = QuantizationInfo());
-
-/** Auto initialize the tensor info using another tensor info.
- *
- * @param info_sink   Tensor info used to check and assign
- * @param info_source Tensor info used to assign
- *
- * @return True if the tensor info has been initialized
- */
-bool auto_init_if_empty(ITensorInfo &info_sink, const ITensorInfo &info_source);
-
-/** Set the shape to the specified value if the current assignment is empty.
- *
- * @param[in,out] info  Tensor info used to check and assign.
- * @param[in]     shape New shape.
- *
- * @return True if the shape has been changed.
- */
-bool set_shape_if_empty(ITensorInfo &info, const TensorShape &shape);
-
-/** Set the format, data type and number of channels to the specified value if
- * the current data type is unknown.
- *
- * @param[in,out] info   Tensor info used to check and assign.
- * @param[in]     format New format.
- *
- * @return True if the format has been changed.
- */
-bool set_format_if_unknown(ITensorInfo &info, Format format);
-
-/** Set the data type and number of channels to the specified value if
- * the current data type is unknown.
- *
- * @param[in,out] info      Tensor info used to check and assign.
- * @param[in]     data_type New data type.
- *
- * @return True if the data type has been changed.
- */
-bool set_data_type_if_unknown(ITensorInfo &info, DataType data_type);
-
-/** Set the data layout to the specified value if
- * the current data layout is unknown.
- *
- * @param[in,out] info        Tensor info used to check and assign.
- * @param[in]     data_layout New data layout.
- *
- * @return True if the data type has been changed.
- */
-bool set_data_layout_if_unknown(ITensorInfo &info, DataLayout data_layout);
-
-/** Set the quantization info to the specified value if
- * the current quantization info is empty and the data type of asymmetric quantized type
- *
- * @param[in,out] info              Tensor info used to check and assign.
- * @param[in]     quantization_info Quantization info
- *
- * @return True if the quantization info has been changed.
- */
-bool set_quantization_info_if_empty(ITensorInfo &info, QuantizationInfo quantization_info);
-
 /** Helper function to calculate the Valid Region for Scale.
  *
  * @param[in] src_info           Input tensor info used to check.
@@ -712,8 +178,11 @@ bool set_quantization_info_if_empty(ITensorInfo &info, QuantizationInfo quantiza
  *
  * @return The corresponding valid region
  */
-ValidRegion calculate_valid_region_scale(const ITensorInfo &src_info, const TensorShape &dst_shape,
-                                         InterpolationPolicy interpolate_policy, SamplingPolicy sampling_policy, bool border_undefined);
+ValidRegion calculate_valid_region_scale(const ITensorInfo  &src_info,
+                                         const TensorShape  &dst_shape,
+                                         InterpolationPolicy interpolate_policy,
+                                         SamplingPolicy      sampling_policy,
+                                         bool                border_undefined);
 
 /** Convert a linear index into n-dimensional coordinates.
  *
@@ -733,6 +202,22 @@ inline Coordinates index2coords(const TensorShape &shape, int index);
  */
 inline int coords2index(const TensorShape &shape, const Coordinates &coord);
 
+/** Returns a static map used to find an index or dimension based on a data layout
+  *
+  * *** Layouts ***
+  *
+  * *** 4D ***
+  * [N C H W]
+  * [3 2 1 0]
+  * [N H W C]
+  *
+  * *** 5D ***
+  * [N C D H W]
+  * [4 3 2 1 0]
+  * [N D H W C]
+  */
+const std::map<DataLayout, std::vector<DataLayoutDimension>> &get_layout_map();
+
 /** Get the index of the given dimension.
  *
  * @param[in] data_layout           The data layout.
@@ -740,7 +225,8 @@ inline int coords2index(const TensorShape &shape, const Coordinates &coord);
  *
  * @return The int conversion of the requested data layout index.
  */
-inline size_t get_data_layout_dimension_index(const DataLayout data_layout, const DataLayoutDimension data_layout_dimension);
+inline size_t get_data_layout_dimension_index(const DataLayout          &data_layout,
+                                              const DataLayoutDimension &data_layout_dimension);
 
 /** Get the DataLayoutDimension of a given index and layout.
  *
@@ -749,22 +235,7 @@ inline size_t get_data_layout_dimension_index(const DataLayout data_layout, cons
  *
  * @return The dimension which this index is requested for.
  */
-inline DataLayoutDimension get_index_data_layout_dimension(const DataLayout data_layout, const size_t index);
-
-/** Calculate the normalization dimension index for a given normalization type
- *
- * @param[in] layout Data layout of the input and output tensor
- * @param[in] info   Normalization info
- *
- * @return Normalization dimension index
- */
-inline unsigned int get_normalization_dimension_index(DataLayout layout, const NormalizationLayerInfo &info)
-{
-    const unsigned int width_idx   = get_data_layout_dimension_index(layout, DataLayoutDimension::WIDTH);
-    const unsigned int channel_idx = get_data_layout_dimension_index(layout, DataLayoutDimension::CHANNEL);
-
-    return info.is_in_map() ? width_idx : channel_idx;
-}
+inline DataLayoutDimension get_index_data_layout_dimension(const DataLayout &data_layout, const size_t index);
 
 /** Calculate the number of output tiles required by Winograd Convolution layer. This utility function can be used by the Winograd input transform
  *  to know the number of tiles on the x and y direction
@@ -776,10 +247,17 @@ inline unsigned int get_normalization_dimension_index(DataLayout layout, const N
  *
  * @return the number of output tiles along the x and y directions of size "output_tile_size"
  */
-inline Size2D compute_winograd_convolution_tiles(const Size2D &in_dims, const Size2D &kernel_size, const Size2D &output_tile_size, const PadStrideInfo &conv_info)
+inline Size2D compute_winograd_convolution_tiles(const Size2D        &in_dims,
+                                                 const Size2D        &kernel_size,
+                                                 const Size2D        &output_tile_size,
+                                                 const PadStrideInfo &conv_info)
 {
-    int num_tiles_x = std::ceil((in_dims.width - (kernel_size.width - 1) + conv_info.pad_left() + conv_info.pad_right()) / static_cast<float>(output_tile_size.width));
-    int num_tiles_y = std::ceil((in_dims.height - (kernel_size.height - 1) + conv_info.pad_top() + conv_info.pad_bottom()) / static_cast<float>(output_tile_size.height));
+    int num_tiles_x =
+        std::ceil((in_dims.width - (kernel_size.width - 1) + conv_info.pad_left() + conv_info.pad_right()) /
+                  static_cast<float>(output_tile_size.width));
+    int num_tiles_y =
+        std::ceil((in_dims.height - (kernel_size.height - 1) + conv_info.pad_top() + conv_info.pad_bottom()) /
+                  static_cast<float>(output_tile_size.height));
 
     // Clamp in case we provide paddings but we have 1D convolution
     num_tiles_x = std::min(num_tiles_x, static_cast<int>(in_dims.width));
@@ -808,40 +286,12 @@ inline T wrap_around(T x, T m)
  */
 inline Coordinates &convert_negative_axis(Coordinates &coords, int max_value)
 {
-    for(unsigned int i = 0; i < coords.num_dimensions(); ++i)
+    for (unsigned int i = 0; i < coords.num_dimensions(); ++i)
     {
         coords[i] = wrap_around(coords[i], max_value);
     }
     return coords;
 }
-
-/** Given an integer value, this function returns the next power of two
- *
- * @param[in] x Input value
- *
- * @return the next power of two
- */
-inline unsigned int get_next_power_two(unsigned int x)
-{
-    // Decrement by 1
-    x--;
-
-    // Shift right by 1
-    x |= x >> 1u;
-    // Shift right by 2
-    x |= x >> 2u;
-    // Shift right by 4
-    x |= x >> 4u;
-    // Shift right by 8
-    x |= x >> 8u;
-    // Shift right by 16
-    x |= x >> 16u;
-
-    // Increment by 1
-    x++;
-
-    return x;
-}
 } // namespace arm_compute
 
 #include "arm_compute/core/Helpers.inl"
diff --git a/arm_compute/core/Helpers.inl b/arm_compute/core/Helpers.inl
index 233d46bb86..60a21e9418 100644
--- a/arm_compute/core/Helpers.inl
+++ b/arm_compute/core/Helpers.inl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -22,68 +22,19 @@
  * SOFTWARE.
  */
 #include "arm_compute/core/Error.h"
-#include "arm_compute/core/Validate.h"
 
 #include <cmath>
 #include <numeric>
 
 namespace arm_compute
 {
-inline uint8_t pixel_area_c1u8_clamp(const uint8_t *first_pixel_ptr, size_t stride, size_t width, size_t height, float wr, float hr, int x, int y)
-{
-    ARM_COMPUTE_ERROR_ON(first_pixel_ptr == nullptr);
-
-    // Calculate sampling position
-    float in_x = (x + 0.5f) * wr - 0.5f;
-    float in_y = (y + 0.5f) * hr - 0.5f;
-
-    // Get bounding box offsets
-    int x_from = std::floor(x * wr - 0.5f - in_x);
-    int y_from = std::floor(y * hr - 0.5f - in_y);
-    int x_to   = std::ceil((x + 1) * wr - 0.5f - in_x);
-    int y_to   = std::ceil((y + 1) * hr - 0.5f - in_y);
-
-    // Clamp position to borders
-    in_x = std::max(-1.f, std::min(in_x, static_cast<float>(width)));
-    in_y = std::max(-1.f, std::min(in_y, static_cast<float>(height)));
-
-    // Clamp bounding box offsets to borders
-    x_from = ((in_x + x_from) < -1) ? -1 : x_from;
-    y_from = ((in_y + y_from) < -1) ? -1 : y_from;
-    x_to   = ((in_x + x_to) > width) ? (width - in_x) : x_to;
-    y_to   = ((in_y + y_to) > height) ? (height - in_y) : y_to;
-
-    // Get pixel index
-    const int xi = std::floor(in_x);
-    const int yi = std::floor(in_y);
-
-    // Bounding box elements in each dimension
-    const int x_elements = (x_to - x_from + 1);
-    const int y_elements = (y_to - y_from + 1);
-    ARM_COMPUTE_ERROR_ON(x_elements == 0 || y_elements == 0);
-
-    // Sum pixels in area
-    int sum = 0;
-    for(int j = yi + y_from, je = yi + y_to; j <= je; ++j)
-    {
-        const uint8_t *ptr = first_pixel_ptr + j * stride + xi + x_from;
-        sum                = std::accumulate(ptr, ptr + x_elements, sum);
-    }
-
-    // Return average
-    return sum / (x_elements * y_elements);
-}
-
 template <size_t dimension>
 struct IncrementIterators
 {
     template <typename T, typename... Ts>
-    static void unroll(T &&it, Ts &&... iterators)
+    static void unroll(T &&it, Ts &&...iterators)
     {
-        auto increment = [](T && it)
-        {
-            it.increment(dimension);
-        };
+        auto increment = [](T &&it) { it.increment(dimension); };
         utility::for_each(increment, std::forward<T>(it), std::forward<Ts>(iterators)...);
     }
     static void unroll()
@@ -96,14 +47,14 @@ template <size_t dim>
 struct ForEachDimension
 {
     template <typename L, typename... Ts>
-    static void unroll(const Window &w, Coordinates &id, L &&lambda_function, Ts &&... iterators)
+    static void unroll(const Window &w, Coordinates &id, L &&lambda_function, Ts &&...iterators)
     {
         const auto &d = w[dim - 1];
 
-        for(auto v = d.start(); v < d.end(); v += d.step(), IncrementIterators < dim - 1 >::unroll(iterators...))
+        for (auto v = d.start(); v < d.end(); v += d.step(), IncrementIterators<dim - 1>::unroll(iterators...))
         {
             id.set(dim - 1, v);
-            ForEachDimension < dim - 1 >::unroll(w, id, lambda_function, iterators...);
+            ForEachDimension<dim - 1>::unroll(w, id, lambda_function, iterators...);
         }
     }
 };
@@ -112,7 +63,7 @@ template <>
 struct ForEachDimension<0>
 {
     template <typename L, typename... Ts>
-    static void unroll(const Window &w, Coordinates &id, L &&lambda_function, Ts &&... iterators)
+    static void unroll(const Window &w, Coordinates &id, L &&lambda_function, Ts &&...iterators)
     {
         ARM_COMPUTE_UNUSED(w, iterators...);
         lambda_function(id);
@@ -120,49 +71,60 @@ struct ForEachDimension<0>
 };
 
 template <typename L, typename... Ts>
-inline void execute_window_loop(const Window &w, L &&lambda_function, Ts &&... iterators)
+inline void execute_window_loop(const Window &w, L &&lambda_function, Ts &&...iterators)
 {
     w.validate();
 
-    for(unsigned int i = 0; i < Coordinates::num_max_dimensions; ++i)
+    for (unsigned int i = 0; i < Coordinates::num_max_dimensions; ++i)
     {
         ARM_COMPUTE_ERROR_ON(w[i].step() == 0);
     }
 
     Coordinates id;
-    ForEachDimension<Coordinates::num_max_dimensions>::unroll(w, id, std::forward<L>(lambda_function), std::forward<Ts>(iterators)...);
+    ForEachDimension<Coordinates::num_max_dimensions>::unroll(w, id, std::forward<L>(lambda_function),
+                                                              std::forward<Ts>(iterators)...);
 }
 
-inline constexpr Iterator::Iterator()
-    : _ptr(nullptr), _dims()
+inline constexpr Iterator::Iterator() : _ptr(nullptr), _dims()
 {
 }
 
-inline Iterator::Iterator(const ITensor *tensor, const Window &win)
-    : Iterator()
+inline Iterator::Iterator(const ITensor *tensor, const Window &win) : Iterator()
 {
     ARM_COMPUTE_ERROR_ON(tensor == nullptr);
     ARM_COMPUTE_ERROR_ON(tensor->info() == nullptr);
 
-    const ITensorInfo *info    = tensor->info();
-    const Strides     &strides = info->strides_in_bytes();
+    initialize(tensor->info()->num_dimensions(), tensor->info()->strides_in_bytes(), tensor->buffer(),
+               tensor->info()->offset_first_element_in_bytes(), win);
+}
+
+inline Iterator::Iterator(size_t num_dims, const Strides &strides, uint8_t *buffer, size_t offset, const Window &win)
+    : Iterator()
+{
+    initialize(num_dims, strides, buffer, offset, win);
+}
+
+inline void
+Iterator::initialize(size_t num_dims, const Strides &strides, uint8_t *buffer, size_t offset, const Window &win)
+{
+    ARM_COMPUTE_ERROR_ON(buffer == nullptr);
 
-    _ptr = tensor->buffer() + info->offset_first_element_in_bytes();
+    _ptr = buffer + offset;
 
     //Initialize the stride for each dimension and calculate the position of the first element of the iteration:
-    for(unsigned int n = 0; n < info->num_dimensions(); ++n)
+    for (unsigned int n = 0; n < num_dims; ++n)
     {
         _dims[n]._stride = win[n].step() * strides[n];
-        std::get<0>(_dims)._dim_start += strides[n] * win[n].start();
+        std::get<0>(_dims)._dim_start += static_cast<size_t>(strides[n]) * win[n].start();
     }
 
     //Copy the starting point to all the dimensions:
-    for(unsigned int n = 1; n < Coordinates::num_max_dimensions; ++n)
+    for (unsigned int n = 1; n < Coordinates::num_max_dimensions; ++n)
     {
         _dims[n]._dim_start = std::get<0>(_dims)._dim_start;
     }
 
-    ARM_COMPUTE_ERROR_ON_WINDOW_DIMENSIONS_GTE(win, info->num_dimensions());
+    ARM_COMPUTE_ERROR_ON_WINDOW_DIMENSIONS_GTE(win, num_dims);
 }
 
 inline void Iterator::increment(const size_t dimension)
@@ -171,13 +133,13 @@ inline void Iterator::increment(const size_t dimension)
 
     _dims[dimension]._dim_start += _dims[dimension]._stride;
 
-    for(unsigned int n = 0; n < dimension; ++n)
+    for (unsigned int n = 0; n < dimension; ++n)
     {
         _dims[n]._dim_start = _dims[dimension]._dim_start;
     }
 }
 
-inline constexpr int Iterator::offset() const
+inline constexpr size_t Iterator::offset() const
 {
     return _dims.at(0)._dim_start;
 }
@@ -193,100 +155,12 @@ inline void Iterator::reset(const size_t dimension)
 
     _dims[dimension]._dim_start = _dims[dimension + 1]._dim_start;
 
-    for(unsigned int n = 0; n < dimension; ++n)
+    for (unsigned int n = 0; n < dimension; ++n)
     {
         _dims[n]._dim_start = _dims[dimension]._dim_start;
     }
 }
 
-inline bool auto_init_if_empty(ITensorInfo       &info,
-                               const TensorShape &shape,
-                               int                num_channels,
-                               DataType           data_type,
-                               QuantizationInfo   quantization_info)
-{
-    if(info.tensor_shape().total_size() == 0)
-    {
-        info.set_data_type(data_type);
-        info.set_num_channels(num_channels);
-        info.set_tensor_shape(shape);
-        info.set_quantization_info(quantization_info);
-        return true;
-    }
-
-    return false;
-}
-
-inline bool auto_init_if_empty(ITensorInfo &info_sink, const ITensorInfo &info_source)
-{
-    if(info_sink.tensor_shape().total_size() == 0)
-    {
-        info_sink.set_data_type(info_source.data_type());
-        info_sink.set_num_channels(info_source.num_channels());
-        info_sink.set_tensor_shape(info_source.tensor_shape());
-        info_sink.set_quantization_info(info_source.quantization_info());
-        info_sink.set_data_layout(info_source.data_layout());
-        return true;
-    }
-
-    return false;
-}
-
-inline bool set_shape_if_empty(ITensorInfo &info, const TensorShape &shape)
-{
-    if(info.tensor_shape().total_size() == 0)
-    {
-        info.set_tensor_shape(shape);
-        return true;
-    }
-
-    return false;
-}
-
-inline bool set_format_if_unknown(ITensorInfo &info, Format format)
-{
-    if(info.data_type() == DataType::UNKNOWN)
-    {
-        info.set_format(format);
-        return true;
-    }
-
-    return false;
-}
-
-inline bool set_data_type_if_unknown(ITensorInfo &info, DataType data_type)
-{
-    if(info.data_type() == DataType::UNKNOWN)
-    {
-        info.set_data_type(data_type);
-        return true;
-    }
-
-    return false;
-}
-
-inline bool set_data_layout_if_unknown(ITensorInfo &info, DataLayout data_layout)
-{
-    if(info.data_layout() == DataLayout::UNKNOWN)
-    {
-        info.set_data_layout(data_layout);
-        return true;
-    }
-
-    return false;
-}
-
-inline bool set_quantization_info_if_empty(ITensorInfo &info, QuantizationInfo quantization_info)
-{
-    if(info.quantization_info().empty() && (is_data_type_quantized_asymmetric(info.data_type())))
-    {
-        info.set_quantization_info(quantization_info);
-        return true;
-    }
-
-    return false;
-}
-
 inline Coordinates index2coords(const TensorShape &shape, int index)
 {
     int num_elements = shape.total_size();
@@ -294,9 +168,9 @@ inline Coordinates index2coords(const TensorShape &shape, int index)
     ARM_COMPUTE_ERROR_ON_MSG(index < 0 || index >= num_elements, "Index has to be in [0, num_elements]!");
     ARM_COMPUTE_ERROR_ON_MSG(num_elements == 0, "Cannot create coordinate from empty shape!");
 
-    Coordinates coord{ 0 };
+    Coordinates coord{0};
 
-    for(int d = shape.num_dimensions() - 1; d >= 0; --d)
+    for (int d = shape.num_dimensions() - 1; d >= 0; --d)
     {
         num_elements /= shape[d];
         coord.set(d, index / num_elements);
@@ -315,7 +189,7 @@ inline int coords2index(const TensorShape &shape, const Coordinates &coord)
     int index  = 0;
     int stride = 1;
 
-    for(unsigned int d = 0; d < coord.num_dimensions(); ++d)
+    for (unsigned int d = 0; d < coord.num_dimensions(); ++d)
     {
         index += coord[d] * stride;
         stride *= shape[d];
@@ -324,61 +198,23 @@ inline int coords2index(const TensorShape &shape, const Coordinates &coord)
     return index;
 }
 
-inline size_t get_data_layout_dimension_index(const DataLayout data_layout, const DataLayoutDimension data_layout_dimension)
+inline size_t get_data_layout_dimension_index(const DataLayout          &data_layout,
+                                              const DataLayoutDimension &data_layout_dimension)
 {
-    ARM_COMPUTE_ERROR_ON_MSG(data_layout == DataLayout::UNKNOWN, "Cannot retrieve the dimension index for an unknown layout!");
-
-    /* Return the index based on the data layout
-     * [N C H W]
-     * [3 2 1 0]
-     * [N H W C]
-    */
-    switch(data_layout_dimension)
-    {
-        case DataLayoutDimension::CHANNEL:
-            return (data_layout == DataLayout::NCHW) ? 2 : 0;
-            break;
-        case DataLayoutDimension::HEIGHT:
-            return (data_layout == DataLayout::NCHW) ? 1 : 2;
-            break;
-        case DataLayoutDimension::WIDTH:
-            return (data_layout == DataLayout::NCHW) ? 0 : 1;
-            break;
-        case DataLayoutDimension::BATCHES:
-            return 3;
-            break;
-        default:
-            break;
-    }
-    ARM_COMPUTE_ERROR("Data layout index not supported!");
+    ARM_COMPUTE_ERROR_ON_MSG(data_layout == DataLayout::UNKNOWN,
+                             "Cannot retrieve the dimension index for an unknown layout!");
+    const auto &dims = get_layout_map().at(data_layout);
+    const auto &it   = std::find(dims.cbegin(), dims.cend(), data_layout_dimension);
+    ARM_COMPUTE_ERROR_ON_MSG(it == dims.cend(), "Invalid dimension for the given layout.");
+    return it - dims.cbegin();
 }
 
-inline DataLayoutDimension get_index_data_layout_dimension(const DataLayout data_layout, const size_t index)
+inline DataLayoutDimension get_index_data_layout_dimension(const DataLayout &data_layout, const size_t index)
 {
-    ARM_COMPUTE_ERROR_ON_MSG(data_layout == DataLayout::UNKNOWN, "Cannot retrieve the dimension index for an unknown layout!");
-
-    /* Return the index based on the data layout
-    * [N C H W]
-    * [3 2 1 0]
-    * [N H W C]
-    */
-    switch(index)
-    {
-        case 0:
-            return (data_layout == DataLayout::NCHW) ? DataLayoutDimension::WIDTH : DataLayoutDimension::CHANNEL;
-            break;
-        case 1:
-            return (data_layout == DataLayout::NCHW) ? DataLayoutDimension::HEIGHT : DataLayoutDimension::WIDTH;
-            break;
-        case 2:
-            return (data_layout == DataLayout::NCHW) ? DataLayoutDimension::CHANNEL : DataLayoutDimension::HEIGHT;
-            break;
-        case 3:
-            return DataLayoutDimension::BATCHES;
-            break;
-        default:
-            ARM_COMPUTE_ERROR("Index value not supported!");
-            break;
-    }
+    ARM_COMPUTE_ERROR_ON_MSG(data_layout == DataLayout::UNKNOWN,
+                             "Cannot retrieve the layout dimension for an unknown layout!");
+    const auto &dims = get_layout_map().at(data_layout);
+    ARM_COMPUTE_ERROR_ON_MSG(index >= dims.size(), "Invalid index for the given layout.");
+    return dims[index];
 }
 } // namespace arm_compute
diff --git a/arm_compute/core/IAccessWindow.h b/arm_compute/core/IAccessWindow.h
index 227d1c4bb2..9c9fb90915 100644
--- a/arm_compute/core/IAccessWindow.h
+++ b/arm_compute/core/IAccessWindow.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -100,7 +100,10 @@ public:
      * @return a valid region.
      *
      */
-    virtual ValidRegion compute_valid_region(const Window &window, ValidRegion input_valid_region, bool border_undefined, BorderSize border_size) const = 0;
+    virtual ValidRegion compute_valid_region(const Window &window,
+                                             ValidRegion   input_valid_region,
+                                             bool          border_undefined,
+                                             BorderSize    border_size) const = 0;
 };
 
 /** Implementation of a rectangular access pattern. */
@@ -161,7 +164,10 @@ public:
      * @param[in] border_undefined   (Optional) Undefined borders are excluded from the valid region.
      * @param[in] border_size        (Optional) Size of the border around the XY-plane of the tensor.
      */
-    void set_valid_region(const Window &window, const ValidRegion &input_valid_region, bool border_undefined = false, const BorderSize &border_size = BorderSize(0));
+    void set_valid_region(const Window      &window,
+                          const ValidRegion &input_valid_region,
+                          bool               border_undefined = false,
+                          const BorderSize  &border_size      = BorderSize(0));
 
     /** Compute the valid region based on access pattern, valid region of the inputs and border mode.
      *
@@ -189,7 +195,10 @@ public:
      * @return a valid region.
      *
      */
-    ValidRegion compute_valid_region(const Window &window, ValidRegion input_valid_region, bool border_undefined, BorderSize border_size) const override;
+    ValidRegion compute_valid_region(const Window &window,
+                                     ValidRegion   input_valid_region,
+                                     bool          border_undefined,
+                                     BorderSize    border_size) const override;
 
     bool update_window_if_needed(Window &window) const override;
     bool update_padding_if_needed(const Window &window) override;
diff --git a/arm_compute/core/IArray.h b/arm_compute/core/IArray.h
index c6a1499698..3471fc9a86 100644
--- a/arm_compute/core/IArray.h
+++ b/arm_compute/core/IArray.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,30 +25,24 @@
 #define ARM_COMPUTE_IARRAY_H
 
 #include "arm_compute/core/Error.h"
+
 #include <cstddef>
 #include <cstdint>
 
 namespace arm_compute
 {
-struct KeyPoint;
-struct Coordinates2D;
-struct DetectionWindow;
-class Size2D;
-
 /** Array of type T */
 template <class T>
 class IArray
 {
 public:
     /** Default constructor */
-    IArray()
-        : _num_values(0), _max_size(0) {};
+    IArray() : _num_values(0), _max_size(0){};
     /** Constructor: initializes an array which can contain up to max_num_points values
      *
      * @param[in] max_num_values Maximum number of values the array will be able to stored
      */
-    IArray(size_t max_num_values)
-        : _num_values(0), _max_size(max_num_values)
+    IArray(size_t max_num_values) : _num_values(0), _max_size(max_num_values)
     {
     }
     /** Maximum number of values which can be stored in this array
@@ -78,7 +72,7 @@ public:
     bool push_back(const T &val)
     {
         ARM_COMPUTE_ERROR_ON(0 == _max_size);
-        if(_num_values >= max_num_values())
+        if (_num_values >= max_num_values())
         {
             _num_values = max_num_values() + 1;
             return false;
@@ -135,14 +129,6 @@ private:
     size_t _num_values;
     size_t _max_size;
 };
-/** Interface for Array of Key Points. */
-using IKeyPointArray = IArray<KeyPoint>;
-/** Interface for Array of 2D Coordinates. */
-using ICoordinates2DArray = IArray<Coordinates2D>;
-/** Interface for Array of Detection Windows. */
-using IDetectionWindowArray = IArray<DetectionWindow>;
-/** Interface for Array of 2D Sizes. */
-using ISize2DArray = IArray<Size2D>;
 /** Interface for Array of uint8s. */
 using IUInt8Array = IArray<uint8_t>;
 /** Interface for Array of uint16s. */
@@ -155,5 +141,5 @@ using IInt16Array = IArray<int16_t>;
 using IInt32Array = IArray<int32_t>;
 /** Interface for Array of floats. */
 using IFloatArray = IArray<float>;
-}
+} // namespace arm_compute
 #endif /* ARM_COMPUTE_IARRAY_H */
diff --git a/arm_compute/core/IDevice.h b/arm_compute/core/IDevice.h
index 5cffe646d4..12efa91e19 100644
--- a/arm_compute/core/IDevice.h
+++ b/arm_compute/core/IDevice.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020 ARM Limited.
+ * Copyright (c) 2020-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -34,7 +34,6 @@ enum class DeviceType
 {
     NEON,
     CL,
-    GLES
 };
 
 /** Interface for device object */
diff --git a/arm_compute/core/IDistribution.h b/arm_compute/core/IDistribution.h
deleted file mode 100644
index cd6f25fd47..0000000000
--- a/arm_compute/core/IDistribution.h
+++ /dev/null
@@ -1,59 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_IDISTRIBUTION_H
-#define ARM_COMPUTE_IDISTRIBUTION_H
-
-#include <cstddef>
-#include <cstdint>
-
-namespace arm_compute
-{
-/** Interface for distribution objects */
-class IDistribution
-{
-public:
-    /** Default virtual destructor */
-    virtual ~IDistribution() = default;
-    /** Returns the dimensions of the distribution.
-     *
-     * @note  This is fixed to 1-dimensional distribution for now.
-     * @return Dimensions of the distribution.
-     */
-    virtual size_t dimensions() const = 0;
-    /** Returns the total size in bytes of the distribution.
-     *
-     * @return Total size of the distribution in bytes.
-     */
-    virtual size_t size() const = 0;
-    /** Returns a pointer to the start of the distribution.
-     * Other elements of the array can be accessed using buffer()[idx] for 0 <= idx < num_bins()
-     *
-     * @return Pointer to the start of the distribution.
-     */
-    virtual uint32_t *buffer() const = 0;
-    /** Clears the distribution by setting every element to zero. */
-    void clear() const;
-};
-}
-#endif /* ARM_COMPUTE_IDISTRIBUTION_H */
diff --git a/arm_compute/core/IDistribution1D.h b/arm_compute/core/IDistribution1D.h
deleted file mode 100644
index 081ba580db..0000000000
--- a/arm_compute/core/IDistribution1D.h
+++ /dev/null
@@ -1,84 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_IDISTRIBUTION1D_H
-#define ARM_COMPUTE_IDISTRIBUTION1D_H
-
-#include "arm_compute/core/IDistribution.h"
-
-#include <cstddef>
-#include <cstdint>
-
-namespace arm_compute
-{
-/** 1D Distribution interface */
-class IDistribution1D : public IDistribution
-{
-public:
-    /** Constructor: Creates a 1D Distribution of a consecutive interval [offset, offset + range - 1]
-     *               defined by a start offset and valid range, divided equally into num_bins parts.
-     *
-     * @param[in] num_bins The number of bins the distribution is divided in.
-     * @param[in] offset   The start of the values to use.
-     * @param[in] range    The total number of the consecutive values of the distribution interval.
-     */
-    IDistribution1D(size_t num_bins, int32_t offset, uint32_t range);
-    /** Returns the number of bins that the distribution has.
-     *
-     * @return Number of bins of the distribution.
-     */
-    size_t num_bins() const;
-    /** Returns the offset of the distribution.
-     *
-     * @return Offset of the distribution.
-     */
-    int32_t offset() const;
-    /** Returns the range of the distribution.
-     *
-     * @return Range of the distribution.
-     */
-    uint32_t range() const;
-    /** Returns the window of the distribution, which is the range divided by the number of bins.
-     *
-     * @note If range is not divided by the number of bins then it is invalid.
-     *
-     * @return Window of the distribution.
-     */
-    uint32_t window() const;
-    /** Sets the range of the distribution.
-     *
-     * @param[in] range New range of the distribution to be set.
-     */
-    void set_range(uint32_t range);
-
-    // Inherited methods overridden:
-    size_t size() const override;
-    size_t dimensions() const override;
-
-private:
-    size_t   _num_bins; /**< Number of bins. */
-    int32_t  _offset;   /**< Offset, which indicate the start of the usable values. */
-    uint32_t _range;    /**< The total number of consecutive values of the distribution interval */
-};
-}
-#endif /* ARM_COMPUTE_IDISTRIBUTION1D_H */
diff --git a/arm_compute/core/IHOG.h b/arm_compute/core/IHOG.h
deleted file mode 100644
index bf8bd73087..0000000000
--- a/arm_compute/core/IHOG.h
+++ /dev/null
@@ -1,54 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_IHOG_H
-#define ARM_COMPUTE_IHOG_H
-
-#include "arm_compute/core/Types.h"
-
-#include <cstddef>
-
-namespace arm_compute
-{
-class HOGInfo;
-/** Interface for HOG data-object */
-class IHOG
-{
-public:
-    /** Interface to be implemented by the child class to return the HOG's metadata
-     *
-     * @return A pointer to the HOG's metadata.
-     */
-    virtual const HOGInfo *info() const = 0;
-    /** Default virtual destructor */
-    virtual ~IHOG() = default;
-    /** Pointer to the first element of the array which stores the linear SVM coefficients of HOG descriptor
-     *
-     * @note Other elements of the array can be accessed using descriptor()[idx] for idx=[0, descriptor_size() - 1]
-     *
-     * @return A pointer to the first element of the array which stores the linear SVM coefficients of HOG descriptor
-     */
-    virtual float *descriptor() const = 0;
-};
-}
-#endif /* ARM_COMPUTE_IHOG_H */
diff --git a/arm_compute/core/IKernel.h b/arm_compute/core/IKernel.h
index cb1ddb1d7f..403a2c724e 100644
--- a/arm_compute/core/IKernel.h
+++ b/arm_compute/core/IKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -57,6 +57,11 @@ public:
      * @return The maximum window the kernel can be executed on.
      */
     const Window &window() const;
+    /** Function to check if the embedded window of this kernel has been configured
+     *
+     * @return True if the window has been configured
+     */
+    bool is_window_configured() const;
 
 protected:
     /** Configure the kernel's window
@@ -68,5 +73,5 @@ protected:
 private:
     Window _window;
 };
-}
+} // namespace arm_compute
 #endif /*ARM_COMPUTE_IKERNEL_H */
diff --git a/arm_compute/core/ILut.h b/arm_compute/core/ILut.h
deleted file mode 100644
index d1a03af969..0000000000
--- a/arm_compute/core/ILut.h
+++ /dev/null
@@ -1,69 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_ILUT_H
-#define ARM_COMPUTE_ILUT_H
-
-#include "arm_compute/core/Types.h"
-
-#include <cstddef>
-
-namespace arm_compute
-{
-/** Lookup Table object interface. */
-class ILut
-{
-public:
-    /** Default virtual destructor */
-    virtual ~ILut() = default;
-    /** Returns the total number of elements in the LUT.
-     *
-     * @return Total number of elements.
-     */
-    virtual size_t num_elements() const = 0;
-    /** Indicates the offset that needs to be applied to the raw index before performing a lookup in the LUT.
-     *
-     * @return The normalization offset.
-     */
-    virtual uint32_t index_offset() const = 0;
-    /** Returns the total size in bytes of the LUT.
-     *
-     * @return Total size of the LUT in bytes.
-     */
-    virtual size_t size_in_bytes() const = 0;
-    /** Returns the type of the LUT.
-     *
-     * @return The type of the LUT.
-     */
-    virtual DataType type() const = 0;
-    /** Returns a pointer to the start of the LUT.
-     * Other elements of the LUT can be accessed using buffer()[idx] for 0 <= idx < num_elements().
-     *
-     * @return Pointer to the start of the lut.
-     */
-    virtual uint8_t *buffer() const = 0;
-    /** Clears the LUT by setting every element to zero. */
-    virtual void clear() = 0;
-};
-}
-#endif /* ARM_COMPUTE_ILUT_H */
diff --git a/arm_compute/core/IMultiHOG.h b/arm_compute/core/IMultiHOG.h
deleted file mode 100644
index ab79fac154..0000000000
--- a/arm_compute/core/IMultiHOG.h
+++ /dev/null
@@ -1,61 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_IMULTIHOG_H
-#define ARM_COMPUTE_IMULTIHOG_H
-
-#include "arm_compute/core/IHOG.h"
-
-#include <cstddef>
-
-namespace arm_compute
-{
-/** Interface for storing multiple HOG data-objects */
-class IMultiHOG
-{
-public:
-    /** Default destructor */
-    virtual ~IMultiHOG() = default;
-    /** The number of HOG models stored
-     *
-     * @return The number of HOG models stored
-     */
-    virtual size_t num_models() const = 0;
-    /** Return a pointer to the requested HOG model
-     *
-     * @param[in] index The index of the wanted HOG model.
-     *
-     *  @return A pointer pointed to the HOG model
-     */
-    virtual IHOG *model(size_t index) = 0;
-    /** Return a const pointer to the requested HOG model
-     *
-     * @param[in] index The index of the wanted HOG model.
-     *
-     *  @return A const pointer pointed to the HOG model
-     */
-    virtual const IHOG *model(size_t index) const = 0;
-};
-}
-
-#endif /* ARM_COMPUTE_IMULTIHOG_H */
diff --git a/arm_compute/core/IMultiImage.h b/arm_compute/core/IMultiImage.h
deleted file mode 100644
index 3abdfed8a8..0000000000
--- a/arm_compute/core/IMultiImage.h
+++ /dev/null
@@ -1,60 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_IMULTIIMAGE_H
-#define ARM_COMPUTE_IMULTIIMAGE_H
-
-namespace arm_compute
-{
-class ITensor;
-using IImage = ITensor;
-class MultiImageInfo;
-
-/** Interface for multi-planar images */
-class IMultiImage
-{
-public:
-    /** Destructor */
-    virtual ~IMultiImage() = default;
-    /** Interface to be implemented by the child class to return the multi-planar image's metadata
-     *
-     * @return A pointer to the image's metadata.
-     */
-    virtual const MultiImageInfo *info() const = 0;
-    /** Return a pointer to the requested plane of the image.
-     *
-     * @param[in] index The index of the wanted planed.
-     *
-     *  @return A pointer pointed to the plane
-     */
-    virtual IImage *plane(unsigned int index) = 0;
-    /** Return a constant pointer to the requested plane of the image.
-     *
-     * @param[in] index The index of the wanted planed.
-     *
-     *  @return A constant pointer pointed to the plane
-     */
-    virtual const IImage *plane(unsigned int index) const = 0;
-};
-}
-#endif /*ARM_COMPUTE_IMULTIIMAGE_H */
diff --git a/arm_compute/core/IPyramid.h b/arm_compute/core/IPyramid.h
deleted file mode 100644
index b2a74656b6..0000000000
--- a/arm_compute/core/IPyramid.h
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_IPYRAMID_H
-#define ARM_COMPUTE_IPYRAMID_H
-
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/PyramidInfo.h"
-#include "arm_compute/core/Types.h"
-
-#include <cstddef>
-
-namespace arm_compute
-{
-/** Interface for pyramid data-object */
-class IPyramid
-{
-public:
-    /** Default virtual destructor */
-    virtual ~IPyramid() = default;
-    /** Interface to be implemented by the child class to return the Pyramid's metadata
-     *
-     * @return A pointer to the Pyramid's metadata.
-     */
-    virtual const PyramidInfo *info() const = 0;
-    /** Retrieves a level of the pyramid as a ITensor pointer
-     *
-     * @param[in] index The index of the level, such that index is less than levels.
-     *
-     *  @return An ITensor pointer
-     */
-    virtual ITensor *get_pyramid_level(size_t index) const = 0;
-};
-}
-
-#endif /* ARM_COMPUTE_IPYRAMID_H */
diff --git a/arm_compute/core/ITensor.h b/arm_compute/core/ITensor.h
index 501279eb25..aad8313261 100644
--- a/arm_compute/core/ITensor.h
+++ b/arm_compute/core/ITensor.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -32,7 +32,7 @@ namespace arm_compute
 {
 class Coordinates;
 
-/** Interface for NEON tensor */
+/** Interface for CPU tensor */
 class ITensor
 {
 public:
@@ -90,11 +90,13 @@ public:
     bool is_used() const;
     /** Marks a tensor as unused */
     void mark_as_unused() const;
+    /** Marks a tensor as used */
+    void mark_as_used() const;
 
 private:
-    mutable bool _is_used = { true }; /**< Flag that marks if the tensor is used or not */
+    mutable bool _is_used = {true}; /**< Flag that marks if the tensor is used or not */
 };
 
 using IImage = ITensor;
-}
+} // namespace arm_compute
 #endif /*ARM_COMPUTE_ITENSOR_H */
diff --git a/arm_compute/core/ITensorInfo.h b/arm_compute/core/ITensorInfo.h
index f2b4c155aa..c42f4b57a1 100644
--- a/arm_compute/core/ITensorInfo.h
+++ b/arm_compute/core/ITensorInfo.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -28,18 +28,46 @@
 #include "arm_compute/core/Strides.h"
 #include "arm_compute/core/TensorShape.h"
 #include "arm_compute/core/Types.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/utils/misc/ICloneable.h"
 #include "arm_compute/core/utils/misc/Utility.h"
 
+#include "support/ICloneable.h"
+
 #include <cstddef>
 
 namespace arm_compute
 {
+class QuantizationInfo;
+// Note: Any changes to the fields of the class below that have setters should be mirrored
+// (if possible) in the auto_init_if_empty function in AutoConfiguration.h
+
 /** Store the tensor's metadata */
 class ITensorInfo : public misc::ICloneable<ITensorInfo>
 {
 public:
+    using TensorDimsState = std::vector<int>;
+    /** An id that uniquely identifies an ITensorInfo within some domain (e.g. a workload)
+     */
+    using Id = int32_t;
+    /** An invalid tensor id within a domain */
+    static constexpr Id invalid_tensor_id = 0;
+    /** Get the value representing dynamic dimension state
+     *
+     * @return Value representing dynamic dimension state
+     *
+     */
+    static constexpr int32_t get_dynamic_state_value()
+    {
+        return _dynamic_dimension;
+    }
+    /** Get the value representing static dimension state
+     *
+     * @return Value representing static dimension state
+     *
+     */
+    static constexpr int32_t get_static_state_value()
+    {
+        return _static_dimension;
+    }
     /** Default virtual destructor */
     virtual ~ITensorInfo() = default;
     /** Set the data type to the specified value.
@@ -81,6 +109,17 @@ public:
      * @return Reference to this ITensorInfo object
      */
     virtual ITensorInfo &set_tensor_shape(const TensorShape &shape) = 0;
+    /** Set the state for each dimension of the tensor
+     *
+     * This sets the state of each dimension of the shape in terms of dynamic behavior using -1 where appropriate.
+     * The index in the state is a 1 to 1 mapping with the shape dimension index.
+     * For example if you want to express [?, 3, 3] as a dynamic input then [-1, 3, 3] has to be set as a state
+     *
+     * @param[in] state Tensor dimensions state
+     *
+     * @return Reference to this ITensorInfo object
+     */
+    virtual ITensorInfo &set_tensor_dims_state(const TensorDimsState &state) = 0;
     /** Set the quantization settings (scale and offset) of the tensor.
      *
      * @param[in] quantization_info QuantizationInfo containing the scale and offset
@@ -107,6 +146,17 @@ public:
      * @return True if the strides or the offset to the first element have changed.
      */
     virtual bool auto_padding() = 0;
+    /** Set the lock paddings flag of the tensor.
+     * It should be set to true when the tensor could be mapped to a camera or frame buffer.
+     *
+     * @return Reference to this ITensorInfo object
+     */
+    virtual ITensorInfo &set_lock_paddings(bool flag) = 0;
+    /** Get the lock paddings flag value
+     *
+     * @return lock paddings flag value
+     */
+    virtual bool lock_paddings() const = 0;
     /** Update the offset to the first element, the strides and the total size.
      *
      * @note This function can only increase the offset, strides and total size.
@@ -170,6 +220,11 @@ public:
      * @return A vector with the size for each dimension of the tensor
      */
     virtual const TensorShape &tensor_shape() const = 0;
+    /** State of each dimension of the tensor shape
+     *
+     * @return A vector with the state for each dimension of the tensor, where -1 specifies dynamic dimension
+     */
+    virtual const TensorDimsState &tensor_dims_state() const = 0;
     /** Data type used for each element of the tensor
      *
      * @return Tensor data type
@@ -205,6 +260,11 @@ public:
      * @return True if its dynamic else false
      */
     virtual bool is_dynamic() const = 0;
+    /** Flag indicating whether the values of the tensor are constant, meaning that they do not change on kernel/function execution.
+     *
+     * @return True if values are constant else false
+     */
+    virtual bool are_values_constant() const = 0;
     /** Set the flag whether the tensor size can be changed.
      *
      * @param[in] is_resizable Flag that marks the tensor if it can be changed or not.
@@ -212,13 +272,13 @@ public:
      * @return Reference to this ITensorInfo object
      */
     virtual ITensorInfo &set_is_resizable(bool is_resizable) = 0;
-    /** Set the flag whether the tensor size is dynamic.
+    /** Set the flag marking whether the tensor values are constant, i.e. do not change during kernel/function execution.
      *
-     * @param[in] is_dynamic Flag that marks the tensor if it's dynamic.
+     * @param[in] are_values_constant Flag that marks whether the tensor values are constant (true) or may change (false).
      *
      * @return Reference to this ITensorInfo object
      */
-    virtual ITensorInfo &set_is_dynamic(bool is_dynamic) = 0;
+    virtual ITensorInfo &set_are_values_constant(bool are_values_constant) = 0;
     /** Valid region of the tensor. All elements in the valid region have defined values, i.e. are not undefined.
      *
      * @return The valid region.
@@ -240,7 +300,20 @@ public:
     * @return A DataLayout containing the layout data information.
     */
     virtual DataLayout data_layout() const = 0;
-
+    /** Get the workload tensor id of the tensor.
+    *
+    * @return Workload tensor id of the tensor
+    */
+    virtual Id id() const = 0;
+    /** Set the tensor id
+    */
+    virtual ITensorInfo &set_id(ITensorInfo::Id id) = 0;
+    /** Check if the tensor id is valid
+     */
+    bool has_valid_id() const
+    {
+        return id() != invalid_tensor_id;
+    }
     /** If infos are broadcast compatible tensor info's, return the broadcasted shape and the intersection of
      * the broadcasted valid regions of the tensors.
      *
@@ -256,23 +329,23 @@ public:
      * not broadcast compatible.
      */
     template <typename... Infos>
-    static std::pair<TensorShape, ValidRegion> broadcast_shape_and_valid_region(const Infos &... infos)
+    static std::pair<TensorShape, ValidRegion> broadcast_shape_and_valid_region(const Infos &...infos)
     {
         TensorShape bc_shape = TensorShape::broadcast_shape(infos.tensor_shape()...);
-        ValidRegion bc_valid_region{ Coordinates(), bc_shape };
+        ValidRegion bc_valid_region{Coordinates(), bc_shape};
 
-        auto broadcast_valid_region = [&bc_valid_region](const ITensorInfo & info)
+        auto broadcast_valid_region = [&bc_valid_region](const ITensorInfo &info)
         {
-            if(info.num_dimensions() != 0)
+            if (info.num_dimensions() != 0)
             {
-                for(size_t d = 0; d < bc_valid_region.shape.num_dimensions(); ++d)
+                for (size_t d = 0; d < bc_valid_region.shape.num_dimensions(); ++d)
                 {
                     const bool is_broadcast = (info.tensor_shape()[d] == 1);
 
                     const int    anchor_max = std::max(bc_valid_region.anchor[d], info.valid_region().anchor[d]);
                     const size_t valid_min  = std::min(bc_valid_region.shape[d], info.valid_region().shape[d]);
 
-                    if(!is_broadcast || (valid_min == 0))
+                    if (!is_broadcast || (valid_min == 0))
                     {
                         bc_valid_region.anchor.set(d, anchor_max);
                         bc_valid_region.shape.set(d, valid_min);
@@ -285,6 +358,10 @@ public:
 
         return std::pair<TensorShape, ValidRegion>(bc_shape, bc_valid_region);
     }
+
+private:
+    static constexpr int32_t _dynamic_dimension = -1;
+    static constexpr int32_t _static_dimension  = 0;
 };
 } // namespace arm_compute
 #endif /*ARM_COMPUTE_TENSORINFO_H */
diff --git a/arm_compute/core/ITensorPack.h b/arm_compute/core/ITensorPack.h
new file mode 100644
index 0000000000..f456c50769
--- /dev/null
+++ b/arm_compute/core/ITensorPack.h
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2020-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_ITENSORPACK_H
+#define ARM_COMPUTE_ITENSORPACK_H
+
+#include "arm_compute/core/experimental/Types.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <unordered_map>
+
+namespace arm_compute
+{
+// Forward declaration
+class ITensor;
+
+/** Tensor packing service */
+class ITensorPack
+{
+public:
+    struct PackElement
+    {
+        PackElement() = default;
+        PackElement(int id, ITensor *tensor) : id(id), tensor(tensor), ctensor(nullptr)
+        {
+        }
+        PackElement(int id, const ITensor *ctensor) : id(id), tensor(nullptr), ctensor(ctensor)
+        {
+        }
+
+        int            id{-1};
+        ITensor       *tensor{nullptr};
+        const ITensor *ctensor{nullptr};
+    };
+
+public:
+    /** Default Constructor */
+    ITensorPack() = default;
+    /** Initializer list constructor */
+    ITensorPack(std::initializer_list<PackElement> l);
+    /** Add tensor to the pack
+     *
+     * @param[in] id     ID/type of the tensor to add
+     * @param[in] tensor Tensor to add
+     */
+    void add_tensor(int id, ITensor *tensor);
+
+    /** Add const tensor to the pack
+     *
+     * @param[in] id     ID/type of the tensor to add
+     * @param[in] tensor Tensor to add
+     */
+    void add_tensor(int id, const ITensor *tensor);
+
+    /** Add const tensor to the pack
+     *
+     * @param[in] id     ID/type of the tensor to add
+     * @param[in] tensor Tensor to add
+     */
+    void add_const_tensor(int id, const ITensor *tensor);
+    /** Get tensor of a given id from the pack
+     *
+     * @param[in] id ID of tensor to extract
+     *
+     * @return The pointer to the tensor if it exists and is non-const, else nullptr
+     */
+    ITensor *get_tensor(int id);
+    /** Get constant tensor of a given id
+     *
+     * @param[in] id ID of tensor to extract
+     *
+     * @return The pointer to the tensor if it exists and is const, else nullptr
+     */
+    const ITensor *get_const_tensor(int id) const;
+    /** Remove the tensor stored with the given id
+     *
+     * @param[in] id ID of tensor to remove
+     */
+    void remove_tensor(int id);
+    /** Pack size accessor
+     *
+     * @return Number of tensors registered to the pack
+     */
+    size_t size() const;
+    /** Checks if pack is empty
+     *
+     * @return True if empty else false
+     */
+    bool empty() const;
+
+private:
+    std::unordered_map<int, PackElement> _pack{}; /**< Container with the packed tensors */
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_ITENSORPACK_H */
diff --git a/arm_compute/core/KernelDescriptors.h b/arm_compute/core/KernelDescriptors.h
index de08288dec..168a06a55c 100644
--- a/arm_compute/core/KernelDescriptors.h
+++ b/arm_compute/core/KernelDescriptors.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,113 +21,139 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_CORE_KERNEL_DESCRIPTORS_H
-#define ARM_COMPUTE_CORE_KERNEL_DESCRIPTORS_H
+#ifndef ACL_ARM_COMPUTE_CORE_KERNELDESCRIPTORS_H
+#define ACL_ARM_COMPUTE_CORE_KERNELDESCRIPTORS_H
 
 #include "arm_compute/core/PixelValue.h"
 #include "arm_compute/core/Types.h"
+#include "arm_compute/function_info/ActivationLayerInfo.h"
 
 namespace arm_compute
 {
 /** Descriptor for FFT scale kernels */
 struct FFTScaleKernelInfo
 {
-    float scale{ 0.f };      /**< Axis to perform the kernel on. */
-    bool  conjugate{ true }; /**< Flag to conjugate the output/ */
+    float scale{0.f};      /**< Scale factor to apply. */
+    bool  conjugate{true}; /**< Flag to conjugate the output. */
 };
 
 /** Descriptor for FFT digit reverse kernels */
 struct FFTDigitReverseKernelInfo
 {
-    unsigned int axis{ 0 };          /**< Axis to perform the kernel on. */
-    bool         conjugate{ false }; /**< Flag to conjugate the output/ */
+    unsigned int axis{0};          /**< Axis to perform the kernel on. */
+    bool         conjugate{false}; /**< Flag to conjugate the output. */
 };
 
 /** Descriptor used by the FFT core kernels */
 struct FFTRadixStageKernelInfo
 {
-    unsigned int axis{ 0 };               /**< Axis to run the kernel on. */
-    unsigned int radix{ 0 };              /**< Radix to use. */
-    unsigned int Nx{ 0 };                 /**< Nx coefficient. */
-    bool         is_first_stage{ false }; /**< Flags if the FFT kernels is the first stage of a decomposed FFT. */
+    unsigned int axis{0};               /**< Axis to run the kernel on. */
+    unsigned int radix{0};              /**< Radix to use. */
+    unsigned int Nx{0};                 /**< Nx coefficient. */
+    bool         is_first_stage{false}; /**< Flags if the FFT kernel is the first stage of a decomposed FFT. */
 };
 
+class ITensorInfo;
 /** Descriptor used by the GEMM kernels */
 struct GEMMKernelInfo
 {
     GEMMKernelInfo() = default;
-    GEMMKernelInfo(
-        unsigned int        im,
-        unsigned int        in,
-        unsigned int        ik,
-        unsigned int        idepth_output_gemm3d,
-        bool                ireinterpret_input_as_3d,
-        bool                ibroadcast_bias,
-        bool                ifp_mixed_precision,
-        ActivationLayerInfo iactivation_info,
-        int                 inmult_transpose1xW_width,
-        int                 imult_interleave4x4_height,
-        GEMMLHSMatrixInfo   ilhs_info,
-        GEMMRHSMatrixInfo   irhs_info,
-        int32_t             ina_offset,
-        int32_t             inb_offset)
-        : m(im), n(in), k(ik), depth_output_gemm3d(idepth_output_gemm3d), reinterpret_input_as_3d(ireinterpret_input_as_3d), broadcast_bias(ibroadcast_bias), fp_mixed_precision(ifp_mixed_precision),
-          activation_info(iactivation_info), mult_transpose1xW_width(inmult_transpose1xW_width), mult_interleave4x4_height(imult_interleave4x4_height), lhs_info(ilhs_info), rhs_info(irhs_info),
-          a_offset(ina_offset), b_offset(inb_offset)
+    GEMMKernelInfo(unsigned int        im,
+                   unsigned int        in,
+                   unsigned int        ik,
+                   unsigned int        idepth_output_gemm3d,
+                   bool                ireinterpret_input_as_3d,
+                   bool                ibroadcast_bias,
+                   bool                ifp_mixed_precision,
+                   bool                ihas_pad_y,
+                   ActivationLayerInfo iactivation_info,
+                   int                 inmult_transpose1xW_width,
+                   int                 imult_interleave4x4_height,
+                   GEMMLHSMatrixInfo   ilhs_info,
+                   GEMMRHSMatrixInfo   irhs_info,
+                   int32_t             ina_offset,
+                   int32_t             inb_offset)
+        : m(im),
+          n(in),
+          k(ik),
+          depth_output_gemm3d(idepth_output_gemm3d),
+          reinterpret_input_as_3d(ireinterpret_input_as_3d),
+          broadcast_bias(ibroadcast_bias),
+          fp_mixed_precision(ifp_mixed_precision),
+          has_pad_y(ihas_pad_y),
+          activation_info(iactivation_info),
+          mult_transpose1xW_width(inmult_transpose1xW_width),
+          mult_interleave4x4_height(imult_interleave4x4_height),
+          lhs_info(ilhs_info),
+          rhs_info(irhs_info),
+          a_offset(ina_offset),
+          b_offset(inb_offset)
     {
     }
 
-    unsigned int            m{ 0 };                           /**< Number of LHS rows*/
-    unsigned int            n{ 0 };                           /**< Number of RHS columns*/
-    unsigned int            k{ 0 };                           /**< Number of LHS columns or RHS rows */
-    unsigned int            depth_output_gemm3d{ 0 };         /**< Depth of the output tensor in case is reinterpreted as 3D */
-    bool                    reinterpret_input_as_3d{ false }; /**< Flag used to reinterpret the input as 3D */
-    bool                    broadcast_bias{ false };          /**< Flag used to broadcast the bias addition */
-    bool                    fp_mixed_precision{ false };      /**< Flag used to indicate wider accumulators (32 bit instead of 16 for FP16). */
-    ActivationLayerInfo     activation_info{};                /**< Activation function to perform after the matrix multiplication */
-    int                     mult_transpose1xW_width{ 1 };     /**< Multiplication factor for the width of the 1xW transposed block */
-    int                     mult_interleave4x4_height{ 1 };   /**< Multiplication factor for the height of the 4x4 interleaved block */
-    GEMMLHSMatrixInfo       lhs_info{};                       /**< LHS matrix information used to retrieve the number of rows processed by each thread */
-    GEMMRHSMatrixInfo       rhs_info{};                       /**< RHS matrix information used for reshaping the RHS matrix */
-    int32_t                 a_offset{ 0 };                    /**< Offset to be added to each element of the matrix A */
-    int32_t                 b_offset{ 0 };                    /**< Offset to be added to each element of the matrix B */
-    GEMMLowpOutputStageInfo output_stage{};                   /**< GEMMLowp output stage information */
+    unsigned int m{0};                           /**< Number of LHS rows*/
+    unsigned int n{0};                           /**< Number of RHS columns*/
+    unsigned int k{0};                           /**< Number of LHS columns or RHS rows */
+    unsigned int depth_output_gemm3d{0};         /**< Depth of the output tensor in case it is reinterpreted as 3D */
+    bool         reinterpret_input_as_3d{false}; /**< Flag used to reinterpret the input as 3D */
+    bool         broadcast_bias{false};          /**< Flag used to broadcast the bias addition */
+    bool fp_mixed_precision{false}; /**< Flag used to indicate wider accumulators (32 bit instead of 16 for FP16). */
+    bool has_pad_y{
+        false}; /**< Flag used to indicate if the input/output tensors have internal pad on the y direction */
+    ActivationLayerInfo activation_info{}; /**< Activation function to perform after the matrix multiplication */
+    int mult_transpose1xW_width{1};        /**< Multiplication factor for the width of the 1xW transposed block */
+    int mult_interleave4x4_height{1};      /**< Multiplication factor for the height of the 4x4 interleaved block */
+    GEMMLHSMatrixInfo
+        lhs_info{}; /**< LHS matrix information used to retrieve the number of rows processed by each thread */
+    GEMMRHSMatrixInfo       rhs_info{};     /**< RHS matrix information used for reshaping the RHS matrix */
+    int32_t                 a_offset{0};    /**< Offset to be added to each element of the matrix A */
+    int32_t                 b_offset{0};    /**< Offset to be added to each element of the matrix B */
+    GEMMLowpOutputStageInfo output_stage{}; /**< GEMMLowp output stage information */
 };
 
-/** Descriptor used by the depthwise convolution kernels */
-struct DWCKernelInfo
+/** Compute descriptor used by the depthwise convolution native kernel */
+struct DWCComputeKernelInfo
 {
-    ActivationLayerInfo activation_info{}; /**< Activation function to perform after the depthwise convolution */
+    unsigned int n0{1};                             /**< Number of columns processed by each thread */
+    unsigned int m0{1};                             /**< Number of rows processed by each thread */
+    bool         export_input_to_cl_image{false};   /**< Export input to cl_image */
+    bool         export_weights_to_cl_image{false}; /**< Export the weights to cl_image */
 };
 
-/** Descriptor used by the depthwise convolution kernels to retrieve the number of output elements processed by each thread */
-struct DWCWeightsKernelInfo
+/** Compute descriptor used by the direct convolution kernel */
+struct DirectConvComputeKernelInfo
 {
-    unsigned int n0{ 0 }; /**< Number of columns processed by each thread */
+    int32_t m0{1}; /**< Number of rows to be processed by the kernel */
+    int32_t n0{1}; /**< Number of columns to be processed by the kernel */
+    int32_t k0{1}; /**< Number of partial accumulations to be processed in a single iteration by the kernel */
+    bool    export_weights_to_cl_image{false}; /**< Flag to export the weights to cl_image */
+    bool    export_output_to_cl_image{false};  /**< Flag to export the output to cl_image */
+    bool    export_input_to_cl_image{false};   /**< Flag to export the input to cl_image */
 };
 
 /** Descriptor used by the softmax kernels */
 struct SoftmaxKernelInfo
 {
-    float    beta{ 1.f };                          /**< A scaling factor for the exponent with default value 1.0 */
-    bool     is_log{ false };                      /**< Flag used to perform Log Softmax operation */
-    DataType input_data_type{ DataType::UNKNOWN }; /**< Input tensor data type */
+    float    beta{1.f};                          /**< A scaling factor for the exponent with default value 1.0 */
+    bool     is_log{false};                      /**< Flag used to perform Log Softmax operation */
+    DataType input_data_type{DataType::UNKNOWN}; /**< Input tensor data type */
+    int32_t  axis{0};                            /**< The dimension in which to apply softmax. */
 };
 
 /** Descriptor used by the direct convolution layer output stage kernels */
 struct DirectConvolutionLayerOutputStageKernelInfo
 {
-    int32_t  result_fixedpoint_multiplier{ 0 };     /**< Result output stage multiplier used for quantizing */
-    int32_t  result_shift{ 0 };                     /**< Result output stage shift used for quantizing */
-    int32_t  result_offset_after_shift{ 0 };        /**< Result offset used for quantizing */
-    DataType output_data_type{ DataType::UNKNOWN }; /**< Output tensor data type to use if the output is not initialized */
+    int32_t  result_fixedpoint_multiplier{0}; /**< Result output stage multiplier used for quantizing */
+    int32_t  result_shift{0};                 /**< Result output stage shift used for quantizing */
+    int32_t  result_offset_after_shift{0};    /**< Result offset used for quantizing */
+    DataType output_data_type{
+        DataType::UNKNOWN}; /**< Output tensor data type to use if the output is not initialized */
 };
 
 struct InstanceNormalizationLayerKernelInfo
 {
     /** Default constructor */
-    InstanceNormalizationLayerKernelInfo()
-        : InstanceNormalizationLayerKernelInfo(1.f, 0.f, 1e-12, true)
+    InstanceNormalizationLayerKernelInfo() : InstanceNormalizationLayerKernelInfo(1.f, 0.f, 1e-12, true)
     {
     }
     /** Constructor
@@ -164,10 +190,10 @@ struct GEMMLowpReductionKernelInfo
     {
     }
 
-    int32_t k{ 0 };                 /**< Number of matrix columns/rows */
-    bool    is_reshaped{ false };   /**< True if the input tensor has been reshaped */
-    int32_t scalar{ 0 };            /**< Scalar value to multiply each reduced column/row by */
-    bool    mul_by_scalar{ false }; /**< True if each column/row reduction has to be multiplied by a scalar value */
+    int32_t k{0};                 /**< Number of matrix columns/rows */
+    bool    is_reshaped{false};   /**< True if the input tensor has been reshaped */
+    int32_t scalar{0};            /**< Scalar value to multiply each reduced column/row by */
+    bool    mul_by_scalar{false}; /**< True if each column/row reduction has to be multiplied by a scalar value */
 };
 
 struct ScaleKernelInfo
@@ -180,19 +206,22 @@ struct ScaleKernelInfo
      * @param[in] sampling_policy       (Optional) Sampling policy used by the interpolation. Defaults to @ref SamplingPolicy::CENTER
      * @param[in] use_padding           (Optional) Is padding in use or not. Defaults to true.
      * @param[in] align_corners         (Optional) Align corners of input and output, only affecting bilinear policy with TOP_LEFT sampling policy. Defaults to false.
+     * @param[in] data_layout           (Optional) Data layout used by the layer. Defaults to @ref DataLayout::UNKNOWN
      */
     ScaleKernelInfo(InterpolationPolicy interpolation_policy,
                     BorderMode          border_mode,
                     PixelValue          constant_border_value = PixelValue(),
                     SamplingPolicy      sampling_policy       = SamplingPolicy::CENTER,
                     bool                use_padding           = true,
-                    bool                align_corners         = false)
-        : interpolation_policy{ interpolation_policy },
-          border_mode{ border_mode },
-          constant_border_value{ constant_border_value },
-          sampling_policy{ sampling_policy },
-          use_padding{ use_padding },
-          align_corners{ align_corners }
+                    bool                align_corners         = false,
+                    DataLayout          data_layout           = DataLayout::UNKNOWN) noexcept
+        : interpolation_policy{interpolation_policy},
+          border_mode{border_mode},
+          constant_border_value{constant_border_value},
+          sampling_policy{sampling_policy},
+          use_padding{use_padding},
+          align_corners{align_corners},
+          data_layout{data_layout}
     {
     }
 
@@ -202,6 +231,23 @@ struct ScaleKernelInfo
     SamplingPolicy      sampling_policy;       /**< Sampling policy used by the interpolation. */
     bool                use_padding;           /**< Indication of using padding */
     bool                align_corners;         /**< Align corners of input and output */
+    DataLayout          data_layout;           /**< Data layout to use */
+};
+
+struct MatMulKernelInfo
+{
+    MatMulKernelInfo() = default;
+    MatMulKernelInfo(
+        bool adj_lhs, bool adj_rhs, int m0 = 1, int n0 = 1, int k0 = 1, bool export_rhs_to_cl_image = false)
+        : adj_lhs{adj_lhs}, adj_rhs{adj_rhs}, m0{m0}, n0{n0}, k0{k0}, export_rhs_to_cl_image{export_rhs_to_cl_image}
+    {
+    }
+    bool adj_lhs{false};                /**< Adjoint LHS flag value */
+    bool adj_rhs{false};                /**< Adjoint RHS flag value */
+    int  m0{1};                         /**< Number of output rows processed by each work-item*/
+    int  n0{1};                         /**< Number of output columns processed by each work-item*/
+    int  k0{1};                         /**< Number of inner accumulations */
+    bool export_rhs_to_cl_image{false}; /**< Flag to know whether the RHS tensor should be exported to cl_image*/
 };
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_CORE_KERNEL_DESCRIPTORS_H */
+#endif // ACL_ARM_COMPUTE_CORE_KERNELDESCRIPTORS_H
diff --git a/arm_compute/core/Log.h b/arm_compute/core/Log.h
index 1515557f4c..03b861f765 100644
--- a/arm_compute/core/Log.h
+++ b/arm_compute/core/Log.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -34,11 +34,11 @@
 #define ARM_COMPUTE_CREATE_DEFAULT_CORE_LOGGER()                                   \
     do                                                                             \
     {                                                                              \
-        if(arm_compute::logging::LoggerRegistry::get().logger("CORE") == nullptr)  \
+        if (arm_compute::logging::LoggerRegistry::get().logger("CORE") == nullptr) \
         {                                                                          \
             arm_compute::logging::LoggerRegistry::get().create_reserved_loggers(); \
         }                                                                          \
-    } while(false)
+    } while (false)
 #else /* ARM_COMPUTE_LOGGING_ENABLED */
 #define ARM_COMPUTE_CREATE_DEFAULT_CORE_LOGGER()
 #endif /* ARM_COMPUTE_LOGGING_ENABLED */
@@ -53,7 +53,7 @@
     {                                                \
         ARM_COMPUTE_CREATE_DEFAULT_CORE_LOGGER();    \
         ARM_COMPUTE_LOG_MSG("CORE", log_level, msg); \
-    } while(false)
+    } while (false)
 
 /** Log a message with format to the core system logger
  *
@@ -66,7 +66,7 @@
     {                                                                         \
         ARM_COMPUTE_CREATE_DEFAULT_CORE_LOGGER();                             \
         ARM_COMPUTE_LOG_MSG_WITH_FORMAT("CORE", log_level, fmt, __VA_ARGS__); \
-    } while(false)
+    } while (false)
 
 /** Log a stream to the core system logger
  *
@@ -78,7 +78,7 @@
     {                                                  \
         ARM_COMPUTE_CREATE_DEFAULT_CORE_LOGGER();      \
         ARM_COMPUTE_LOG_STREAM("CORE", log_level, ss); \
-    } while(false)
+    } while (false)
 
 /** Log information level message to the core system logger
  *
@@ -89,7 +89,7 @@
     {                                                                        \
         ARM_COMPUTE_CREATE_DEFAULT_CORE_LOGGER();                            \
         ARM_COMPUTE_LOG_MSG_CORE(arm_compute::logging::LogLevel::INFO, msg); \
-    } while(false)
+    } while (false)
 
 /** Log information level formatted message to the core system logger
  *
@@ -101,7 +101,7 @@
     {                                                                                                  \
         ARM_COMPUTE_CREATE_DEFAULT_CORE_LOGGER();                                                      \
         ARM_COMPUTE_LOG_MSG_WITH_FORMAT_CORE(arm_compute::logging::LogLevel::INFO, #fmt, __VA_ARGS__); \
-    } while(false)
+    } while (false)
 
 /** Log information level stream to the core system logger
  *
@@ -112,6 +112,6 @@
     {                                                                          \
         ARM_COMPUTE_CREATE_DEFAULT_CORE_LOGGER();                              \
         ARM_COMPUTE_LOG_STREAM_CORE(arm_compute::logging::LogLevel::INFO, ss); \
-    } while(false)
+    } while (false)
 
 #endif /* ARM_COMPUTE_LOGGING_MACROS_H */
diff --git a/arm_compute/core/MultiImageInfo.h b/arm_compute/core/MultiImageInfo.h
deleted file mode 100644
index fcd7ba744d..0000000000
--- a/arm_compute/core/MultiImageInfo.h
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_MULTIIMAGEINFO_H
-#define ARM_COMPUTE_MULTIIMAGEINFO_H
-
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-/** Store the multi-planar image's metadata */
-class MultiImageInfo
-{
-public:
-    /** Constructor */
-    MultiImageInfo();
-    /** Initialize the metadata structure with the given parameters
-     *
-     * @param[in] width  Width of the image (in number of pixels)
-     * @param[in] height Height of the image (in number of pixels)
-     * @param[in] format Colour format of the image.
-     */
-    void init(unsigned int width, unsigned int height, Format format);
-    /** Colour format of the image
-     *
-     * @return Colour format of the image
-     */
-    Format format() const;
-    /** Width in pixels
-     *
-     * @return The width in pixels
-     */
-    unsigned int width() const;
-    /** Height in pixels
-     *
-     * @return The height in pixels
-     */
-    unsigned int height() const;
-
-protected:
-    unsigned int _width;
-    unsigned int _height;
-    Format       _format;
-};
-}
-#endif /*ARM_COMPUTE_MULTIIMAGEINFO_H */
diff --git a/arm_compute/core/NEON/INESimpleKernel.h b/arm_compute/core/NEON/INESimpleKernel.h
deleted file mode 100644
index 5d9c1ec1e2..0000000000
--- a/arm_compute/core/NEON/INESimpleKernel.h
+++ /dev/null
@@ -1,34 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_INESIMPLEKERNEL_H
-#define ARM_COMPUTE_INESIMPLEKERNEL_H
-
-#include "arm_compute/core/CPP/ICPPSimpleKernel.h"
-
-namespace arm_compute
-{
-/** Interface for simple NEON kernels having 1 tensor input and 1 tensor output */
-using INESimpleKernel = ICPPSimpleKernel;
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_INESIMPLEKERNEL_H */
diff --git a/arm_compute/core/NEON/NEAsymm.h b/arm_compute/core/NEON/NEAsymm.h
deleted file mode 100644
index e4f4250d16..0000000000
--- a/arm_compute/core/NEON/NEAsymm.h
+++ /dev/null
@@ -1,760 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEASYMM_H
-#define ARM_COMPUTE_NEASYMM_H
-
-#include "arm_compute/core/NEON/NEMath.h"
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-using qasymm8x8_t   = uint8x8_t;   /**< 8 bit quantized asymmetric vector with 8 elements */
-using qasymm8x8x2_t = uint8x8x2_t; /**< 8 bit quantized asymmetric vector with 16 elements */
-using qasymm8x8x3_t = uint8x8x3_t; /**< 8 bit quantized asymmetric vector with 24 elements */
-using qasymm8x8x4_t = uint8x8x4_t; /**< 8 bit quantized asymmetric vector with 32 elements */
-using qasymm8x16_t  = uint8x16_t;  /**< 8 bit quantized asymmetric vector with 16 elements */
-
-using qasymm8x8_signed_t   = int8x8_t;   /**< 8 bit quantized signed asymmetric vector with 8 elements */
-using qasymm8x8x2_signed_t = int8x8x2_t; /**< 8 bit quantized signed asymmetric vector with 16 elements */
-using qasymm8x8x3_signed_t = int8x8x3_t; /**< 8 bit quantized signed asymmetric vector with 24 elements */
-using qasymm8x8x4_signed_t = int8x8x4_t; /**< 8 bit quantized signed asymmetric vector with 32 elements */
-using qasymm8x16_signed_t  = int8x16_t;  /**< 8 bit quantized signed asymmetric vector with 16 elements */
-
-/** Perform a multiply-accumulate on all 16 components of a QASYMM8 vector
- *
- * vd*vs + vo
- *
- * @param[in] vd Input vector value in QASYMM8 format
- * @param[in] vs Vector multiplier in F32 format. The multiplier value must be duplicated across all four lanes.
- * @param[in] vo Vector addend in F32 format. The addend value must be duplicated across all four lanes.
- *
- * @return A 16-component vector in QASYMM8 format, saturated to fit
- */
-uint8x16_t vmlaq_qasymm8(qasymm8x16_t vd, float32x4_t vs, float32x4_t vo);
-
-/** Perform a multiply-accumulate on all 16 components of a QASYMM8_SIGNED vector
- *
- * vd*vs + vo
- *
- * @param[in] vd Input vector value in QASYMM8_SIGNED format
- * @param[in] vs Vector multiplier in F32 format. The multiplier value must be duplicated across all four lanes.
- * @param[in] vo Vector addend in F32 format. The addend value must be duplicated across all four lanes.
- *
- * @return A 16-component vector in QASYMM8_SIGNED format, saturated to fit
- */
-int8x16_t vmlaq_qasymm8_signed(qasymm8x16_signed_t vd, float32x4_t vs, float32x4_t vo);
-
-/** Performs final quantization step on 16 elements
- *
- * @tparam is_bounded_relu Specified if a fused bounded relu should be applied
- *
- * @param in_s32                        Input to be quantized.
- * @param result_fixedpoint_multiplier  Result multiplier parameter
- * @param result_shift                  Result shift parameter
- * @param result_offset_after_shift_s32 Result offset parameter
- * @param min_u8                        Relu lower bound
- * @param max_u8                        Relu upper bound
- *
- * @return Quantized values
- */
-template <bool is_bounded_relu>
-uint8x16_t finalize_quantization(int32x4x4_t &in_s32,
-                                 int          result_fixedpoint_multiplier,
-                                 int32_t      result_shift,
-                                 int32x4_t    result_offset_after_shift_s32,
-                                 uint8x16_t   min_u8,
-                                 uint8x16_t   max_u8)
-{
-    const static int32x4_t zero_s32 = vdupq_n_s32(0);
-
-    if(result_shift < 0)
-    {
-        in_s32.val[0] = vmulq_n_s32(in_s32.val[0], (1 << (-result_shift)));
-        in_s32.val[1] = vmulq_n_s32(in_s32.val[1], (1 << (-result_shift)));
-        in_s32.val[2] = vmulq_n_s32(in_s32.val[2], (1 << (-result_shift)));
-        in_s32.val[3] = vmulq_n_s32(in_s32.val[3], (1 << (-result_shift)));
-
-        in_s32.val[0] = vqrdmulhq_n_s32(in_s32.val[0], result_fixedpoint_multiplier);
-        in_s32.val[1] = vqrdmulhq_n_s32(in_s32.val[1], result_fixedpoint_multiplier);
-        in_s32.val[2] = vqrdmulhq_n_s32(in_s32.val[2], result_fixedpoint_multiplier);
-        in_s32.val[3] = vqrdmulhq_n_s32(in_s32.val[3], result_fixedpoint_multiplier);
-    }
-    else
-    {
-        // Fixed point multiplication with vector saturating rounding doubling multiply high with scalar
-        in_s32.val[0] = vqrdmulhq_n_s32(in_s32.val[0], result_fixedpoint_multiplier);
-        in_s32.val[1] = vqrdmulhq_n_s32(in_s32.val[1], result_fixedpoint_multiplier);
-        in_s32.val[2] = vqrdmulhq_n_s32(in_s32.val[2], result_fixedpoint_multiplier);
-        in_s32.val[3] = vqrdmulhq_n_s32(in_s32.val[3], result_fixedpoint_multiplier);
-
-        // Round to the nearest division by a power-of-two using result_shift_s32
-        in_s32.val[0] = rounding_divide_by_pow2(in_s32.val[0], result_shift);
-        in_s32.val[1] = rounding_divide_by_pow2(in_s32.val[1], result_shift);
-        in_s32.val[2] = rounding_divide_by_pow2(in_s32.val[2], result_shift);
-        in_s32.val[3] = rounding_divide_by_pow2(in_s32.val[3], result_shift);
-    }
-
-    // Add the offset terms
-    in_s32.val[0] = vaddq_s32(in_s32.val[0], result_offset_after_shift_s32);
-    in_s32.val[1] = vaddq_s32(in_s32.val[1], result_offset_after_shift_s32);
-    in_s32.val[2] = vaddq_s32(in_s32.val[2], result_offset_after_shift_s32);
-    in_s32.val[3] = vaddq_s32(in_s32.val[3], result_offset_after_shift_s32);
-
-    // Saturate negative values
-    in_s32.val[0] = vmaxq_s32(in_s32.val[0], zero_s32);
-    in_s32.val[1] = vmaxq_s32(in_s32.val[1], zero_s32);
-    in_s32.val[2] = vmaxq_s32(in_s32.val[2], zero_s32);
-    in_s32.val[3] = vmaxq_s32(in_s32.val[3], zero_s32);
-
-    // Convert S32 to S16
-    const int16x8x2_t in_s16 =
-    {
-        {
-            vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])),
-            vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))
-        }
-    };
-
-    // Convert S16 to U8
-    uint8x16_t out_u8 = vcombine_u8(vqmovun_s16(in_s16.val[0]), vqmovun_s16(in_s16.val[1]));
-
-    if(is_bounded_relu)
-    {
-        out_u8 = vmaxq_u8(out_u8, min_u8);
-        out_u8 = vminq_u8(out_u8, max_u8);
-    }
-
-    return out_u8;
-}
-
-/** Performs final quantization step on 16 elements
- *
- * @tparam is_bounded_relu Specified if a fused bounded relu should be applied
- *
- * @param in_s32                        Input to be quantized.
- * @param result_fixedpoint_multiplier  Result multiplier parameter
- * @param result_shift                  Result shift parameter
- * @param result_offset_after_shift_s32 Result offset parameter
- * @param min_s8                        Relu lower bound
- * @param max_s8                        Relu upper bound
- *
- * @return Quantized values
- */
-template <bool is_bounded_relu>
-int8x16_t finalize_quantization(int32x4x4_t &in_s32,
-                                int          result_fixedpoint_multiplier,
-                                int32_t      result_shift,
-                                int32x4_t    result_offset_after_shift_s32,
-                                int8x16_t    min_s8,
-                                int8x16_t    max_s8)
-{
-    if(result_shift < 0)
-    {
-        in_s32.val[0] = vmulq_n_s32(in_s32.val[0], (1 << (-result_shift)));
-        in_s32.val[1] = vmulq_n_s32(in_s32.val[1], (1 << (-result_shift)));
-        in_s32.val[2] = vmulq_n_s32(in_s32.val[2], (1 << (-result_shift)));
-        in_s32.val[3] = vmulq_n_s32(in_s32.val[3], (1 << (-result_shift)));
-
-        in_s32.val[0] = vqrdmulhq_n_s32(in_s32.val[0], result_fixedpoint_multiplier);
-        in_s32.val[1] = vqrdmulhq_n_s32(in_s32.val[1], result_fixedpoint_multiplier);
-        in_s32.val[2] = vqrdmulhq_n_s32(in_s32.val[2], result_fixedpoint_multiplier);
-        in_s32.val[3] = vqrdmulhq_n_s32(in_s32.val[3], result_fixedpoint_multiplier);
-    }
-    else
-    {
-        // Fixed point multiplication with vector saturating rounding doubling multiply high with scalar
-        in_s32.val[0] = vqrdmulhq_n_s32(in_s32.val[0], result_fixedpoint_multiplier);
-        in_s32.val[1] = vqrdmulhq_n_s32(in_s32.val[1], result_fixedpoint_multiplier);
-        in_s32.val[2] = vqrdmulhq_n_s32(in_s32.val[2], result_fixedpoint_multiplier);
-        in_s32.val[3] = vqrdmulhq_n_s32(in_s32.val[3], result_fixedpoint_multiplier);
-
-        // Round to the nearest division by a power-of-two using result_shift_s32
-        in_s32.val[0] = rounding_divide_by_pow2(in_s32.val[0], result_shift);
-        in_s32.val[1] = rounding_divide_by_pow2(in_s32.val[1], result_shift);
-        in_s32.val[2] = rounding_divide_by_pow2(in_s32.val[2], result_shift);
-        in_s32.val[3] = rounding_divide_by_pow2(in_s32.val[3], result_shift);
-    }
-
-    // Add the offset terms
-    in_s32.val[0] = vaddq_s32(in_s32.val[0], result_offset_after_shift_s32);
-    in_s32.val[1] = vaddq_s32(in_s32.val[1], result_offset_after_shift_s32);
-    in_s32.val[2] = vaddq_s32(in_s32.val[2], result_offset_after_shift_s32);
-    in_s32.val[3] = vaddq_s32(in_s32.val[3], result_offset_after_shift_s32);
-
-    // Convert S32 to S16
-    const int16x8x2_t in_s16 =
-    {
-        {
-            vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])),
-            vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))
-        }
-    };
-
-    // Convert S16 to S8
-    int8x16_t out_s8 = vcombine_s8(vqmovn_s16(in_s16.val[0]), vqmovn_s16(in_s16.val[1]));
-
-    if(is_bounded_relu)
-    {
-        out_s8 = vmaxq_s8(out_s8, min_s8);
-        out_s8 = vminq_s8(out_s8, max_s8);
-    }
-
-    return out_s8;
-}
-
-/** Performs final quantization step on 16 elements for symmetric quantization
- *
- * @tparam is_bounded_relu Specified if a fused bounded relu should be applied
- *
- * @param in_s32                        Input to be quantized.
- * @param result_fixedpoint_multiplier  Result multiplier parameter
- * @param result_shift                  Result shift parameter
- * @param result_offset_after_shift_s32 Result offset parameter
- * @param min_s8                        Relu lower bound
- * @param max_s8                        Relu upper bound
- *
- * @return Quantized values
- */
-template <bool   is_bounded_relu>
-inline int8x16_t finalize_quantization_symm(int32x4x4_t       &in_s32,
-                                            const int32x4x4_t &result_fixedpoint_multiplier,
-                                            const int32x4x4_t &result_shift,
-                                            const int32x4_t   &result_offset_after_shift_s32,
-                                            const int8x16_t   &min_s8,
-                                            const int8x16_t   &max_s8)
-{
-    const static int32x4_t one_s32 = vdupq_n_s32(1);
-
-    // Fixed point multiplication with vector saturating rounding doubling multiply high with scalar
-    int32x4x4_t res_shift_gt0 =
-    {
-        vqrdmulhq_s32(in_s32.val[0], result_fixedpoint_multiplier.val[0]),
-        vqrdmulhq_s32(in_s32.val[1], result_fixedpoint_multiplier.val[1]),
-        vqrdmulhq_s32(in_s32.val[2], result_fixedpoint_multiplier.val[2]),
-        vqrdmulhq_s32(in_s32.val[3], result_fixedpoint_multiplier.val[3]),
-    };
-    // Round to the nearest division by a power-of-two using result_shift_s32
-    res_shift_gt0.val[0] = rounding_divide_by_pow2(res_shift_gt0.val[0], result_shift.val[0]);
-    res_shift_gt0.val[1] = rounding_divide_by_pow2(res_shift_gt0.val[1], result_shift.val[1]);
-    res_shift_gt0.val[2] = rounding_divide_by_pow2(res_shift_gt0.val[2], result_shift.val[2]);
-    res_shift_gt0.val[3] = rounding_divide_by_pow2(res_shift_gt0.val[3], result_shift.val[3]);
-
-    int32x4x4_t res_shift_lt0 =
-    {
-        vmulq_s32(in_s32.val[0], vshlq_s32(one_s32, vnegq_s32(result_shift.val[0]))),
-        vmulq_s32(in_s32.val[1], vshlq_s32(one_s32, vnegq_s32(result_shift.val[1]))),
-        vmulq_s32(in_s32.val[2], vshlq_s32(one_s32, vnegq_s32(result_shift.val[2]))),
-        vmulq_s32(in_s32.val[3], vshlq_s32(one_s32, vnegq_s32(result_shift.val[3]))),
-    };
-    res_shift_lt0.val[0] = vqrdmulhq_s32(res_shift_lt0.val[0], result_fixedpoint_multiplier.val[0]);
-    res_shift_lt0.val[1] = vqrdmulhq_s32(res_shift_lt0.val[1], result_fixedpoint_multiplier.val[1]);
-    res_shift_lt0.val[2] = vqrdmulhq_s32(res_shift_lt0.val[2], result_fixedpoint_multiplier.val[2]);
-    res_shift_lt0.val[3] = vqrdmulhq_s32(res_shift_lt0.val[3], result_fixedpoint_multiplier.val[3]);
-
-    // Select result depending on shift value
-    const uint32x4x4_t mask_lt0 =
-    {
-#ifdef __aarch64__
-        vcltzq_s32(result_shift.val[0]),
-        vcltzq_s32(result_shift.val[1]),
-        vcltzq_s32(result_shift.val[2]),
-        vcltzq_s32(result_shift.val[3]),
-#else  //__aarch64__
-        vcltq_s32(result_shift.val[0], vdupq_n_s32(0)),
-        vcltq_s32(result_shift.val[1], vdupq_n_s32(0)),
-        vcltq_s32(result_shift.val[2], vdupq_n_s32(0)),
-        vcltq_s32(result_shift.val[3], vdupq_n_s32(0)),
-#endif //__aarch64__
-    };
-
-    in_s32.val[0] = vbslq_s32(mask_lt0.val[0], res_shift_lt0.val[0], res_shift_gt0.val[0]);
-    in_s32.val[1] = vbslq_s32(mask_lt0.val[1], res_shift_lt0.val[1], res_shift_gt0.val[1]);
-    in_s32.val[2] = vbslq_s32(mask_lt0.val[2], res_shift_lt0.val[2], res_shift_gt0.val[2]);
-    in_s32.val[3] = vbslq_s32(mask_lt0.val[3], res_shift_lt0.val[3], res_shift_gt0.val[3]);
-
-    // Add the offset terms
-    in_s32.val[0] = vaddq_s32(in_s32.val[0], result_offset_after_shift_s32);
-    in_s32.val[1] = vaddq_s32(in_s32.val[1], result_offset_after_shift_s32);
-    in_s32.val[2] = vaddq_s32(in_s32.val[2], result_offset_after_shift_s32);
-    in_s32.val[3] = vaddq_s32(in_s32.val[3], result_offset_after_shift_s32);
-
-    // Convert S32 to S16
-    const int16x8x2_t in_s16 =
-    {
-        {
-            vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])),
-            vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))
-        }
-    };
-
-    // Convert S16 to S8
-    int8x16_t out_s8 = vcombine_s8(vqmovn_s16(in_s16.val[0]), vqmovn_s16(in_s16.val[1]));
-
-    if(is_bounded_relu)
-    {
-        out_s8 = vmaxq_s8(out_s8, min_s8);
-        out_s8 = vminq_s8(out_s8, max_s8);
-    }
-
-    return out_s8;
-}
-
-/** Performs final quantization step on single element
- *
- * @tparam is_bounded_relu Specified if a fused bounded relu should be applied
- *
- * @param[in] in_value                      Input to be quantized.
- * @param[in] result_fixedpoint_multiplier  Result multiplier parameter
- * @param[in] result_shift                  Result shift parameter
- * @param[in] result_offset_after_shift_s32 Result offset parameter
- * @param[in] min_u8                        Relu lower bound
- * @param[in] max_u8                        Relu upper bound
- *
- * @return Quantized value
- */
-template <bool is_bounded_relu>
-inline uint8_t finalize_quantization(int32_t in_value, int result_fixedpoint_multiplier,
-                                     int32_t result_shift, int32_t result_offset_after_shift_s32,
-                                     uint8_t min_u8, uint8_t max_u8)
-{
-    int32x4_t in_s32 = vdupq_n_s32(in_value);
-
-    if(result_shift < 0)
-    {
-        in_value = vgetq_lane_s32(vqrdmulhq_n_s32(vmulq_n_s32(in_s32, (1 << (-result_shift))), result_fixedpoint_multiplier), 0);
-    }
-    else
-    {
-        // Fixed point multiplication with vector saturating rounding doubling multiply high with scalar
-        in_value = vgetq_lane_s32(vqrdmulhq_n_s32(in_s32, result_fixedpoint_multiplier), 0);
-        // Shift value by result_shift_s32
-        in_value = rounding_divide_by_pow2(in_value, result_shift);
-    }
-
-    // Add the offset term
-    in_value += result_offset_after_shift_s32;
-
-    // Bound the result
-    uint8_t out_u8 = static_cast<uint8_t>(std::max<int32_t>(0, std::min<int32_t>(255, in_value)));
-    if(is_bounded_relu)
-    {
-        out_u8 = static_cast<uint8_t>(std::max(min_u8, std::min(max_u8, out_u8)));
-    }
-
-    return out_u8;
-}
-
-/** Performs final quantization step on single element
- *
- * @tparam is_bounded_relu Specified if a fused bounded relu should be applied
- *
- * @param[in] in_value                      Input to be quantized.
- * @param[in] result_fixedpoint_multiplier  Result multiplier parameter
- * @param[in] result_shift                  Result shift parameter
- * @param[in] result_offset_after_shift_s32 Result offset parameter
- * @param[in] min_s8                        Relu lower bound
- * @param[in] max_s8                        Relu upper bound
- *
- * @return Quantized value
- */
-template <bool is_bounded_relu>
-inline int8_t finalize_quantization(int32_t in_value, int result_fixedpoint_multiplier,
-                                    int32_t result_shift, int32_t result_offset_after_shift_s32,
-                                    int8_t min_s8, int8_t max_s8)
-{
-    int32x4_t in_s32 = vdupq_n_s32(in_value);
-
-    if(result_shift < 0)
-    {
-        in_value = vgetq_lane_s32(vqrdmulhq_n_s32(vmulq_n_s32(in_s32, (1 << (-result_shift))), result_fixedpoint_multiplier), 0);
-    }
-    else
-    {
-        // Fixed point multiplication with vector saturating rounding doubling multiply high with scalar
-        in_value = vgetq_lane_s32(vqrdmulhq_n_s32(in_s32, result_fixedpoint_multiplier), 0);
-
-        // Shift value by result_shift_s32
-        in_value = rounding_divide_by_pow2(in_value, result_shift);
-    }
-
-    // Add the offset term
-    in_value += result_offset_after_shift_s32;
-
-    // Bound the result
-    int8_t out_s8 = static_cast<int8_t>(std::max<int32_t>(-128, std::min<int32_t>(127, in_value)));
-    if(is_bounded_relu)
-    {
-        out_s8 = static_cast<int8_t>(std::max(min_s8, std::min(max_s8, out_s8)));
-    }
-
-    return out_s8;
-}
-
-/** Dequantize a neon vector holding 8 quantized values.
- *
- * @param[in] qv Input values to be dequantized.
- * @param[in] qi Quantization information to be used in the computation.
- *
- * @return Dequantized values in a neon vector
- */
-inline float32x4x2_t vdequantize(const uint8x8_t &qv, const UniformQuantizationInfo &qi)
-{
-    const float         scale   = qi.scale;
-    const int           offset  = qi.offset;
-    const int32x4_t     voffset = vdupq_n_s32(offset);
-    const float32x4_t   vscale  = vdupq_n_f32(scale);
-    const float32x4x2_t vdequantized_input =
-    {
-        {
-            vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(qv)))), voffset)), vscale),
-            vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(qv)))), voffset)), vscale),
-        }
-    };
-    return vdequantized_input;
-}
-
-/** Dequantize a neon vector holding 8 singed quantized values.
- *
- * @param[in] qv Input values to be dequantized.
- * @param[in] qi Quantization information to be used in the computation.
- *
- * @return Dequantized values in a neon vector
- */
-inline float32x4x2_t vdequantize(const int8x8_t &qv, const UniformQuantizationInfo &qi)
-{
-    const float         scale   = qi.scale;
-    const int           offset  = qi.offset;
-    const int32x4_t     voffset = vdupq_n_s32(offset);
-    const float32x4_t   vscale  = vdupq_n_f32(scale);
-    const float32x4x2_t vdequantized_input =
-    {
-        {
-            vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(qv))), voffset)), vscale),
-            vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(qv))), voffset)), vscale),
-        }
-    };
-    return vdequantized_input;
-}
-
-/** Dequantize a neon vector holding 16 quantized values.
- *
- * @param[in] qv Input values to be dequantized.
- * @param[in] qi Quantization information to be used in the computation.
- *
- * @return Dequantized values in a neon vector
- */
-inline float32x4x4_t vdequantize(const uint8x16_t &qv, const UniformQuantizationInfo &qi)
-{
-    const float         scale   = qi.scale;
-    const int           offset  = qi.offset;
-    const int32x4_t     voffset = vdupq_n_s32(offset);
-    const float32x4_t   vscale  = vdupq_n_f32(scale);
-    const float32x4x4_t vdequantized_input =
-    {
-        {
-            vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(qv))))), voffset)), vscale),
-            vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(qv))))), voffset)), vscale),
-            vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(qv))))), voffset)), vscale),
-            vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(qv))))), voffset)), vscale),
-        }
-    };
-    return vdequantized_input;
-}
-
-/** Dequantize a neon vector holding 16 signed quantized values.
- *
- * @param[in] qv Input values to be dequantized.
- * @param[in] qi Quantization information to be used in the computation.
- *
- * @return Dequantized values in a neon vector
- */
-inline float32x4x4_t vdequantize(const int8x16_t &qv, const UniformQuantizationInfo &qi)
-{
-    const float         scale   = qi.scale;
-    const int           offset  = qi.offset;
-    const int32x4_t     voffset = vdupq_n_s32(offset);
-    const float32x4_t   vscale  = vdupq_n_f32(scale);
-    const float32x4x4_t vdequantized_input =
-    {
-        {
-            vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(qv)))), voffset)), vscale),
-            vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(qv)))), voffset)), vscale),
-            vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(qv)))), voffset)), vscale),
-            vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(qv)))), voffset)), vscale),
-        }
-    };
-    return vdequantized_input;
-}
-
-/** Dequantize following an asymmetric quantization scheme a neon vector holding 16 quantized values.
- *
- * @param[in] qv     Input values to be dequantized.
- * @param[in] scale  Quantization scaling factor.
- * @param[in] offset Zero quantization offset.
- *
- * @return Dequantized values in a neon vector
- */
-inline float32x4x4_t vdequantize(const uint8x16_t &qv, float scale, int32_t offset)
-{
-    const int32x4_t     voffset = vdupq_n_s32(offset);
-    const float32x4_t   vscale  = vdupq_n_f32(scale);
-    const float32x4x4_t vdequantized_input =
-    {
-        {
-            vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(qv))))), voffset)), vscale),
-            vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(qv))))), voffset)), vscale),
-            vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(qv))))), voffset)), vscale),
-            vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(qv))))), voffset)), vscale),
-        }
-    };
-    return vdequantized_input;
-}
-
-/** Dequantize a vector of 16 values stored as signed asymmetric.
- *
- * @param[in] qv     Input values to be dequantized.
- * @param[in] scale  Quantization scaling factor.
- * @param[in] offset Zero quantization offset.
- *
- * @return Dequantized values in a neon vector
- */
-inline float32x4x4_t vdequantize(const int8x16_t &qv, float scale, int32_t offset)
-{
-    const int32x4_t     voffset = vdupq_n_s32(offset);
-    const float32x4_t   vscale  = vdupq_n_f32(scale);
-    const float32x4x4_t vdequantized_input =
-    {
-        {
-            vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(qv)))), voffset)), vscale),
-            vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(qv)))), voffset)), vscale),
-            vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(qv)))), voffset)), vscale),
-            vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(qv)))), voffset)), vscale),
-        }
-    };
-    return vdequantized_input;
-}
-
-/** Dequantize following symmetric quantization scheme a neon vector holding 16 quantized values.
- *
- * @param[in] qv     Input values to be dequantized.
- * @param[in] vscale Vector containing quantization scaling factors.
- *
- * @return Dequantized values in a neon vector
- */
-inline float32x4x4_t vdequantize(const int8x16_t &qv, const float32x4x4_t vscale)
-{
-    const float32x4x4_t vdequantized_input =
-    {
-        {
-            vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(qv))))), vscale.val[0]),
-            vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(qv))))), vscale.val[1]),
-            vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(qv))))), vscale.val[2]),
-            vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(qv))))), vscale.val[3]),
-        }
-    };
-    return vdequantized_input;
-}
-
-/** Dequantize following a symmetric quantization scheme a neon vector holding 16 quantized values.
- *
- * @param[in] qv    Input values to be dequantized.
- * @param[in] scale Quantization scaling factor.
- *
- * @return Dequantized values in a neon vector
- */
-inline float32x4x4_t vdequantize(const int8x16_t &qv, float scale)
-{
-    const float32x4_t   vscale = vdupq_n_f32(scale);
-    const float32x4x4_t vdequantized_input =
-    {
-        {
-            vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(qv))))), vscale),
-            vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(qv))))), vscale),
-            vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(qv))))), vscale),
-            vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(qv))))), vscale),
-        }
-    };
-    return vdequantized_input;
-}
-
-/** Quantize a neon vector holding 8 floating point values.
- *
- * @param[in] qv Input values to be quantized.
- * @param[in] qi Quantization information to be used in the computation.
- *
- * @return A neon vector holding the quantized values
- */
-inline uint8x8_t vquantize(const float32x4x2_t &qv, const UniformQuantizationInfo &qi)
-{
-    const float       scale     = qi.scale;
-    const int         offset    = qi.offset;
-    const float32x4_t voffset   = vdupq_n_f32(offset);
-    const float32x4_t vinvscale = vdupq_n_f32(1.f / scale);
-    const int32x4x4_t rf =
-    {
-        {
-#ifdef __aarch64__
-            vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
-            vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
-#else  //__aarch64__
-            vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
-            vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
-#endif //__aarch64__
-        }
-    };
-    return vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])));
-}
-
-/** Quantize a neon vector holding 8 floating point values.
- *
- * @param[in] qv Input values to be quantized.
- * @param[in] qi Quantization information to be used in the computation.
- *
- * @return A neon vector holding the singed quantized values
- */
-inline int8x8_t vquantize_signed(const float32x4x2_t &qv, const UniformQuantizationInfo &qi)
-{
-    const float       scale     = qi.scale;
-    const int         offset    = qi.offset;
-    const float32x4_t voffset   = vdupq_n_f32(offset);
-    const float32x4_t vinvscale = vdupq_n_f32(1.f / scale);
-    const int32x4x4_t rf =
-    {
-        {
-#ifdef __aarch64__
-            vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
-            vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
-#else  //__aarch64__
-            vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
-            vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
-#endif //__aarch64__
-        }
-    };
-    return vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])));
-}
-
-/** Quantize a neon vector holding 16 floating point values.
- *
- * @param[in] qv Input values to be quantized.
- * @param[in] qi Quantization information to be used in the computation.
- *
- * @return A neon vector holding the quantized values
- */
-inline uint8x16_t vquantize(const float32x4x4_t &qv, const UniformQuantizationInfo &qi)
-{
-    const float       scale     = qi.scale;
-    const int         offset    = qi.offset;
-    const float32x4_t voffset   = vdupq_n_f32(offset);
-    const float32x4_t vinvscale = vdupq_n_f32(1.f / scale);
-    const int32x4x4_t rf =
-    {
-        {
-#ifdef __aarch64__
-            vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
-            vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
-            vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[2], vinvscale)),
-            vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[3], vinvscale)),
-#else  //__aarch64__
-            vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
-            vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
-            vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[2], vinvscale)),
-            vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[3], vinvscale)),
-#endif //__aarch64__
-        }
-    };
-    const uint8x8_t pa = vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])));
-    const uint8x8_t pb = vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3])));
-    return vcombine_u8(pa, pb);
-}
-
-/** Signed quantize a neon vector holding 16 floating point values.
- *
- * @param[in] qv Input values to be quantized.
- * @param[in] qi Quantization information to be used in the computation.
- *
- * @return A neon vector holding the quantized values
- */
-inline int8x16_t vquantize_signed(const float32x4x4_t &qv, const UniformQuantizationInfo &qi)
-{
-    const float       scale     = qi.scale;
-    const int         offset    = qi.offset;
-    const float32x4_t voffset   = vdupq_n_f32(offset);
-    const float32x4_t vinvscale = vdupq_n_f32(1.f / scale);
-    const int32x4x4_t rf =
-    {
-        {
-#ifdef __aarch64__
-            vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
-            vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
-            vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[2], vinvscale)),
-            vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[3], vinvscale)),
-#else  //__aarch64__
-            vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
-            vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
-            vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[2], vinvscale)),
-            vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[3], vinvscale)),
-#endif //__aarch64__
-        }
-    };
-    const int8x8_t pa = vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])));
-    const int8x8_t pb = vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3])));
-    return vcombine_s8(pa, pb);
-}
-
-/** Quantize to QASYMM16 a neon vector holding 16 floating point values.
- *
- * @param[in] qv Input values to be quantized.
- * @param[in] qi Quantization information to be used in the computation.
- *
- * @return A neon vector holding the quantized values
- */
-inline uint16x8x2_t vquantize_qasymm16(const float32x4x4_t &qv, const UniformQuantizationInfo &qi)
-{
-    const float       scale     = qi.scale;
-    const int         offset    = qi.offset;
-    const float32x4_t voffset   = vdupq_n_f32(offset);
-    const float32x4_t vinvscale = vdupq_n_f32(1.f / scale);
-    const int32x4x4_t rf =
-    {
-        {
-#ifdef __aarch64__
-            vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
-            vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
-            vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[2], vinvscale)),
-            vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[3], vinvscale)),
-#else  //__aarch64__
-            vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
-            vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
-            vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[2], vinvscale)),
-            vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[3], vinvscale)),
-#endif //__aarch64__
-        }
-    };
-    const uint16x8_t pa = vcombine_u16(vqmovun_s32(rf.val[0]), vqmovun_s32(rf.val[1]));
-    const uint16x8_t pb = vcombine_u16(vqmovun_s32(rf.val[2]), vqmovun_s32(rf.val[3]));
-    return { pa, pb };
-}
-} // namespace arm_compute
-#include "arm_compute/core/NEON/NEAsymm.inl"
-#endif // ARM_COMPUTE_NEASYMM_H
diff --git a/arm_compute/core/NEON/NEAsymm.inl b/arm_compute/core/NEON/NEAsymm.inl
deleted file mode 100644
index 71205e0403..0000000000
--- a/arm_compute/core/NEON/NEAsymm.inl
+++ /dev/null
@@ -1,92 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-namespace arm_compute
-{
-inline qasymm8x16_t vmlaq_qasymm8(qasymm8x16_t vd, float32x4_t vs, float32x4_t vo)
-{
-    // Convert uint8 vectors to uint16 vectors
-    const uint8x8_t vd_low        = vget_low_u8(vd);
-    const uint8x8_t vd_high       = vget_high_u8(vd);
-    uint16x8_t      vd_low_u16x8  = vmovl_u8(vd_low);
-    uint16x8_t      vd_high_u16x8 = vmovl_u8(vd_high);
-    // Convert uint16 vectors to uint32 vectors
-    uint32x4_t A_u32x4 = vmovl_u16(vget_low_u16(vd_low_u16x8));
-    uint32x4_t B_u32x4 = vmovl_u16(vget_high_u16(vd_low_u16x8));
-    uint32x4_t C_u32x4 = vmovl_u16(vget_low_u16(vd_high_u16x8));
-    uint32x4_t D_u32x4 = vmovl_u16(vget_high_u16(vd_high_u16x8));
-    // Convert uint32 vectors to float32 vectors
-    float32x4_t A_f32x4 = vcvtq_f32_u32(A_u32x4);
-    float32x4_t B_f32x4 = vcvtq_f32_u32(B_u32x4);
-    float32x4_t C_f32x4 = vcvtq_f32_u32(C_u32x4);
-    float32x4_t D_f32x4 = vcvtq_f32_u32(D_u32x4);
-    // vd = vd*vs + vo
-    A_f32x4 = vmlaq_f32(vo, A_f32x4, vs);
-    B_f32x4 = vmlaq_f32(vo, B_f32x4, vs);
-    C_f32x4 = vmlaq_f32(vo, C_f32x4, vs);
-    D_f32x4 = vmlaq_f32(vo, D_f32x4, vs);
-    // Convert float32 vectors to uint32 vectors
-    A_u32x4 = vcvtq_u32_f32(A_f32x4);
-    B_u32x4 = vcvtq_u32_f32(B_f32x4);
-    C_u32x4 = vcvtq_u32_f32(C_f32x4);
-    D_u32x4 = vcvtq_u32_f32(D_f32x4);
-    // Convert uint32 vectors to uint16 vectors (with saturation)
-    vd_low_u16x8  = vcombine_u16(vqmovn_u32(A_u32x4), vqmovn_u32(B_u32x4));
-    vd_high_u16x8 = vcombine_u16(vqmovn_u32(C_u32x4), vqmovn_u32(D_u32x4));
-    // convert uint16 vectors to uint8 vectors (with saturation)
-    return vcombine_u8(vqmovn_u16(vd_low_u16x8), vqmovn_u16(vd_high_u16x8));
-}
-inline qasymm8x16_signed_t vmlaq_qasymm8_signed(qasymm8x16_signed_t vd, float32x4_t vs, float32x4_t vo)
-{
-    // Convert uint8 vectors to int16 vectors
-    const int8x8_t vd_low        = vget_low_s8(vd);
-    const int8x8_t vd_high       = vget_high_s8(vd);
-    int16x8_t      vd_low_s16x8  = vmovl_s8(vd_low);
-    int16x8_t      vd_high_s16x8 = vmovl_s8(vd_high);
-    // Convert int16 vectors to int32 vectors
-    int32x4_t A_s32x4 = vmovl_s16(vget_low_s16(vd_low_s16x8));
-    int32x4_t B_s32x4 = vmovl_s16(vget_high_s16(vd_low_s16x8));
-    int32x4_t C_s32x4 = vmovl_s16(vget_low_s16(vd_high_s16x8));
-    int32x4_t D_s32x4 = vmovl_s16(vget_high_s16(vd_high_s16x8));
-    // Convert int32 vectors to float32 vectors
-    float32x4_t A_f32x4 = vcvtq_f32_s32(A_s32x4);
-    float32x4_t B_f32x4 = vcvtq_f32_s32(B_s32x4);
-    float32x4_t C_f32x4 = vcvtq_f32_s32(C_s32x4);
-    float32x4_t D_f32x4 = vcvtq_f32_s32(D_s32x4);
-    // vd = vd*vs + vo
-    A_f32x4 = vmlaq_f32(vo, A_f32x4, vs);
-    B_f32x4 = vmlaq_f32(vo, B_f32x4, vs);
-    C_f32x4 = vmlaq_f32(vo, C_f32x4, vs);
-    D_f32x4 = vmlaq_f32(vo, D_f32x4, vs);
-    // Convert float32 vectors to int32 vectors
-    A_s32x4 = vcvtq_s32_f32(A_f32x4);
-    B_s32x4 = vcvtq_s32_f32(B_f32x4);
-    C_s32x4 = vcvtq_s32_f32(C_f32x4);
-    D_s32x4 = vcvtq_s32_f32(D_f32x4);
-    // Convert int32 vectors to int16 vectors (with saturation)
-    vd_low_s16x8  = vcombine_s16(vqmovn_s32(A_s32x4), vqmovn_s32(B_s32x4));
-    vd_high_s16x8 = vcombine_s16(vqmovn_s32(C_s32x4), vqmovn_s32(D_s32x4));
-    // convert int16 vectors to int8 vectors (with saturation)
-    return vcombine_s8(vqmovn_s16(vd_low_s16x8), vqmovn_s16(vd_high_s16x8));
-}
-} // namespace arm_compute
diff --git a/arm_compute/core/NEON/NEColorConvertHelper.inl b/arm_compute/core/NEON/NEColorConvertHelper.inl
deleted file mode 100644
index 2cf52e58d2..0000000000
--- a/arm_compute/core/NEON/NEColorConvertHelper.inl
+++ /dev/null
@@ -1,1045 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/IMultiImage.h"
-#include "arm_compute/core/NEON/NEMath.h"
-#include "arm_compute/core/Utils.h"
-
-#include <arm_neon.h>
-
-namespace
-{
-#ifndef DOXYGEN_SKIP_THIS
-constexpr float red_coef_bt709    = 1.5748F;
-constexpr float green_coef_bt709  = -0.1873f;
-constexpr float green_coef2_bt709 = -0.4681f;
-constexpr float blue_coef_bt709   = 1.8556f;
-
-constexpr float rgb2yuv_bt709_kr = 0.2126f;
-constexpr float rgb2yuv_bt709_kb = 0.0722f;
-// K_g = 1 - K_r - K_b
-constexpr float rgb2yuv_bt709_kg = 0.7152f;
-// C_u = 1 / (2 * (1 - K_b))
-constexpr float rgb2yuv_bt709_cu = 0.5389f;
-// C_v = 1 / (2 * (1 - K_r))
-constexpr float rgb2yuv_bt709_cv = 0.6350f;
-
-constexpr float rgb2u8_red_coef   = 0.2126f;
-constexpr float rgb2u8_green_coef = 0.7152f;
-constexpr float rgb2u8_blue_coef  = 0.0722f;
-
-inline float32x4_t rgb_to_greyscale_calculation(const float32x4_t &rcolor, const float32x4_t &gcolor, const float32x4_t &bcolor,
-                                                const float rcoef, const float gcoef, const float bcoef)
-{
-    float32x4_t greyscale = vmulq_n_f32(rcolor, rcoef);
-    greyscale             = vmlaq_n_f32(greyscale, gcolor, gcoef);
-    greyscale             = vmlaq_n_f32(greyscale, bcolor, bcoef);
-    return greyscale;
-}
-
-inline void rgb_to_u8_conversion(const uint8x16x3_t &in, uint8x16_t &out)
-{
-    float32x4x4_t out_float32;
-
-    //Conversion from 3(RGB) 4 uint8s to 3(RGB) 4 floats
-    const float32x4x4_t r_float32 = arm_compute::convert_uint8x16_to_float32x4x4(in.val[0]);
-    const float32x4x4_t g_float32 = arm_compute::convert_uint8x16_to_float32x4x4(in.val[1]);
-    const float32x4x4_t b_float32 = arm_compute::convert_uint8x16_to_float32x4x4(in.val[2]);
-
-    //New grayscale image = ( (RED_COEFF * R) + (GREEN_COEFF * G) + (BLUE_COEFF * B) )
-    //Computation of 1(Greyscale) 4 uint8 using 3(RGB) 4 uint8s float
-    out_float32.val[0] = rgb_to_greyscale_calculation(r_float32.val[0], g_float32.val[0], b_float32.val[0],
-                                                      rgb2u8_red_coef, rgb2u8_green_coef, rgb2u8_blue_coef);
-
-    out_float32.val[1] = rgb_to_greyscale_calculation(r_float32.val[1], g_float32.val[1], b_float32.val[1],
-                                                      rgb2u8_red_coef, rgb2u8_green_coef, rgb2u8_blue_coef);
-
-    out_float32.val[2] = rgb_to_greyscale_calculation(r_float32.val[2], g_float32.val[2], b_float32.val[2],
-                                                      rgb2u8_red_coef, rgb2u8_green_coef, rgb2u8_blue_coef);
-
-    out_float32.val[3] = rgb_to_greyscale_calculation(r_float32.val[3], g_float32.val[3], b_float32.val[3],
-                                                      rgb2u8_red_coef, rgb2u8_green_coef, rgb2u8_blue_coef);
-
-    //Conversion from 1(Greyscale) 4 floats to 1(Greyscale) 4 uint8s
-    arm_compute::convert_float32x4x4_to_uint8x16(out_float32, out);
-}
-
-inline void rgb_to_yuv_calculation(const float32x4_t &rvec, const float32x4_t &gvec, const float32x4_t &bvec,
-                                   float32x4_t &yvec, float32x4_t &uvec, float32x4_t &vvec)
-{
-    /*
-    Y'= 0.2126*R' + 0.7152*G' + 0.0722*B'
-    U'=-0.1146*R' - 0.3854*G' + 0.5000*B'
-    V'= 0.5000*R' - 0.4542*G' - 0.0458*B'
-    */
-    const auto c128 = vdupq_n_f32(128.f);
-
-    // Y = R * K_r + G * (1 - K_r - K_b) * B * K_b
-    yvec = vmulq_n_f32(rvec, rgb2yuv_bt709_kr);
-    yvec = vmlaq_n_f32(yvec, gvec, rgb2yuv_bt709_kg);
-    yvec = vmlaq_n_f32(yvec, bvec, rgb2yuv_bt709_kb);
-
-    // U = (B - Y) / (2 * (1 - K_b))
-    uvec = vsubq_f32(bvec, yvec);
-    uvec = vmlaq_n_f32(c128, uvec, rgb2yuv_bt709_cu);
-
-    // V = (R - Y) / (2 * (1 - K_r))
-    vvec = vsubq_f32(rvec, yvec);
-    vvec = vmlaq_n_f32(c128, vvec, rgb2yuv_bt709_cv);
-}
-
-inline void yuyv_to_rgb_calculation(const float32x4_t &yvec_val, float32x4_t uvec_val, const float32x4_t &yyvec_val,
-                                    float32x4_t vvec_val, unsigned char *output_ptr, const bool alpha)
-{
-    float32x4x3_t rgb1, rgb2;
-
-    // Compute: cb - 128 and cr - 128;
-    const auto c128 = vdupq_n_f32(128.f);
-    uvec_val        = vsubq_f32(uvec_val, c128);
-    vvec_val        = vsubq_f32(vvec_val, c128);
-
-    // Compute:
-    // r = 0.0000f*f_u + 1.5748f*f_v;
-    // g = 0.1873f*f_u - 0.4681f*f_v;
-    // b = 1.8556f*f_u + 0.0000f*f_v;
-    const auto red   = vmulq_n_f32(vvec_val, red_coef_bt709);
-    const auto blue  = vmulq_n_f32(uvec_val, blue_coef_bt709);
-    const auto green = vaddq_f32(vmulq_n_f32(uvec_val, green_coef_bt709),
-                                 vmulq_n_f32(vvec_val, green_coef2_bt709));
-
-    // Compute the final r,g,b values using y1 for the first texel and y2 for the second one.
-    // the result is stored in two float32x4x3_t which then are converted to one uint8x8x3_t
-    // and written back to memory using vst3 instruction
-
-    rgb1.val[0] = vaddq_f32(yvec_val, red);
-    rgb1.val[1] = vaddq_f32(yvec_val, green);
-    rgb1.val[2] = vaddq_f32(yvec_val, blue);
-
-    rgb2.val[0] = vaddq_f32(yyvec_val, red);
-    rgb2.val[1] = vaddq_f32(yyvec_val, green);
-    rgb2.val[2] = vaddq_f32(yyvec_val, blue);
-
-    uint8x8x3_t u8_rgb;
-    arm_compute::convert_float32x4x3_to_uint8x8x3(rgb1, rgb2, u8_rgb);
-
-    if(!alpha)
-    {
-        vst3_lane_u8(&output_ptr[0], u8_rgb, 0);
-        vst3_lane_u8(&output_ptr[3], u8_rgb, 4);
-        vst3_lane_u8(&output_ptr[6], u8_rgb, 1);
-        vst3_lane_u8(&output_ptr[9], u8_rgb, 5);
-        vst3_lane_u8(&output_ptr[12], u8_rgb, 2);
-        vst3_lane_u8(&output_ptr[15], u8_rgb, 6);
-        vst3_lane_u8(&output_ptr[18], u8_rgb, 3);
-        vst3_lane_u8(&output_ptr[21], u8_rgb, 7);
-    }
-    else
-    {
-        uint8x8x4_t u8_rgba;
-        u8_rgba.val[0] = u8_rgb.val[0];
-        u8_rgba.val[1] = u8_rgb.val[1];
-        u8_rgba.val[2] = u8_rgb.val[2];
-        u8_rgba.val[3] = vdup_n_u8(255);
-        vst4_lane_u8(&output_ptr[0], u8_rgba, 0);
-        vst4_lane_u8(&output_ptr[4], u8_rgba, 4);
-        vst4_lane_u8(&output_ptr[8], u8_rgba, 1);
-        vst4_lane_u8(&output_ptr[12], u8_rgba, 5);
-        vst4_lane_u8(&output_ptr[16], u8_rgba, 2);
-        vst4_lane_u8(&output_ptr[20], u8_rgba, 6);
-        vst4_lane_u8(&output_ptr[24], u8_rgba, 3);
-        vst4_lane_u8(&output_ptr[28], u8_rgba, 7);
-    }
-}
-
-inline uint8x16x3_t load_rgb(const unsigned char *const ptr, const bool alpha)
-{
-    uint8x16x3_t rgb;
-
-    if(alpha)
-    {
-        const auto tmp = vld4q_u8(ptr);
-        rgb.val[0]     = tmp.val[0];
-        rgb.val[1]     = tmp.val[1];
-        rgb.val[2]     = tmp.val[2];
-    }
-    else
-    {
-        rgb = vld3q_u8(ptr);
-    }
-
-    return rgb;
-}
-
-inline void rgb_to_yuv_conversion(uint8x16x3_t &vec_top, uint8x16x3_t &vec_bottom)
-{
-    // Convert the uint8x16_t to float32x4x4_t
-    const float32x4x4_t frvec_top = arm_compute::convert_uint8x16_to_float32x4x4(vec_top.val[0]);
-    const float32x4x4_t fgvec_top = arm_compute::convert_uint8x16_to_float32x4x4(vec_top.val[1]);
-    const float32x4x4_t fbvec_top = arm_compute::convert_uint8x16_to_float32x4x4(vec_top.val[2]);
-
-    const float32x4x4_t frvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(vec_bottom.val[0]);
-    const float32x4x4_t fgvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(vec_bottom.val[1]);
-    const float32x4x4_t fbvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(vec_bottom.val[2]);
-
-    float32x4x4_t fyvec_top, fuvec_top, fvvec_top;
-    float32x4x4_t fyvec_bottom, fuvec_bottom, fvvec_bottom;
-
-    for(auto i = 0; i < 4; ++i)
-    {
-        rgb_to_yuv_calculation(frvec_top.val[i], fgvec_top.val[i], fbvec_top.val[i],
-                               fyvec_top.val[i], fuvec_top.val[i], fvvec_top.val[i]);
-        rgb_to_yuv_calculation(frvec_bottom.val[i], fgvec_bottom.val[i], fbvec_bottom.val[i],
-                               fyvec_bottom.val[i], fuvec_bottom.val[i], fvvec_bottom.val[i]);
-    }
-
-    arm_compute::convert_float32x4x4_to_uint8x16(fyvec_top, vec_top.val[0]);
-    arm_compute::convert_float32x4x4_to_uint8x16(fuvec_top, vec_top.val[1]);
-    arm_compute::convert_float32x4x4_to_uint8x16(fvvec_top, vec_top.val[2]);
-    arm_compute::convert_float32x4x4_to_uint8x16(fyvec_bottom, vec_bottom.val[0]);
-    arm_compute::convert_float32x4x4_to_uint8x16(fuvec_bottom, vec_bottom.val[1]);
-    arm_compute::convert_float32x4x4_to_uint8x16(fvvec_bottom, vec_bottom.val[2]);
-}
-
-inline void store_rgb_to_nv12(const uint8x16_t &rvec_top, const uint8x16_t &gvec_top, const uint8x16_t &bvec_top,
-                              const uint8x16_t &rvec_bottom, const uint8x16_t &gvec_bottom, const uint8x16_t &bvec_bottom,
-                              unsigned char *const __restrict out_y_top, unsigned char *const __restrict out_y_bottom,
-                              unsigned char *const __restrict out_uv)
-{
-    uint8x16x3_t vec_top, vec_bottom;
-    vec_top.val[0]    = rvec_top;
-    vec_top.val[1]    = gvec_top;
-    vec_top.val[2]    = bvec_top;
-    vec_bottom.val[0] = rvec_bottom;
-    vec_bottom.val[1] = gvec_bottom;
-    vec_bottom.val[2] = bvec_bottom;
-
-    rgb_to_yuv_conversion(vec_top, vec_bottom);
-
-    vst1q_u8(out_y_top, vec_top.val[0]);
-    vst1q_u8(out_y_bottom, vec_bottom.val[0]);
-
-    const auto uvec = vuzpq_u8(vec_top.val[1], vec_bottom.val[1]);
-    const auto vvec = vuzpq_u8(vec_top.val[2], vec_bottom.val[2]);
-    const auto utmp = vrhaddq_u8(uvec.val[0], uvec.val[1]);
-    const auto vtmp = vrhaddq_u8(vvec.val[0], vvec.val[1]);
-
-    uint8x8x2_t uvvec;
-    uvvec.val[0] = vhadd_u8(vget_low_u8(utmp), vget_high_u8(utmp));
-    uvvec.val[1] = vhadd_u8(vget_low_u8(vtmp), vget_high_u8(vtmp));
-
-    vst2_u8(out_uv, uvvec);
-}
-
-inline void store_rgb_to_iyuv(const uint8x16_t &rvec_top, const uint8x16_t &gvec_top, const uint8x16_t &bvec_top,
-                              const uint8x16_t &rvec_bottom, const uint8x16_t &gvec_bottom, const uint8x16_t &bvec_bottom,
-                              unsigned char *const __restrict out_y_top, unsigned char *const __restrict out_y_bottom,
-                              unsigned char *const __restrict out_u,
-                              unsigned char *const __restrict out_v)
-{
-    uint8x16x3_t vec_top, vec_bottom;
-    vec_top.val[0]    = rvec_top;
-    vec_top.val[1]    = gvec_top;
-    vec_top.val[2]    = bvec_top;
-    vec_bottom.val[0] = rvec_bottom;
-    vec_bottom.val[1] = gvec_bottom;
-    vec_bottom.val[2] = bvec_bottom;
-
-    rgb_to_yuv_conversion(vec_top, vec_bottom);
-
-    vst1q_u8(out_y_top, vec_top.val[0]);
-    vst1q_u8(out_y_bottom, vec_bottom.val[0]);
-
-    const auto uvvec_top    = vuzpq_u8(vec_top.val[1], vec_top.val[2]);
-    const auto uvvec_bottom = vuzpq_u8(vec_bottom.val[1], vec_bottom.val[2]);
-    const auto uvvec        = vhaddq_u8(vrhaddq_u8(uvvec_top.val[0], uvvec_top.val[1]),
-                                        vrhaddq_u8(uvvec_bottom.val[0], uvvec_bottom.val[1]));
-
-    vst1_u8(out_u, vget_low_u8(uvvec));
-    vst1_u8(out_v, vget_high_u8(uvvec));
-}
-
-inline void store_rgb_to_yuv4(const uint8x16_t &rvec, const uint8x16_t &gvec, const uint8x16_t &bvec,
-                              unsigned char *const __restrict out_y,
-                              unsigned char *const __restrict out_u,
-                              unsigned char *const __restrict out_v)
-{
-    // Convert the uint8x16_t to float32x4x4_t
-    const float32x4x4_t frvec = arm_compute::convert_uint8x16_to_float32x4x4(rvec);
-    const float32x4x4_t fgvec = arm_compute::convert_uint8x16_to_float32x4x4(gvec);
-    const float32x4x4_t fbvec = arm_compute::convert_uint8x16_to_float32x4x4(bvec);
-
-    float32x4x4_t fyvec, fuvec, fvvec;
-    for(auto i = 0; i < 4; ++i)
-    {
-        rgb_to_yuv_calculation(frvec.val[i], fgvec.val[i], fbvec.val[i],
-                               fyvec.val[i], fuvec.val[i], fvvec.val[i]);
-    }
-
-    uint8x16_t yvec, uvec, vvec;
-    arm_compute::convert_float32x4x4_to_uint8x16(fyvec, yvec);
-    arm_compute::convert_float32x4x4_to_uint8x16(fuvec, uvec);
-    arm_compute::convert_float32x4x4_to_uint8x16(fvvec, vvec);
-
-    vst1q_u8(out_y, yvec);
-    vst1q_u8(out_u, uvec);
-    vst1q_u8(out_v, vvec);
-}
-#endif /* DOXYGEN_SKIP_THIS */
-}
-
-namespace arm_compute
-{
-/** Convert RGB to RGBX.
- *
- * @param[in]  input  Input RGB data buffer.
- * @param[out] output Output RGBX buffer.
- * @param[in]  win    Window for iterating the buffers.
- *
- */
-void colorconvert_rgb_to_rgbx(const void *__restrict input, void *__restrict output, const Window &win)
-{
-    ARM_COMPUTE_ERROR_ON(nullptr == input);
-    ARM_COMPUTE_ERROR_ON(nullptr == output);
-
-    const auto input_ptr  = static_cast<const IImage *__restrict>(input);
-    const auto output_ptr = static_cast<IImage *__restrict>(output);
-
-    Iterator in(input_ptr, win);
-    Iterator out(output_ptr, win);
-
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        const auto   ta1 = vld3q_u8(in.ptr());
-        uint8x16x4_t ta2;
-        ta2.val[0] = ta1.val[0];
-        ta2.val[1] = ta1.val[1];
-        ta2.val[2] = ta1.val[2];
-        ta2.val[3] = vdupq_n_u8(255);
-        vst4q_u8(out.ptr(), ta2);
-    },
-    in, out);
-}
-
-/** Convert RGB to U8.
- *
- * @param[in]  input  Input RGB data buffer.
- * @param[out] output Output U8 buffer.
- * @param[in]  win    Window for iterating the buffers.
- *
- */
-void colorconvert_rgb_to_u8(const void *__restrict input, void *__restrict output, const Window &win)
-{
-    ARM_COMPUTE_ERROR_ON(nullptr == input);
-    ARM_COMPUTE_ERROR_ON(nullptr == output);
-
-    const auto input_ptr  = static_cast<const IImage *__restrict>(input);
-    const auto output_ptr = static_cast<IImage *__restrict>(output);
-
-    Iterator in(input_ptr, win);
-    Iterator out(output_ptr, win);
-
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        const auto ta1 = vld3q_u8(in.ptr());
-        uint8x16_t ta2;
-        rgb_to_u8_conversion(ta1, ta2);
-        vst1q_u8(out.ptr(), ta2);
-    },
-    in, out);
-}
-
-/** Convert RGBX to RGB.
- *
- * @param[in]  input  Input RGBX data buffer.
- * @param[out] output Output RGB buffer.
- * @param[in]  win    Window for iterating the buffers.
- *
- */
-void colorconvert_rgbx_to_rgb(const void *input, void *output, const Window &win)
-{
-    ARM_COMPUTE_ERROR_ON(nullptr == input);
-    ARM_COMPUTE_ERROR_ON(nullptr == output);
-
-    const auto input_ptr  = static_cast<const IImage *__restrict>(input);
-    const auto output_ptr = static_cast<IImage *__restrict>(output);
-
-    Iterator in(input_ptr, win);
-    Iterator out(output_ptr, win);
-
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        const auto   ta1 = vld4q_u8(in.ptr());
-        uint8x16x3_t ta2;
-        ta2.val[0] = ta1.val[0];
-        ta2.val[1] = ta1.val[1];
-        ta2.val[2] = ta1.val[2];
-        vst3q_u8(out.ptr(), ta2);
-    },
-    in, out);
-}
-
-/** Convert YUYV to RGB.
- *
- * @param[in]  input  Input YUYV data buffer.
- * @param[out] output Output RGB buffer.
- * @param[in]  win    Window for iterating the buffers.
- *
- */
-template <bool yuyv, bool alpha>
-void colorconvert_yuyv_to_rgb(const void *__restrict input, void *__restrict output, const Window &win)
-{
-    ARM_COMPUTE_ERROR_ON(nullptr == input);
-    ARM_COMPUTE_ERROR_ON(nullptr == output);
-
-    const auto input_ptr  = static_cast<const IImage *__restrict>(input);
-    const auto output_ptr = static_cast<IImage *__restrict>(output);
-
-    constexpr auto element_size = alpha ? 32 : 24;
-    constexpr auto shift        = yuyv ? 0 : 1;
-
-    Iterator in(input_ptr, win);
-    Iterator out(output_ptr, win);
-
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        const auto ta = vld4q_u8(in.ptr());
-        //ta.val[0] = Y0 Y2 Y4 Y6 ...
-        //ta.val[1] = U0 U2 U4 U6 ...
-        //ta.val[2] = Y1 Y3 Y5 Y7 ...
-        //ta.val[3] = V0 V2 V4 V7 ...
-
-        // Convert the uint8x16x4_t to float32x4x4_t
-        const float32x4x4_t yvec  = arm_compute::convert_uint8x16_to_float32x4x4(ta.val[0 + shift]);
-        const float32x4x4_t uvec  = arm_compute::convert_uint8x16_to_float32x4x4(ta.val[1 - shift]);
-        const float32x4x4_t yyvec = arm_compute::convert_uint8x16_to_float32x4x4(ta.val[2 + shift]);
-        const float32x4x4_t vvec  = arm_compute::convert_uint8x16_to_float32x4x4(ta.val[3 - shift]);
-
-        yuyv_to_rgb_calculation(yvec.val[0], uvec.val[0], yyvec.val[0], vvec.val[0], out.ptr() + 0 * element_size, alpha);
-        yuyv_to_rgb_calculation(yvec.val[1], uvec.val[1], yyvec.val[1], vvec.val[1], out.ptr() + 1 * element_size, alpha);
-        yuyv_to_rgb_calculation(yvec.val[2], uvec.val[2], yyvec.val[2], vvec.val[2], out.ptr() + 2 * element_size, alpha);
-        yuyv_to_rgb_calculation(yvec.val[3], uvec.val[3], yyvec.val[3], vvec.val[3], out.ptr() + 3 * element_size, alpha);
-    },
-    in, out);
-}
-
-/** Convert NV12 to RGB.
- *
- * @param[in]  input  Input NV12 data buffer.
- * @param[out] output Output RGB buffer.
- * @param[in]  win    Window for iterating the buffers.
- *
- */
-template <bool uv, bool alpha>
-void colorconvert_nv12_to_rgb(const void *__restrict input, void *__restrict output, const Window &win)
-{
-    ARM_COMPUTE_ERROR_ON(nullptr == input);
-    ARM_COMPUTE_ERROR_ON(nullptr == output);
-    win.validate();
-
-    const auto input_ptr  = static_cast<const IMultiImage *__restrict>(input);
-    const auto output_ptr = static_cast<IImage *__restrict>(output);
-
-    constexpr auto element_size = alpha ? 32 : 24;
-    const auto     out_stride   = output_ptr->info()->strides_in_bytes().y();
-    constexpr auto shift        = uv ? 0 : 1;
-
-    // UV's width and height are subsampled
-    Window win_uv(win);
-    win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win.x().step() / 2));
-    win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1));
-    win_uv.validate();
-
-    Iterator in_y(input_ptr->plane(0), win);
-    Iterator in_uv(input_ptr->plane(1), win_uv);
-    Iterator out(output_ptr, win);
-
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        const auto ta_y_top    = vld2q_u8(in_y.ptr());
-        const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y());
-        const auto ta_uv       = vld2q_u8(in_uv.ptr());
-        //ta_y.val[0] = Y0 Y2 Y4 Y6 ...
-        //ta_y.val[1] = Y1 Y3 Y5 Y7 ...
-        //ta_uv.val[0] = U0 U2 U4 U6 ...
-        //ta_uv.val[1] = V0 V2 V4 V6 ...
-
-        // Convert the uint8x16x4_t to float32x4x4_t
-        float32x4x4_t yvec_top     = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_top.val[0]);
-        float32x4x4_t yyvec_top    = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_top.val[1]);
-        float32x4x4_t yvec_bottom  = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_bottom.val[0]);
-        float32x4x4_t yyvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_bottom.val[1]);
-        float32x4x4_t uvec         = arm_compute::convert_uint8x16_to_float32x4x4(ta_uv.val[0 + shift]);
-        float32x4x4_t vvec         = arm_compute::convert_uint8x16_to_float32x4x4(ta_uv.val[1 - shift]);
-
-        yuyv_to_rgb_calculation(yvec_top.val[0], uvec.val[0], yyvec_top.val[0], vvec.val[0], out.ptr() + 0 * element_size, alpha);
-        yuyv_to_rgb_calculation(yvec_top.val[1], uvec.val[1], yyvec_top.val[1], vvec.val[1], out.ptr() + 1 * element_size, alpha);
-        yuyv_to_rgb_calculation(yvec_top.val[2], uvec.val[2], yyvec_top.val[2], vvec.val[2], out.ptr() + 2 * element_size, alpha);
-        yuyv_to_rgb_calculation(yvec_top.val[3], uvec.val[3], yyvec_top.val[3], vvec.val[3], out.ptr() + 3 * element_size, alpha);
-
-        yuyv_to_rgb_calculation(yvec_bottom.val[0], uvec.val[0], yyvec_bottom.val[0], vvec.val[0], out.ptr() + out_stride + 0 * element_size, alpha);
-        yuyv_to_rgb_calculation(yvec_bottom.val[1], uvec.val[1], yyvec_bottom.val[1], vvec.val[1], out.ptr() + out_stride + 1 * element_size, alpha);
-        yuyv_to_rgb_calculation(yvec_bottom.val[2], uvec.val[2], yyvec_bottom.val[2], vvec.val[2], out.ptr() + out_stride + 2 * element_size, alpha);
-        yuyv_to_rgb_calculation(yvec_bottom.val[3], uvec.val[3], yyvec_bottom.val[3], vvec.val[3], out.ptr() + out_stride + 3 * element_size, alpha);
-    },
-    in_y, in_uv, out);
-}
-
-/** Convert IYUV to RGB.
- *
- * @param[in]  input  Input IYUV data buffer.
- * @param[out] output Output RGB buffer.
- * @param[in]  win    Window for iterating the buffers.
- *
- */
-template <bool alpha>
-void colorconvert_iyuv_to_rgb(const void *__restrict input, void *__restrict output, const Window &win)
-{
-    ARM_COMPUTE_ERROR_ON(nullptr == input);
-    ARM_COMPUTE_ERROR_ON(nullptr == output);
-    win.validate();
-
-    const auto input_ptr  = static_cast<const IMultiImage *__restrict>(input);
-    const auto output_ptr = static_cast<IImage *__restrict>(output);
-
-    constexpr auto element_size = alpha ? 32 : 24;
-    const auto     out_stride   = output_ptr->info()->strides_in_bytes().y();
-
-    // UV's width and height are subsampled
-    Window win_uv(win);
-    win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2));
-    win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1));
-    win_uv.validate();
-
-    Iterator in_y(input_ptr->plane(0), win);
-    Iterator in_u(input_ptr->plane(1), win_uv);
-    Iterator in_v(input_ptr->plane(2), win_uv);
-    Iterator out(output_ptr, win);
-
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        const auto *y_top_ptr    = in_y.ptr();
-        const auto *y_bottom_ptr = in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y();
-        const auto *u_ptr        = in_u.ptr();
-        const auto *v_ptr        = in_v.ptr();
-
-        // Work-around issue in gcc 9(>=) where vld2q might cause issues with register allocation
-#if defined(__arch64__)
-        const auto ta0_y_top    = vld1q_u8(y_top_ptr);
-        const auto ta1_y_top    = vld1q_u8(y_top_ptr + 16);
-        const auto ta0_y_bottom = vld1q_u8(y_bottom_ptr);
-        const auto ta1_y_bottom = vld1q_u8(y_bottom_ptr + 16);
-        const auto ta_u         = vld1q_u8(u_ptr);
-        const auto ta_v         = vld1q_u8(v_ptr);
-
-        // Convert the uint8x16x4_t to float32x4x4_t
-        float32x4x4_t yvec_top     = arm_compute::convert_uint8x16_to_float32x4x4(vuzp1q_u8(ta0_y_top, ta1_y_top));
-        float32x4x4_t yyvec_top    = arm_compute::convert_uint8x16_to_float32x4x4(vuzp2q_u8(ta0_y_top, ta1_y_top));
-        float32x4x4_t yvec_bottom  = arm_compute::convert_uint8x16_to_float32x4x4(vuzp1q_u8(ta0_y_bottom, ta1_y_bottom));
-        float32x4x4_t yyvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(vuzp2q_u8(ta0_y_bottom, ta1_y_bottom));
-        float32x4x4_t uvec         = arm_compute::convert_uint8x16_to_float32x4x4(ta_u);
-        float32x4x4_t vvec         = arm_compute::convert_uint8x16_to_float32x4x4(ta_v);
-#else  /* defined(__arch64__) */
-        const auto ta_y_top    = vld2q_u8(y_top_ptr);
-        const auto ta_y_bottom = vld2q_u8(y_bottom_ptr);
-        const auto ta_u        = vld1q_u8(u_ptr);
-        const auto ta_v        = vld1q_u8(v_ptr);
-        //ta_y.val[0] = Y0 Y2 Y4 Y6 ...
-        //ta_y.val[1] = Y1 Y3 Y5 Y7 ...
-        //ta_u.val[0] = U0 U2 U4 U6 ...
-        //ta_v.val[0] = V0 V2 V4 V6 ...
-
-        // Convert the uint8x16x4_t to float32x4x4_t
-        float32x4x4_t yvec_top     = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_top.val[0]);
-        float32x4x4_t yyvec_top    = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_top.val[1]);
-        float32x4x4_t yvec_bottom  = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_bottom.val[0]);
-        float32x4x4_t yyvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_bottom.val[1]);
-        float32x4x4_t uvec         = arm_compute::convert_uint8x16_to_float32x4x4(ta_u);
-        float32x4x4_t vvec         = arm_compute::convert_uint8x16_to_float32x4x4(ta_v);
-#endif /* defined(__arch64__) */
-
-        yuyv_to_rgb_calculation(yvec_top.val[0], uvec.val[0], yyvec_top.val[0], vvec.val[0], out.ptr() + 0 * element_size, alpha);
-        yuyv_to_rgb_calculation(yvec_top.val[1], uvec.val[1], yyvec_top.val[1], vvec.val[1], out.ptr() + 1 * element_size, alpha);
-        yuyv_to_rgb_calculation(yvec_top.val[2], uvec.val[2], yyvec_top.val[2], vvec.val[2], out.ptr() + 2 * element_size, alpha);
-        yuyv_to_rgb_calculation(yvec_top.val[3], uvec.val[3], yyvec_top.val[3], vvec.val[3], out.ptr() + 3 * element_size, alpha);
-
-        yuyv_to_rgb_calculation(yvec_bottom.val[0], uvec.val[0], yyvec_bottom.val[0], vvec.val[0], out.ptr() + out_stride + 0 * element_size, alpha);
-        yuyv_to_rgb_calculation(yvec_bottom.val[1], uvec.val[1], yyvec_bottom.val[1], vvec.val[1], out.ptr() + out_stride + 1 * element_size, alpha);
-        yuyv_to_rgb_calculation(yvec_bottom.val[2], uvec.val[2], yyvec_bottom.val[2], vvec.val[2], out.ptr() + out_stride + 2 * element_size, alpha);
-        yuyv_to_rgb_calculation(yvec_bottom.val[3], uvec.val[3], yyvec_bottom.val[3], vvec.val[3], out.ptr() + out_stride + 3 * element_size, alpha);
-    },
-    in_y, in_u, in_v, out);
-}
-
-/** Convert YUYV to NV12.
- *
- * @param[in]  input  Input YUYV data buffer.
- * @param[out] output Output NV12 buffer.
- * @param[in]  win    Window for iterating the buffers.
- *
- */
-template <bool yuyv>
-void colorconvert_yuyv_to_nv12(const void *__restrict input, void *__restrict output, const Window &win)
-{
-    ARM_COMPUTE_ERROR_ON(nullptr == input);
-    ARM_COMPUTE_ERROR_ON(nullptr == output);
-    win.validate();
-
-    const auto input_ptr  = static_cast<const IImage *__restrict>(input);
-    const auto output_ptr = static_cast<IMultiImage *__restrict>(output);
-
-    constexpr auto shift = yuyv ? 0 : 1;
-
-    // NV12's UV's width and height are subsampled
-    Window win_uv(win);
-    win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2));
-    win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1));
-    win_uv.validate();
-
-    Iterator in(input_ptr, win);
-    Iterator out_y(output_ptr->plane(0), win);
-    Iterator out_uv(output_ptr->plane(1), win_uv);
-
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        const auto ta_top    = vld4q_u8(in.ptr());
-        const auto ta_bottom = vld4q_u8(in.ptr() + input_ptr->info()->strides_in_bytes().y());
-        //ta.val[0] = Y0 Y2 Y4 Y6 ...
-        //ta.val[1] = U0 U2 U4 U6 ...
-        //ta.val[2] = Y1 Y3 Y5 Y7 ...
-        //ta.val[3] = V0 V2 V4 V7 ...
-
-        uint8x16x2_t yvec;
-        yvec.val[0] = ta_top.val[0 + shift];
-        yvec.val[1] = ta_top.val[2 + shift];
-        vst2q_u8(out_y.ptr(), yvec);
-
-        uint8x16x2_t yyvec;
-        yyvec.val[0] = ta_bottom.val[0 + shift];
-        yyvec.val[1] = ta_bottom.val[2 + shift];
-        vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), yyvec);
-
-        uint8x16x2_t uvvec;
-        uvvec.val[0] = vhaddq_u8(ta_top.val[1 - shift], ta_bottom.val[1 - shift]);
-        uvvec.val[1] = vhaddq_u8(ta_top.val[3 - shift], ta_bottom.val[3 - shift]);
-        vst2q_u8(out_uv.ptr(), uvvec);
-    },
-    in, out_y, out_uv);
-}
-
-/** Convert IYUV to NV12.
- *
- * @param[in]  input  Input IYUV data buffer.
- * @param[out] output Output NV12 buffer.
- * @param[in]  win    Window for iterating the buffers.
- *
- */
-void colorconvert_iyuv_to_nv12(const void *__restrict input, void *__restrict output, const Window &win)
-{
-    ARM_COMPUTE_ERROR_ON(nullptr == input);
-    ARM_COMPUTE_ERROR_ON(nullptr == output);
-    win.validate();
-
-    const auto input_ptr  = static_cast<const IMultiImage *__restrict>(input);
-    const auto output_ptr = static_cast<IMultiImage *__restrict>(output);
-
-    // UV's width and height are subsampled
-    Window win_uv(win);
-    win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2));
-    win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1));
-    win_uv.validate();
-
-    Iterator in_y(input_ptr->plane(0), win);
-    Iterator in_u(input_ptr->plane(1), win_uv);
-    Iterator in_v(input_ptr->plane(2), win_uv);
-    Iterator out_y(output_ptr->plane(0), win);
-    Iterator out_uv(output_ptr->plane(1), win_uv);
-
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        const auto   ta_y_top    = vld2q_u8(in_y.ptr());
-        const auto   ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y());
-        uint8x16x2_t ta_uv;
-        ta_uv.val[0] = vld1q_u8(in_u.ptr());
-        ta_uv.val[1] = vld1q_u8(in_v.ptr());
-        //ta_y.val[0] = Y0 Y2 Y4 Y6 ...
-        //ta_y.val[1] = Y1 Y3 Y5 Y7 ...
-        //ta_uv.val[0] = U0 U2 U4 U6 ...
-        //ta_uv.val[1] = V0 V2 V4 V6 ...
-
-        vst2q_u8(out_y.ptr(), ta_y_top);
-        vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom);
-        vst2q_u8(out_uv.ptr(), ta_uv);
-    },
-    in_y, in_u, in_v, out_y, out_uv);
-}
-
-/** Convert NV12 to IYUV.
- *
- * @param[in]  input  Input NV12 data buffer.
- * @param[out] output Output IYUV buffer.
- * @param[in]  win    Window for iterating the buffers.
- *
- */
-template <bool uv>
-void colorconvert_nv12_to_iyuv(const void *__restrict input, void *__restrict output, const Window &win)
-{
-    ARM_COMPUTE_ERROR_ON(nullptr == input);
-    ARM_COMPUTE_ERROR_ON(nullptr == output);
-    win.validate();
-
-    const auto input_ptr  = static_cast<const IMultiImage *__restrict>(input);
-    const auto output_ptr = static_cast<IMultiImage *__restrict>(output);
-
-    constexpr auto shift = uv ? 0 : 1;
-
-    // UV's width and height are subsampled
-    Window win_uv(win);
-    win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2));
-    win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1));
-    win_uv.validate();
-
-    Iterator in_y(input_ptr->plane(0), win);
-    Iterator in_uv(input_ptr->plane(1), win_uv);
-    Iterator out_y(output_ptr->plane(0), win);
-    Iterator out_u(output_ptr->plane(1), win_uv);
-    Iterator out_v(output_ptr->plane(2), win_uv);
-
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        const auto ta_y_top    = vld2q_u8(in_y.ptr());
-        const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y());
-        const auto ta_uv       = vld2q_u8(in_uv.ptr());
-        //ta_y.val[0] = Y0 Y2 Y4 Y6 ...
-        //ta_y.val[1] = Y1 Y3 Y5 Y7 ...
-        //ta_uv.val[0] = U0 U2 U4 U6 ...
-        //ta_uv.val[1] = V0 V2 V4 V6 ...
-
-        vst2q_u8(out_y.ptr(), ta_y_top);
-        vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom);
-        vst1q_u8(out_u.ptr(), ta_uv.val[0 + shift]);
-        vst1q_u8(out_v.ptr(), ta_uv.val[1 - shift]);
-    },
-    in_y, in_uv, out_y, out_u, out_v);
-}
-
-/** Convert YUYV to IYUV.
- *
- * @param[in]  input  Input YUYV data buffer.
- * @param[out] output Output IYUV buffer.
- * @param[in]  win    Window for iterating the buffers.
- *
- */
-template <bool yuyv>
-void colorconvert_yuyv_to_iyuv(const void *__restrict input, void *__restrict output, const Window &win)
-{
-    ARM_COMPUTE_ERROR_ON(nullptr == input);
-    ARM_COMPUTE_ERROR_ON(nullptr == output);
-    win.validate();
-
-    const auto input_ptr  = static_cast<const IImage *__restrict>(input);
-    const auto output_ptr = static_cast<IMultiImage *__restrict>(output);
-
-    constexpr auto shift = yuyv ? 0 : 1;
-
-    // Destination's UV's width and height are subsampled
-    Window win_uv(win);
-    win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2));
-    win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1));
-    win_uv.validate();
-
-    Iterator in(input_ptr, win);
-    Iterator out_y(output_ptr->plane(0), win);
-    Iterator out_u(output_ptr->plane(1), win_uv);
-    Iterator out_v(output_ptr->plane(2), win_uv);
-
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        const auto ta_top    = vld4q_u8(in.ptr());
-        const auto ta_bottom = vld4q_u8(in.ptr() + input_ptr->info()->strides_in_bytes().y());
-        //ta.val[0] = Y0 Y2 Y4 Y6 ...
-        //ta.val[1] = U0 U2 U4 U6 ...
-        //ta.val[2] = Y1 Y3 Y5 Y7 ...
-        //ta.val[3] = V0 V2 V4 V7 ...
-
-        uint8x16x2_t yvec;
-        yvec.val[0] = ta_top.val[0 + shift];
-        yvec.val[1] = ta_top.val[2 + shift];
-        vst2q_u8(out_y.ptr(), yvec);
-
-        uint8x16x2_t yyvec;
-        yyvec.val[0] = ta_bottom.val[0 + shift];
-        yyvec.val[1] = ta_bottom.val[2 + shift];
-        vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), yyvec);
-
-        uint8x16_t uvec;
-        uvec = vhaddq_u8(ta_top.val[1 - shift], ta_bottom.val[1 - shift]);
-        vst1q_u8(out_u.ptr(), uvec);
-
-        uint8x16_t vvec;
-        vvec = vhaddq_u8(ta_top.val[3 - shift], ta_bottom.val[3 - shift]);
-        vst1q_u8(out_v.ptr(), vvec);
-    },
-    in, out_y, out_u, out_v);
-}
-
-/** Convert NV12 to YUV4.
- *
- * @param[in]  input  Input NV12 data buffer.
- * @param[out] output Output YUV4 buffer.
- * @param[in]  win    Window for iterating the buffers.
- *
- */
-template <bool uv>
-void colorconvert_nv12_to_yuv4(const void *__restrict input, void *__restrict output, const Window &win)
-{
-    ARM_COMPUTE_ERROR_ON(nullptr == input);
-    ARM_COMPUTE_ERROR_ON(nullptr == output);
-    win.validate();
-
-    const auto input_ptr  = static_cast<const IMultiImage *__restrict>(input);
-    const auto output_ptr = static_cast<IMultiImage *__restrict>(output);
-
-    constexpr auto shift = uv ? 0 : 1;
-
-    // UV's width and height are subsampled
-    Window win_uv(win);
-    win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2));
-    win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1));
-    win_uv.validate();
-
-    Iterator in_y(input_ptr->plane(0), win);
-    Iterator in_uv(input_ptr->plane(1), win_uv);
-    Iterator out_y(output_ptr->plane(0), win);
-    Iterator out_u(output_ptr->plane(1), win);
-    Iterator out_v(output_ptr->plane(2), win);
-
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        const auto ta_y_top    = vld2q_u8(in_y.ptr());
-        const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y());
-        const auto ta_uv       = vld2q_u8(in_uv.ptr());
-        //ta_y.val[0] = Y0 Y2 Y4 Y6 ...
-        //ta_y.val[1] = Y1 Y3 Y5 Y7 ...
-        //ta_uv.val[0] = U0 U2 U4 U6 ...
-        //ta_uv.val[1] = V0 V2 V4 V6 ...
-
-        vst2q_u8(out_y.ptr(), ta_y_top);
-        vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom);
-
-        uint8x16x2_t uvec;
-        uvec.val[0] = ta_uv.val[0 + shift];
-        uvec.val[1] = ta_uv.val[0 + shift];
-        vst2q_u8(out_u.ptr(), uvec);
-        vst2q_u8(out_u.ptr() + output_ptr->plane(1)->info()->strides_in_bytes().y(), uvec);
-
-        uint8x16x2_t vvec;
-        vvec.val[0] = ta_uv.val[1 - shift];
-        vvec.val[1] = ta_uv.val[1 - shift];
-        vst2q_u8(out_v.ptr(), vvec);
-        vst2q_u8(out_v.ptr() + output_ptr->plane(2)->info()->strides_in_bytes().y(), vvec);
-    },
-    in_y, in_uv, out_y, out_u, out_v);
-}
-
-/** Convert IYUV to YUV4.
- *
- * @param[in]  input  Input IYUV data buffer.
- * @param[out] output Output YUV4 buffer.
- * @param[in]  win    Window for iterating the buffers.
- *
- */
-void colorconvert_iyuv_to_yuv4(const void *__restrict input, void *__restrict output, const Window &win)
-{
-    ARM_COMPUTE_ERROR_ON(nullptr == input);
-    ARM_COMPUTE_ERROR_ON(nullptr == output);
-    win.validate();
-
-    const auto input_ptr  = static_cast<const IMultiImage *__restrict>(input);
-    const auto output_ptr = static_cast<IMultiImage *__restrict>(output);
-
-    // UV's width and height are subsampled
-    Window win_uv(win);
-    win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2));
-    win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1));
-    win_uv.validate();
-
-    Iterator in_y(input_ptr->plane(0), win);
-    Iterator in_u(input_ptr->plane(1), win_uv);
-    Iterator in_v(input_ptr->plane(2), win_uv);
-    Iterator out_y(output_ptr->plane(0), win);
-    Iterator out_u(output_ptr->plane(1), win);
-    Iterator out_v(output_ptr->plane(2), win);
-
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        const auto ta_y_top    = vld2q_u8(in_y.ptr());
-        const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y());
-        const auto ta_u        = vld1q_u8(in_u.ptr());
-        const auto ta_v        = vld1q_u8(in_v.ptr());
-        //ta_y.val[0] = Y0 Y2 Y4 Y6 ...
-        //ta_y.val[1] = Y1 Y3 Y5 Y7 ...
-        //ta_u = U0 U2 U4 U6 ...
-        //ta_v = V0 V2 V4 V6 ...
-
-        vst2q_u8(out_y.ptr(), ta_y_top);
-        vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom);
-
-        uint8x16x2_t uvec;
-        uvec.val[0] = ta_u;
-        uvec.val[1] = ta_u;
-        vst2q_u8(out_u.ptr(), uvec);
-        vst2q_u8(out_u.ptr() + output_ptr->plane(1)->info()->strides_in_bytes().y(), uvec);
-
-        uint8x16x2_t vvec;
-        vvec.val[0] = ta_v;
-        vvec.val[1] = ta_v;
-        vst2q_u8(out_v.ptr(), vvec);
-        vst2q_u8(out_v.ptr() + output_ptr->plane(2)->info()->strides_in_bytes().y(), vvec);
-    },
-    in_y, in_u, in_v, out_y, out_u, out_v);
-}
-
-/** Convert RGB to NV12.
- *
- * @param[in]  input  Input RGB data buffer.
- * @param[out] output Output NV12 buffer.
- * @param[in]  win    Window for iterating the buffers.
- *
- */
-template <bool alpha>
-void colorconvert_rgb_to_nv12(const void *__restrict input, void *__restrict output, const Window &win)
-{
-    ARM_COMPUTE_ERROR_ON(nullptr == input);
-    ARM_COMPUTE_ERROR_ON(nullptr == output);
-    win.validate();
-
-    const auto input_ptr  = static_cast<const IImage *__restrict>(input);
-    const auto output_ptr = static_cast<IMultiImage *__restrict>(output);
-
-    // UV's width and height are subsampled
-    Window win_uv(win);
-    win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2));
-    win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1));
-    win_uv.validate();
-
-    Iterator in(input_ptr, win);
-    Iterator out_y(output_ptr->plane(0), win);
-    Iterator out_uv(output_ptr->plane(1), win_uv);
-
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        const auto ta_rgb_top    = load_rgb(in.ptr(), alpha);
-        const auto ta_rgb_bottom = load_rgb(in.ptr() + input_ptr->info()->strides_in_bytes().y(), alpha);
-        //ta_rgb.val[0] = R0 R1 R2 R3 ...
-        //ta_rgb.val[1] = G0 G1 G2 G3 ...
-        //ta_rgb.val[2] = B0 B1 B2 B3 ...
-
-        store_rgb_to_nv12(ta_rgb_top.val[0], ta_rgb_top.val[1], ta_rgb_top.val[2],
-                          ta_rgb_bottom.val[0], ta_rgb_bottom.val[1], ta_rgb_bottom.val[2],
-                          out_y.ptr(), out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(),
-                          out_uv.ptr());
-    },
-    in, out_y, out_uv);
-}
-
-/** Convert RGB to IYUV.
- *
- * @param[in]  input  Input RGB data buffer.
- * @param[out] output Output IYUV buffer.
- * @param[in]  win    Window for iterating the buffers.
- *
- */
-template <bool alpha>
-void colorconvert_rgb_to_iyuv(const void *__restrict input, void *__restrict output, const Window &win)
-{
-    ARM_COMPUTE_ERROR_ON(nullptr == input);
-    ARM_COMPUTE_ERROR_ON(nullptr == output);
-    win.validate();
-
-    const auto input_ptr  = static_cast<const IImage *__restrict>(input);
-    const auto output_ptr = static_cast<IMultiImage *__restrict>(output);
-
-    // UV's width and height are subsampled
-    Window win_uv(win);
-    win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2));
-    win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1));
-    win_uv.validate();
-
-    Iterator in(input_ptr, win);
-    Iterator out_y(output_ptr->plane(0), win);
-    Iterator out_u(output_ptr->plane(1), win_uv);
-    Iterator out_v(output_ptr->plane(2), win_uv);
-
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        const auto ta_rgb_top    = load_rgb(in.ptr(), alpha);
-        const auto ta_rgb_bottom = load_rgb(in.ptr() + input_ptr->info()->strides_in_bytes().y(), alpha);
-        //ta_rgb.val[0] = R0 R1 R2 R3 ...
-        //ta_rgb.val[1] = G0 G1 G2 G3 ...
-        //ta_rgb.val[2] = B0 B1 B2 B3 ...
-
-        store_rgb_to_iyuv(ta_rgb_top.val[0], ta_rgb_top.val[1], ta_rgb_top.val[2],
-                          ta_rgb_bottom.val[0], ta_rgb_bottom.val[1], ta_rgb_bottom.val[2],
-                          out_y.ptr(), out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(),
-                          out_u.ptr(), out_v.ptr());
-    },
-    in, out_y, out_u, out_v);
-}
-
-/** Convert RGB to YUV4.
- *
- * @param[in]  input  Input RGB data buffer.
- * @param[out] output Output YUV4 buffer.
- * @param[in]  win    Window for iterating the buffers.
- *
- */
-template <bool alpha>
-void colorconvert_rgb_to_yuv4(const void *__restrict input, void *__restrict output, const Window &win)
-{
-    ARM_COMPUTE_ERROR_ON(nullptr == input);
-    ARM_COMPUTE_ERROR_ON(nullptr == output);
-    win.validate();
-
-    const auto input_ptr  = static_cast<const IImage *__restrict>(input);
-    const auto output_ptr = static_cast<IMultiImage *__restrict>(output);
-
-    Iterator in(input_ptr, win);
-    Iterator out_y(output_ptr->plane(0), win);
-    Iterator out_u(output_ptr->plane(1), win);
-    Iterator out_v(output_ptr->plane(2), win);
-
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        const auto ta_rgb = load_rgb(in.ptr(), alpha);
-        //ta_rgb.val[0] = R0 R1 R2 R3 ...
-        //ta_rgb.val[1] = G0 G1 G2 G3 ...
-        //ta_rgb.val[2] = B0 B1 B2 B3 ...
-
-        store_rgb_to_yuv4(ta_rgb.val[0], ta_rgb.val[1], ta_rgb.val[2],
-                          out_y.ptr(), out_u.ptr(), out_v.ptr());
-    },
-    in, out_y, out_u, out_v);
-}
-} // namespace arm_compute
diff --git a/arm_compute/core/NEON/NEFixedPoint.inl b/arm_compute/core/NEON/NEFixedPoint.inl
deleted file mode 100644
index 14e51d825c..0000000000
--- a/arm_compute/core/NEON/NEFixedPoint.inl
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include <array>
-#include <limits>
-
-namespace arm_compute
-{
-#ifndef DOXYGEN_SKIP_THIS
-
-inline float32x4x2_t vmax2q_f32(float32x4x2_t a, float32x4x2_t b)
-{
-    float32x4x2_t res =
-    {
-        {
-            vmaxq_f32(a.val[0], b.val[0]),
-            vmaxq_f32(a.val[1], b.val[1])
-        }
-    };
-    return res;
-}
-#endif /* DOXYGEN_SKIP_THIS */
-} // namespace arm_compute
diff --git a/arm_compute/core/NEON/NEKernels.h b/arm_compute/core/NEON/NEKernels.h
deleted file mode 100644
index 38701f434a..0000000000
--- a/arm_compute/core/NEON/NEKernels.h
+++ /dev/null
@@ -1,155 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEKERNELS_H
-#define ARM_COMPUTE_NEKERNELS_H
-
-/* Header regrouping all the NEON kernels */
-#include "arm_compute/core/NEON/kernels/NEAbsoluteDifferenceKernel.h"
-#include "arm_compute/core/NEON/kernels/NEAccumulateKernel.h"
-#include "arm_compute/core/NEON/kernels/NEActivationLayerKernel.h"
-#include "arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h"
-#include "arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h"
-#include "arm_compute/core/NEON/kernels/NEBatchConcatenateLayerKernel.h"
-#include "arm_compute/core/NEON/kernels/NEBatchNormalizationLayerKernel.h"
-#include "arm_compute/core/NEON/kernels/NEBatchToSpaceLayerKernel.h"
-#include "arm_compute/core/NEON/kernels/NEBitwiseAndKernel.h"
-#include "arm_compute/core/NEON/kernels/NEBitwiseNotKernel.h"
-#include "arm_compute/core/NEON/kernels/NEBitwiseOrKernel.h"
-#include "arm_compute/core/NEON/kernels/NEBitwiseXorKernel.h"
-#include "arm_compute/core/NEON/kernels/NEBoundingBoxTransformKernel.h"
-#include "arm_compute/core/NEON/kernels/NEBox3x3Kernel.h"
-#include "arm_compute/core/NEON/kernels/NECannyEdgeKernel.h"
-#include "arm_compute/core/NEON/kernels/NEChannelCombineKernel.h"
-#include "arm_compute/core/NEON/kernels/NEChannelExtractKernel.h"
-#include "arm_compute/core/NEON/kernels/NEChannelShuffleLayerKernel.h"
-#include "arm_compute/core/NEON/kernels/NECol2ImKernel.h"
-#include "arm_compute/core/NEON/kernels/NEColorConvertKernel.h"
-#include "arm_compute/core/NEON/kernels/NEConvertFullyConnectedWeightsKernel.h"
-#include "arm_compute/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h"
-#include "arm_compute/core/NEON/kernels/NEConvolutionKernel.h"
-#include "arm_compute/core/NEON/kernels/NECopyKernel.h"
-#include "arm_compute/core/NEON/kernels/NECropKernel.h"
-#include "arm_compute/core/NEON/kernels/NECumulativeDistributionKernel.h"
-#include "arm_compute/core/NEON/kernels/NEDepthConcatenateLayerKernel.h"
-#include "arm_compute/core/NEON/kernels/NEDepthConvertLayerKernel.h"
-#include "arm_compute/core/NEON/kernels/NEDepthToSpaceLayerKernel.h"
-#include "arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.h"
-#include "arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.h"
-#include "arm_compute/core/NEON/kernels/NEDequantizationLayerKernel.h"
-#include "arm_compute/core/NEON/kernels/NEDerivativeKernel.h"
-#include "arm_compute/core/NEON/kernels/NEDilateKernel.h"
-#include "arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h"
-#include "arm_compute/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.h"
-#include "arm_compute/core/NEON/kernels/NEElementwiseOperationKernel.h"
-#include "arm_compute/core/NEON/kernels/NEElementwiseUnaryKernel.h"
-#include "arm_compute/core/NEON/kernels/NEErodeKernel.h"
-#include "arm_compute/core/NEON/kernels/NEFFTDigitReverseKernel.h"
-#include "arm_compute/core/NEON/kernels/NEFFTRadixStageKernel.h"
-#include "arm_compute/core/NEON/kernels/NEFFTScaleKernel.h"
-#include "arm_compute/core/NEON/kernels/NEFastCornersKernel.h"
-#include "arm_compute/core/NEON/kernels/NEFillArrayKernel.h"
-#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
-#include "arm_compute/core/NEON/kernels/NEFillInnerBorderKernel.h"
-#include "arm_compute/core/NEON/kernels/NEFlattenLayerKernel.h"
-#include "arm_compute/core/NEON/kernels/NEFloorKernel.h"
-#include "arm_compute/core/NEON/kernels/NEFuseBatchNormalizationKernel.h"
-#include "arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h"
-#include "arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
-#include "arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h"
-#include "arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h"
-#include "arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h"
-#include "arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ScaleKernel.h"
-#include "arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h"
-#include "arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h"
-#include "arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h"
-#include "arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h"
-#include "arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h"
-#include "arm_compute/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h"
-#include "arm_compute/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h"
-#include "arm_compute/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.h"
-#include "arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
-#include "arm_compute/core/NEON/kernels/NEGatherKernel.h"
-#include "arm_compute/core/NEON/kernels/NEGaussian3x3Kernel.h"
-#include "arm_compute/core/NEON/kernels/NEGaussian5x5Kernel.h"
-#include "arm_compute/core/NEON/kernels/NEGaussianPyramidKernel.h"
-#include "arm_compute/core/NEON/kernels/NEGenerateProposalsLayerKernel.h"
-#include "arm_compute/core/NEON/kernels/NEHOGDescriptorKernel.h"
-#include "arm_compute/core/NEON/kernels/NEHOGDetectorKernel.h"
-#include "arm_compute/core/NEON/kernels/NEHarrisCornersKernel.h"
-#include "arm_compute/core/NEON/kernels/NEHeightConcatenateLayerKernel.h"
-#include "arm_compute/core/NEON/kernels/NEHistogramKernel.h"
-#include "arm_compute/core/NEON/kernels/NEIm2ColKernel.h"
-#include "arm_compute/core/NEON/kernels/NEInstanceNormalizationLayerKernel.h"
-#include "arm_compute/core/NEON/kernels/NEIntegralImageKernel.h"
-#include "arm_compute/core/NEON/kernels/NEL2NormalizeLayerKernel.h"
-#include "arm_compute/core/NEON/kernels/NELKTrackerKernel.h"
-#include "arm_compute/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.h"
-#include "arm_compute/core/NEON/kernels/NEMagnitudePhaseKernel.h"
-#include "arm_compute/core/NEON/kernels/NEMeanStdDevKernel.h"
-#include "arm_compute/core/NEON/kernels/NEMeanStdDevNormalizationKernel.h"
-#include "arm_compute/core/NEON/kernels/NEMedian3x3Kernel.h"
-#include "arm_compute/core/NEON/kernels/NEMemsetKernel.h"
-#include "arm_compute/core/NEON/kernels/NEMinMaxLayerKernel.h"
-#include "arm_compute/core/NEON/kernels/NEMinMaxLocationKernel.h"
-#include "arm_compute/core/NEON/kernels/NENonLinearFilterKernel.h"
-#include "arm_compute/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h"
-#include "arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h"
-#include "arm_compute/core/NEON/kernels/NEPadLayerKernel.h"
-#include "arm_compute/core/NEON/kernels/NEPermuteKernel.h"
-#include "arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h"
-#include "arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h"
-#include "arm_compute/core/NEON/kernels/NEPriorBoxLayerKernel.h"
-#include "arm_compute/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h"
-#include "arm_compute/core/NEON/kernels/NEQuantizationLayerKernel.h"
-#include "arm_compute/core/NEON/kernels/NEROIAlignLayerKernel.h"
-#include "arm_compute/core/NEON/kernels/NEROIPoolingLayerKernel.h"
-#include "arm_compute/core/NEON/kernels/NERangeKernel.h"
-#include "arm_compute/core/NEON/kernels/NEReductionOperationKernel.h"
-#include "arm_compute/core/NEON/kernels/NERemapKernel.h"
-#include "arm_compute/core/NEON/kernels/NEReorgLayerKernel.h"
-#include "arm_compute/core/NEON/kernels/NEReshapeLayerKernel.h"
-#include "arm_compute/core/NEON/kernels/NEReverseKernel.h"
-#include "arm_compute/core/NEON/kernels/NEScaleKernel.h"
-#include "arm_compute/core/NEON/kernels/NEScharr3x3Kernel.h"
-#include "arm_compute/core/NEON/kernels/NESelectKernel.h"
-#include "arm_compute/core/NEON/kernels/NESobel3x3Kernel.h"
-#include "arm_compute/core/NEON/kernels/NESobel5x5Kernel.h"
-#include "arm_compute/core/NEON/kernels/NESobel7x7Kernel.h"
-#include "arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h"
-#include "arm_compute/core/NEON/kernels/NESpaceToBatchLayerKernel.h"
-#include "arm_compute/core/NEON/kernels/NESpaceToDepthLayerKernel.h"
-#include "arm_compute/core/NEON/kernels/NEStackLayerKernel.h"
-#include "arm_compute/core/NEON/kernels/NEStridedSliceKernel.h"
-#include "arm_compute/core/NEON/kernels/NETableLookupKernel.h"
-#include "arm_compute/core/NEON/kernels/NEThresholdKernel.h"
-#include "arm_compute/core/NEON/kernels/NETileKernel.h"
-#include "arm_compute/core/NEON/kernels/NETransposeKernel.h"
-#include "arm_compute/core/NEON/kernels/NEUpsampleLayerKernel.h"
-#include "arm_compute/core/NEON/kernels/NEWarpKernel.h"
-#include "arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h"
-#include "arm_compute/core/NEON/kernels/NEWidthConcatenateLayerKernel.h"
-#include "arm_compute/core/NEON/kernels/NEWinogradConvolutionLayerKernel.h"
-#include "arm_compute/core/NEON/kernels/NEYOLOLayerKernel.h"
-
-#endif /* ARM_COMPUTE_NEKERNELS_H */
diff --git a/arm_compute/core/NEON/NEMath.h b/arm_compute/core/NEON/NEMath.h
deleted file mode 100644
index 8827bbf459..0000000000
--- a/arm_compute/core/NEON/NEMath.h
+++ /dev/null
@@ -1,307 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEMATH_H
-#define ARM_COMPUTE_NEMATH_H
-
-#include <arm_neon.h>
-#include <array>
-
-namespace arm_compute
-{
-/** Calculate floor of a vector.
- *
- * @param[in] val Input vector value in F32 format.
- *
- * @return The calculated floor vector.
- */
-float32x4_t vfloorq_f32(float32x4_t val);
-
-/** Calculate round value of a vector to nearest with ties to even.
- *
- * @param[in] val Input vector value in F32 format.
- *
- * @return The calculated round vector.
- */
-float32x4_t vroundq_rte_f32(float32x4_t val);
-
-/** Calculate inverse square root.
- *
- * @param[in] x Input value.
- *
- * @return The calculated inverse square root.
- */
-float32x2_t vinvsqrt_f32(float32x2_t x);
-
-/** Calculate inverse square root.
- *
- * @param[in] x Input value.
- *
- * @return The calculated inverse square root.
- */
-float32x4_t vinvsqrtq_f32(float32x4_t x);
-
-/** Calculate reciprocal.
- *
- * @param[in] x Input value.
- *
- * @return The calculated reciprocal.
- */
-float32x2_t vinv_f32(float32x2_t x);
-
-/** Calculate reciprocal.
- *
- * @param[in] x Input value.
- *
- * @return The calculated reciprocal.
- */
-float32x4_t vinvq_f32(float32x4_t x);
-
-/** Perform a 7th degree polynomial approximation using Estrin's method.
- *
- * @param[in] x      Input vector value in F32 format.
- * @param[in] coeffs Polynomial coefficients table.
- *
- * @return The calculated approximation.
- */
-float32x4_t vtaylor_polyq_f32(float32x4_t x, const std::array<float32x4_t, 8> &coeffs);
-
-/** Calculate exponential
- *
- * @param[in] x Input vector value in F32 format.
- *
- * @return The calculated exponent.
- */
-float32x4_t vexpq_f32(float32x4_t x);
-
-/** Calculate logarithm
- *
- * @param[in] x Input vector value in F32 format.
- *
- * @return The calculated logarithm.
- */
-float32x4_t vlogq_f32(float32x4_t x);
-
-/** Calculate hyperbolic tangent.
- *
- * tanh(x) = (e^2x - 1)/(e^2x + 1)
- *
- * @note We clamp x to [-5,5] to avoid overflowing issues.
- *
- * @param[in] val Input vector value in F32 format.
- *
- * @return The calculated Hyperbolic Tangent.
- */
-float32x4_t vtanhq_f32(float32x4_t val);
-
-/** Calculate n power of a number.
- *
- * pow(x,n) = e^(n*log(x))
- *
- * @param[in] val Input vector value in F32 format.
- * @param[in] n   Powers to raise the input to.
- *
- * @return The calculated power.
- */
-float32x4_t vpowq_f32(float32x4_t val, float32x4_t n);
-
-/** Round to the nearest division by a power-of-two using exponent
- *
- * @note This function calculates the following expression: (x + 2^n -1 ) / 2^n where n = exponent
- *
- * @param[in] x        Vector of 4 elements
- * @param[in] exponent Vector of 4 elements with integer value used to round to nearest division by a power-of-two
- *
- * @return the nearest division by a power-of-two using exponent
- */
-int32x4_t rounding_divide_by_pow2(int32x4_t x, int32x4_t exponent);
-
-/** Round to the nearest division by a power-of-two using exponent
- *
- * @note This function calculates the following expression: (x + 2^n -1 ) / 2^n where n = exponent
- *
- * @param[in] x        Vector of 4 elements
- * @param[in] exponent Integer value used to round to nearest division by a power-of-two
- *
- * @return the nearest division by a power-of-two using exponent
- */
-int32x4_t rounding_divide_by_pow2(int32x4_t x, int exponent);
-
-/** Round to the nearest division by a power-of-two using exponent
- *
- * @note This function calculates the following expression: (x + 2^n -1 ) / 2^n where n = exponent
- *
- * @param[in] x        Element to divide.
- * @param[in] exponent Integer value used to round to nearest division by a power-of-two
- *
- * @return the nearest division by a power-of-two using exponent
- */
-int32_t rounding_divide_by_pow2(int32_t x, int exponent);
-
-/** Converts from uint8x16 to float32x4x4_t
- *
- * @param[in] in Vector of uint8 to be converted
- *
- * @return Converted vector of float
- */
-float32x4x4_t convert_uint8x16_to_float32x4x4(const uint8x16_t &in);
-
-/** Converts from int8x16 to float32x4x4_t
- *
- * @param[in] in Vector of int8 to be converted
- *
- * @return Converted vector of float
- */
-float32x4x4_t convert_int8x16_to_float32x4x4(const int8x16_t &in);
-
-/** Converts to float32x4x4_t from the specified templated 16 elements vectors
- *
- * @param[in] in Vector of float to be converted
- *
- * @return Converted vector of float
- */
-template <typename T>
-float32x4x4_t convert_to_float32x4x4(const T &in);
-
-/** Converts from two float32x4x3_t to just one uint8x8x3_t
- *
- * @param[in]  in1 First input vector of float to be converted
- * @param[in]  in2 Second input vector of float to be converted
- * @param[out] out Converted output vector uint8 to store the result
- */
-void convert_float32x4x3_to_uint8x8x3(const float32x4x3_t &in1, const float32x4x3_t &in2, uint8x8x3_t &out);
-
-/** Converts from two float32x4x4_t to just one uint8x16_t
- *
- * @param[in]  in  Vector of float to be converted
- * @param[out] out Converted vector of uint8 to store the result
- */
-void convert_float32x4x4_to_uint8x16(const float32x4x4_t &in, uint8x16_t &out);
-
-/** Converts from float32x4x4_t to just one int8x16_t
- *
- * @param[in]  in  Vector of float to be converted
- * @param[out] out Converted vector of uint8 to store the result
- */
-void convert_float32x4x4_to_int8x16(const float32x4x4_t &in, int8x16_t &out);
-
-/** Calculate sine.
- *
- * @param[in] val Input vector value in radians, F32 format.
- *
- * @return The calculated sine.
- */
-float32x4_t vsinq_f32(float32x4_t val);
-
-/** Calculate sine.
- *
- * @param[in] val Input vector value in radians, F32 format.
- *
- * @return The calculated sine.
- */
-float32x2_t vsin_f32(float32x2_t val);
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-/** Calculate hyperbolic tangent.
- *
- * tanh(x) = (e^2x - 1)/(e^2x + 1)
- *
- * @note We clamp x to [-5,5] to avoid overflowing issues.
- *
- * @param[in] val Input vector value in F16 format.
- *
- * @return The calculated Hyperbolic Tangent.
- */
-float16x8_t vtanhq_f16(float16x8_t val);
-
-/** Calculate round value of a vector to nearest with ties to even.
- *
- * @param[in] val Input vector value in F16 format.
- *
- * @return The calculated round vector.
- */
-float16x8_t vroundq_rte_f16(float16x8_t val);
-
-/** Calculate reciprocal.
- *
- * @param[in] x Input value.
- *
- * @return The calculated reciprocal.
- */
-float16x4_t vinv_f16(float16x4_t x);
-
-/** Calculate reciprocal.
- *
- * @param[in] x Input value.
- *
- * @return The calculated reciprocal.
- */
-float16x8_t vinvq_f16(float16x8_t x);
-
-/** Calculate inverse square root.
- *
- * @param[in] x Input value.
- *
- * @return The calculated inverse square root.
- */
-float16x4_t vinvsqrt_f16(float16x4_t x);
-
-/** Calculate inverse square root.
- *
- * @param[in] x Input value.
- *
- * @return The calculated inverse square root.
- */
-float16x8_t vinvsqrtq_f16(float16x8_t x);
-
-/** Calculate exponential
- *
- * @param[in] x Input vector value in F16 format.
- *
- * @return The calculated exponent.
- */
-float16x8_t vexpq_f16(float16x8_t x);
-
-/** Calculate n power of a number.
- *
- * pow(x,n) = e^(n*log(x))
- *
- * @param[in] val Input vector value in F16 format.
- * @param[in] n   Powers to raise the input to.
- *
- * @return The calculated power.
- */
-float16x8_t vpowq_f16(float16x8_t val, float16x8_t n);
-
-/** Calculate sine.
- *
- * @param[in] val Input vector value in radians, F16 format.
- *
- * @return The calculated sine.
- */
-float16x8_t vsinq_f16(float16x8_t val);
-
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-} // namespace arm_compute
-#include "arm_compute/core/NEON/NEMath.inl"
-#endif /* ARM_COMPUTE_NEMATH_H */
diff --git a/arm_compute/core/NEON/NEMath.inl b/arm_compute/core/NEON/NEMath.inl
deleted file mode 100644
index 032bfde238..0000000000
--- a/arm_compute/core/NEON/NEMath.inl
+++ /dev/null
@@ -1,529 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include <cmath>
-#include <limits>
-
-#ifndef M_PI
-#define M_PI (3.14159265358979323846)
-#endif // M_PI
-
-namespace arm_compute
-{
-/** Exponent polynomial coefficients */
-const std::array<float32x4_t, 8> exp_tab =
-{
-    {
-        vdupq_n_f32(1.f),
-        vdupq_n_f32(0.0416598916054f),
-        vdupq_n_f32(0.500000596046f),
-        vdupq_n_f32(0.0014122662833f),
-        vdupq_n_f32(1.00000011921f),
-        vdupq_n_f32(0.00833693705499f),
-        vdupq_n_f32(0.166665703058f),
-        vdupq_n_f32(0.000195780929062f),
-    }
-};
-
-/** Logarithm polynomial coefficients */
-const std::array<float32x4_t, 8> log_tab =
-{
-    {
-        vdupq_n_f32(-2.29561495781f),
-        vdupq_n_f32(-2.47071170807f),
-        vdupq_n_f32(-5.68692588806f),
-        vdupq_n_f32(-0.165253549814f),
-        vdupq_n_f32(5.17591238022f),
-        vdupq_n_f32(0.844007015228f),
-        vdupq_n_f32(4.58445882797f),
-        vdupq_n_f32(0.0141278216615f),
-    }
-};
-
-/** Sin polynomial coefficients */
-constexpr float te_sin_coeff2 = 0.166666666666f; // 1/(2*3)
-constexpr float te_sin_coeff3 = 0.05f;           // 1/(4*5)
-constexpr float te_sin_coeff4 = 0.023809523810f; // 1/(6*7)
-constexpr float te_sin_coeff5 = 0.013888888889f; // 1/(8*9)
-
-#ifndef DOXYGEN_SKIP_THIS
-inline float32x4_t vfloorq_f32(float32x4_t val)
-{
-    static const float32x4_t CONST_1 = vdupq_n_f32(1.f);
-
-    const int32x4_t   z = vcvtq_s32_f32(val);
-    const float32x4_t r = vcvtq_f32_s32(z);
-
-    return vbslq_f32(vcgtq_f32(r, val), vsubq_f32(r, CONST_1), r);
-}
-
-inline float32x4_t vroundq_rte_f32(float32x4_t val)
-{
-#ifdef __aarch64__
-    return vrndnq_f32(val);
-#else  // __aarch64__
-    static const float32x4_t CONST_HALF_FLOAT = vdupq_n_f32(0.5f);
-    static const float32x4_t CONST_1_FLOAT    = vdupq_n_f32(1.f);
-    static const int32x4_t   CONST_1_INT      = vdupq_n_s32(1);
-    const float32x4_t        floor_val        = vfloorq_f32(val);
-    const float32x4_t        diff             = vsubq_f32(val, floor_val);
-
-    /*
-    * Select the floor value when (diff<0.5 || (diff==0.5 && floor_val%2==0).
-    * This condition is checked by vorrq_u32(vcltq_f32(diff, CONST_HALF_FLOAT) ,vandq_u32(vceqq_f32(diff, CONST_HALF_FLOAT) , vmvnq_u32(vtstq_s32(vandq_s32(vcvtq_s32_f32(floor_val), CONST_1_INT),CONST_1_INT))))
-    */
-
-    return vbslq_f32(vorrq_u32(vcltq_f32(diff, CONST_HALF_FLOAT), vandq_u32(vceqq_f32(diff, CONST_HALF_FLOAT), vmvnq_u32(vtstq_s32(vandq_s32(vcvtq_s32_f32(floor_val), CONST_1_INT), CONST_1_INT)))),
-                     floor_val, vaddq_f32(floor_val, CONST_1_FLOAT));
-#endif // __aarch64__
-}
-
-inline float32x2_t vinvsqrt_f32(float32x2_t x)
-{
-    float32x2_t sqrt_reciprocal = vrsqrte_f32(x);
-    sqrt_reciprocal             = vmul_f32(vrsqrts_f32(vmul_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
-    sqrt_reciprocal             = vmul_f32(vrsqrts_f32(vmul_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
-
-    return sqrt_reciprocal;
-}
-
-inline float32x4_t vinvsqrtq_f32(float32x4_t x)
-{
-    float32x4_t sqrt_reciprocal = vrsqrteq_f32(x);
-    sqrt_reciprocal             = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
-    sqrt_reciprocal             = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
-
-    return sqrt_reciprocal;
-}
-
-inline float32x2_t vinv_f32(float32x2_t x)
-{
-    float32x2_t recip = vrecpe_f32(x);
-    recip             = vmul_f32(vrecps_f32(x, recip), recip);
-    recip             = vmul_f32(vrecps_f32(x, recip), recip);
-    return recip;
-}
-
-inline float32x4_t vinvq_f32(float32x4_t x)
-{
-    float32x4_t recip = vrecpeq_f32(x);
-    recip             = vmulq_f32(vrecpsq_f32(x, recip), recip);
-    recip             = vmulq_f32(vrecpsq_f32(x, recip), recip);
-    return recip;
-}
-
-inline float32x4_t vtaylor_polyq_f32(float32x4_t x, const std::array<float32x4_t, 8> &coeffs)
-{
-    float32x4_t A   = vmlaq_f32(coeffs[0], coeffs[4], x);
-    float32x4_t B   = vmlaq_f32(coeffs[2], coeffs[6], x);
-    float32x4_t C   = vmlaq_f32(coeffs[1], coeffs[5], x);
-    float32x4_t D   = vmlaq_f32(coeffs[3], coeffs[7], x);
-    float32x4_t x2  = vmulq_f32(x, x);
-    float32x4_t x4  = vmulq_f32(x2, x2);
-    float32x4_t res = vmlaq_f32(vmlaq_f32(A, B, x2), vmlaq_f32(C, D, x2), x4);
-    return res;
-}
-
-inline float32x4_t vexpq_f32(float32x4_t x)
-{
-    static const float32x4_t CONST_LN2          = vdupq_n_f32(0.6931471805f); // ln(2)
-    static const float32x4_t CONST_INV_LN2      = vdupq_n_f32(1.4426950408f); // 1/ln(2)
-    static const float32x4_t CONST_INF          = vdupq_n_f32(std::numeric_limits<float>::infinity());
-    static const float32x4_t CONST_MAX_INPUT    = vdupq_n_f32(88.7f);
-    static const float32x4_t CONST_0            = vdupq_n_f32(0.f);
-    static const int32x4_t   CONST_NEGATIVE_126 = vdupq_n_s32(-126);
-
-    // Perform range reduction [-log(2),log(2)]
-    int32x4_t   m   = vcvtq_s32_f32(vmulq_f32(x, CONST_INV_LN2));
-    float32x4_t val = vmlsq_f32(x, vcvtq_f32_s32(m), CONST_LN2);
-
-    // Polynomial Approximation
-    float32x4_t poly = vtaylor_polyq_f32(val, exp_tab);
-
-    // Reconstruct
-    poly = vreinterpretq_f32_s32(vqaddq_s32(vreinterpretq_s32_f32(poly), vqshlq_n_s32(m, 23)));
-    poly = vbslq_f32(vcltq_s32(m, CONST_NEGATIVE_126), CONST_0, poly); // Handle underflow
-    poly = vbslq_f32(vcgtq_f32(x, CONST_MAX_INPUT), CONST_INF, poly);  // Handle overflow
-
-    return poly;
-}
-
-inline float32x4_t vlogq_f32(float32x4_t x)
-{
-    static const int32x4_t   CONST_127 = vdupq_n_s32(127);           // 127
-    static const float32x4_t CONST_LN2 = vdupq_n_f32(0.6931471805f); // ln(2)
-
-    // Extract exponent
-    int32x4_t   m   = vsubq_s32(vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_f32(x), 23)), CONST_127);
-    float32x4_t val = vreinterpretq_f32_s32(vsubq_s32(vreinterpretq_s32_f32(x), vshlq_n_s32(m, 23)));
-
-    // Polynomial Approximation
-    float32x4_t poly = vtaylor_polyq_f32(val, log_tab);
-
-    // Reconstruct
-    poly = vmlaq_f32(poly, vcvtq_f32_s32(m), CONST_LN2);
-
-    return poly;
-}
-
-inline float32x4_t vtanhq_f32(float32x4_t val)
-{
-    static const float32x4_t CONST_1        = vdupq_n_f32(1.f);
-    static const float32x4_t CONST_2        = vdupq_n_f32(2.f);
-    static const float32x4_t CONST_MIN_TANH = vdupq_n_f32(-10.f);
-    static const float32x4_t CONST_MAX_TANH = vdupq_n_f32(10.f);
-
-    float32x4_t x     = vminq_f32(vmaxq_f32(val, CONST_MIN_TANH), CONST_MAX_TANH);
-    float32x4_t exp2x = vexpq_f32(vmulq_f32(CONST_2, x));
-    float32x4_t num   = vsubq_f32(exp2x, CONST_1);
-    float32x4_t den   = vaddq_f32(exp2x, CONST_1);
-    float32x4_t tanh  = vmulq_f32(num, vinvq_f32(den));
-    return tanh;
-}
-
-inline float32x4_t vpowq_f32(float32x4_t val, float32x4_t n)
-{
-    return vexpq_f32(vmulq_f32(n, vlogq_f32(val)));
-}
-
-inline float32x4_t vsinq_f32(float32x4_t val)
-{
-    const float32x4_t pi_v   = vdupq_n_f32(M_PI);
-    const float32x4_t pio2_v = vdupq_n_f32(M_PI / 2);
-    const float32x4_t ipi_v  = vdupq_n_f32(1 / M_PI);
-
-    //Find positive or negative
-    const int32x4_t  c_v    = vabsq_s32(vcvtq_s32_f32(vmulq_f32(val, ipi_v)));
-    const uint32x4_t sign_v = vcleq_f32(val, vdupq_n_f32(0));
-    const uint32x4_t odd_v  = vandq_u32(vreinterpretq_u32_s32(c_v), vdupq_n_u32(1));
-
-    uint32x4_t neg_v = veorq_u32(odd_v, sign_v);
-
-    //Modulus a - (n * int(a*(1/n)))
-    float32x4_t      ma    = vsubq_f32(vabsq_f32(val), vmulq_f32(pi_v, vcvtq_f32_s32(c_v)));
-    const uint32x4_t reb_v = vcgeq_f32(ma, pio2_v);
-
-    //Rebase a between 0 and pi/2
-    ma = vbslq_f32(reb_v, vsubq_f32(pi_v, ma), ma);
-
-    //Taylor series
-    const float32x4_t ma2 = vmulq_f32(ma, ma);
-
-    //2nd elem: x^3 / 3!
-    float32x4_t elem = vmulq_f32(vmulq_f32(ma, ma2), vdupq_n_f32(te_sin_coeff2));
-    float32x4_t res  = vsubq_f32(ma, elem);
-
-    //3rd elem: x^5 / 5!
-    elem = vmulq_f32(vmulq_f32(elem, ma2), vdupq_n_f32(te_sin_coeff3));
-    res  = vaddq_f32(res, elem);
-
-    //4th elem: x^7 / 7!float32x2_t vsin_f32(float32x2_t val)
-    elem = vmulq_f32(vmulq_f32(elem, ma2), vdupq_n_f32(te_sin_coeff4));
-    res  = vsubq_f32(res, elem);
-
-    //5th elem: x^9 / 9!
-    elem = vmulq_f32(vmulq_f32(elem, ma2), vdupq_n_f32(te_sin_coeff5));
-    res  = vaddq_f32(res, elem);
-
-    //Change of sign
-    neg_v = vshlq_n_u32(neg_v, 31);
-    res   = vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(res), neg_v));
-    return res;
-}
-
-inline float32x2_t vsin_f32(float32x2_t val)
-{
-    const float32x2_t pi_v   = vdup_n_f32(M_PI);
-    const float32x2_t pio2_v = vdup_n_f32(M_PI / 2);
-    const float32x2_t ipi_v  = vdup_n_f32(1 / M_PI);
-
-    //Find positive or negative
-    const int32x2_t  c_v    = vabs_s32(vcvt_s32_f32(vmul_f32(val, ipi_v)));
-    const uint32x2_t sign_v = vcle_f32(val, vdup_n_f32(0));
-    const uint32x2_t odd_v  = vand_u32(vreinterpret_u32_s32(c_v), vdup_n_u32(1));
-
-    uint32x2_t neg_v = veor_u32(odd_v, sign_v);
-
-    //Modulus a - (n * int(a*(1/n)))
-    float32x2_t      ma    = vsub_f32(vabs_f32(val), vmul_f32(pi_v, vcvt_f32_s32(c_v)));
-    const uint32x2_t reb_v = vcge_f32(ma, pio2_v);
-
-    //Rebase a between 0 and pi/2
-    ma = vbsl_f32(reb_v, vsub_f32(pi_v, ma), ma);
-
-    //Taylor series
-    const float32x2_t ma2 = vmul_f32(ma, ma);
-
-    //2nd elem: x^3 / 3!
-    float32x2_t elem = vmul_f32(vmul_f32(ma, ma2), vdup_n_f32(te_sin_coeff2));
-    float32x2_t res  = vsub_f32(ma, elem);
-
-    //3rd elem: x^5 / 5!
-    elem = vmul_f32(vmul_f32(elem, ma2), vdup_n_f32(te_sin_coeff3));
-    res  = vadd_f32(res, elem);
-
-    //4th elem: x^7 / 7!float32x2_t vsin_f32(float32x2_t val)
-    elem = vmul_f32(vmul_f32(elem, ma2), vdup_n_f32(te_sin_coeff4));
-    res  = vsub_f32(res, elem);
-
-    //5th elem: x^9 / 9!
-    elem = vmul_f32(vmul_f32(elem, ma2), vdup_n_f32(te_sin_coeff5));
-    res  = vadd_f32(res, elem);
-
-    //Change of sign
-    neg_v = vshl_n_u32(neg_v, 31);
-    res   = vreinterpret_f32_u32(veor_u32(vreinterpret_u32_f32(res), neg_v));
-    return res;
-}
-
-#endif /* DOXYGEN_SKIP_THIS */
-
-inline int32x4_t rounding_divide_by_pow2(int32x4_t x, int32x4_t exponent)
-{
-    const int32x4_t shift_vec  = vnegq_s32(exponent);
-    const int32x4_t fixup      = vshrq_n_s32(vandq_s32(x, shift_vec), 31);
-    const int32x4_t fixed_up_x = vqaddq_s32(x, fixup);
-    return vrshlq_s32(fixed_up_x, shift_vec);
-}
-
-inline int32x4_t rounding_divide_by_pow2(int32x4_t x, int exponent)
-{
-    const int32x4_t shift_vec  = vdupq_n_s32(-exponent);
-    const int32x4_t fixup      = vshrq_n_s32(vandq_s32(x, shift_vec), 31);
-    const int32x4_t fixed_up_x = vqaddq_s32(x, fixup);
-    return vrshlq_s32(fixed_up_x, shift_vec);
-}
-
-inline int32_t rounding_divide_by_pow2(int32_t x, int exponent)
-{
-    const int32_t mask      = (1 << exponent) - 1;
-    const int32_t threshold = (mask >> 1) + (x < 0 ? 1 : 0);
-    return (x >> exponent) + ((x & mask) > threshold ? 1 : 0);
-}
-
-inline float32x4x4_t convert_uint8x16_to_float32x4x4(const uint8x16_t &in)
-{
-    float32x4x4_t out;
-
-    const auto tmp1 = vmovl_u8(vget_low_u8(in));
-    out.val[0]      = vcvtq_f32_u32(vmovl_u16(vget_low_u16(tmp1)));
-    out.val[1]      = vcvtq_f32_u32(vmovl_u16(vget_high_u16(tmp1)));
-
-    const auto tmp2 = vmovl_u8(vget_high_u8(in));
-    out.val[2]      = vcvtq_f32_u32(vmovl_u16(vget_low_u16(tmp2)));
-    out.val[3]      = vcvtq_f32_u32(vmovl_u16(vget_high_u16(tmp2)));
-    return out;
-}
-
-inline float32x4x4_t convert_int8x16_to_float32x4x4(const int8x16_t &in)
-{
-    float32x4x4_t out;
-
-    const auto tmp1 = vmovl_s8(vget_low_s8(in));
-    out.val[0]      = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp1)));
-    out.val[1]      = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp1)));
-
-    const auto tmp2 = vmovl_s8(vget_high_s8(in));
-    out.val[2]      = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp2)));
-    out.val[3]      = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp2)));
-    return out;
-}
-
-template <>
-inline float32x4x4_t convert_to_float32x4x4(const uint8x16_t &in)
-{
-    return convert_uint8x16_to_float32x4x4(in);
-}
-
-template <>
-inline float32x4x4_t convert_to_float32x4x4(const int8x16_t &in)
-{
-    return convert_int8x16_to_float32x4x4(in);
-}
-
-inline void convert_float32x4x3_to_uint8x8x3(const float32x4x3_t &in1, const float32x4x3_t &in2, uint8x8x3_t &out)
-{
-    out.val[0] = vqmovn_u16(vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in1.val[0])),
-                                         vqmovn_u32(vcvtq_u32_f32(in2.val[0]))));
-    out.val[1] = vqmovn_u16(vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in1.val[1])),
-                                         vqmovn_u32(vcvtq_u32_f32(in2.val[1]))));
-    out.val[2] = vqmovn_u16(vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in1.val[2])),
-                                         vqmovn_u32(vcvtq_u32_f32(in2.val[2]))));
-}
-
-inline void convert_float32x4x4_to_uint8x16(const float32x4x4_t &in, uint8x16_t &out)
-{
-    const auto low = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in.val[0])),
-                                  vqmovn_u32(vcvtq_u32_f32(in.val[1])));
-    const auto high = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in.val[2])),
-                                   vqmovn_u32(vcvtq_u32_f32(in.val[3])));
-    out = vcombine_u8(vqmovn_u16(low), vqmovn_u16(high));
-}
-
-inline void convert_float32x4x4_to_int8x16(const float32x4x4_t &in, int8x16_t &out)
-{
-    const auto low = vcombine_s16(vqmovn_s32(vcvtq_s32_f32(in.val[0])),
-                                  vqmovn_s32(vcvtq_s32_f32(in.val[1])));
-    const auto high = vcombine_s16(vqmovn_s32(vcvtq_s32_f32(in.val[2])),
-                                   vqmovn_s32(vcvtq_s32_f32(in.val[3])));
-    out = vcombine_s8(vqmovn_s16(low), vqmovn_s16(high));
-}
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-/** Exponent polynomial coefficients */
-/** Logarithm polynomial coefficients */
-#ifndef DOXYGEN_SKIP_THIS
-inline float16x8_t vfloorq_f16(float16x8_t val)
-{
-    static const float16x8_t CONST_1 = vdupq_n_f16(1.f);
-
-    const int16x8_t   z = vcvtq_s16_f16(val);
-    const float16x8_t r = vcvtq_f16_s16(z);
-
-    return vbslq_f16(vcgtq_f16(r, val), vsubq_f16(r, CONST_1), r);
-}
-
-inline float16x8_t vroundq_rte_f16(float16x8_t val)
-{
-    return vrndnq_f16(val);
-}
-
-inline float16x4_t vinvsqrt_f16(float16x4_t x)
-{
-    float16x4_t sqrt_reciprocal = vrsqrte_f16(x);
-    sqrt_reciprocal             = vmul_f16(vrsqrts_f16(vmul_f16(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
-    sqrt_reciprocal             = vmul_f16(vrsqrts_f16(vmul_f16(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
-    return sqrt_reciprocal;
-}
-
-inline float16x8_t vinvsqrtq_f16(float16x8_t x)
-{
-    float16x8_t sqrt_reciprocal = vrsqrteq_f16(x);
-    sqrt_reciprocal             = vmulq_f16(vrsqrtsq_f16(vmulq_f16(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
-    sqrt_reciprocal             = vmulq_f16(vrsqrtsq_f16(vmulq_f16(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
-    return sqrt_reciprocal;
-}
-
-inline float16x4_t vinv_f16(float16x4_t x)
-{
-    float16x4_t recip = vrecpe_f16(x);
-    recip             = vmul_f16(vrecps_f16(x, recip), recip);
-    recip             = vmul_f16(vrecps_f16(x, recip), recip);
-    return recip;
-}
-
-inline float16x8_t vinvq_f16(float16x8_t x)
-{
-    float16x8_t recip = vrecpeq_f16(x);
-    recip             = vmulq_f16(vrecpsq_f16(x, recip), recip);
-    recip             = vmulq_f16(vrecpsq_f16(x, recip), recip);
-    return recip;
-}
-
-inline float16x8_t vtanhq_f16(float16x8_t val)
-{
-    const float16x8_t CONST_1        = vdupq_n_f16(1.f);
-    const float16x8_t CONST_2        = vdupq_n_f16(2.f);
-    const float16x8_t CONST_MIN_TANH = vdupq_n_f16(-10.f);
-    const float16x8_t CONST_MAX_TANH = vdupq_n_f16(10.f);
-
-    const float16x8_t x     = vminq_f16(vmaxq_f16(val, CONST_MIN_TANH), CONST_MAX_TANH);
-    const float16x8_t exp2x = vexpq_f16(vmulq_f16(CONST_2, x));
-    const float16x8_t num   = vsubq_f16(exp2x, CONST_1);
-    const float16x8_t den   = vaddq_f16(exp2x, CONST_1);
-    const float16x8_t tanh  = vmulq_f16(num, vinvq_f16(den));
-    return tanh;
-}
-
-inline float16x8_t vtaylor_polyq_f16(float16x8_t x, const std::array<float16x8_t, 8> &coeffs)
-{
-    const float16x8_t A   = vaddq_f16(coeffs[0], vmulq_f16(coeffs[4], x));
-    const float16x8_t B   = vaddq_f16(coeffs[2], vmulq_f16(coeffs[6], x));
-    const float16x8_t C   = vaddq_f16(coeffs[1], vmulq_f16(coeffs[5], x));
-    const float16x8_t D   = vaddq_f16(coeffs[3], vmulq_f16(coeffs[7], x));
-    const float16x8_t x2  = vmulq_f16(x, x);
-    const float16x8_t x4  = vmulq_f16(x2, x2);
-    const float16x8_t res = vaddq_f16(vaddq_f16(A, vmulq_f16(B, x2)), vmulq_f16(vaddq_f16(C, vmulq_f16(D, x2)), x4));
-    return res;
-}
-
-inline float16x8_t vexpq_f16(float16x8_t x)
-{
-    // TODO (COMPMID-1535) : Revisit FP16 approximations
-    const float32x4_t x_high = vcvt_f32_f16(vget_high_f16(x));
-    const float32x4_t x_low  = vcvt_f32_f16(vget_low_f16(x));
-
-    const float16x8_t res = vcombine_f16(vcvt_f16_f32(vexpq_f32(x_low)), vcvt_f16_f32(vexpq_f32(x_high)));
-    return res;
-}
-
-inline float16x8_t vlogq_f16(float16x8_t x)
-{
-    // TODO (COMPMID-1535) : Revisit FP16 approximations
-    const float32x4_t x_high = vcvt_f32_f16(vget_high_f16(x));
-    const float32x4_t x_low  = vcvt_f32_f16(vget_low_f16(x));
-
-    const float16x8_t res = vcombine_f16(vcvt_f16_f32(vlogq_f32(x_low)), vcvt_f16_f32(vlogq_f32(x_high)));
-    return res;
-}
-
-inline float16x8_t vpowq_f16(float16x8_t val, float16x8_t n)
-{
-    // TODO (giaiod01) - COMPMID-1535
-    float32x4_t n0_f32   = vcvt_f32_f16(vget_low_f16(n));
-    float32x4_t n1_f32   = vcvt_f32_f16(vget_high_f16(n));
-    float32x4_t val0_f32 = vcvt_f32_f16(vget_low_f16(val));
-    float32x4_t val1_f32 = vcvt_f32_f16(vget_high_f16(val));
-
-    float32x4_t res0_f32 = vexpq_f32(vmulq_f32(n0_f32, vlogq_f32(val0_f32)));
-    float32x4_t res1_f32 = vexpq_f32(vmulq_f32(n1_f32, vlogq_f32(val1_f32)));
-
-    return vcombine_f16(vcvt_f16_f32(res0_f32), vcvt_f16_f32(res1_f32));
-}
-
-inline float16x8_t vsinq_f16(float16x8_t val)
-{
-    const float32x4_t val_high = vcvt_f32_f16(vget_high_f16(val));
-    const float32x4_t val_low  = vcvt_f32_f16(vget_low_f16(val));
-
-    const float32x4_t res_high = vsinq_f32(val_high);
-    const float32x4_t res_low  = vsinq_f32(val_low);
-
-    return vcombine_f16(vcvt_f16_f32(res_low), vcvt_f16_f32(res_high));
-}
-
-inline float16x4_t vsin_f16(float16x4_t val)
-{
-    const float32x4_t val_f32  = vcvt_f32_f16(val);
-    const float32x2_t val_high = vget_high_f32(val_f32);
-    const float32x2_t val_low  = vget_low_f32(val_f32);
-
-    const float32x2_t res_high = vsin_f32(val_high);
-    const float32x2_t res_low  = vsin_f32(val_low);
-
-    return vcvt_f16_f32(vcombine_f32(res_low, res_high));
-}
-
-#endif /* DOXYGEN_SKIP_THIS */
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-} // namespace arm_compute
diff --git a/arm_compute/core/NEON/NESymm.h b/arm_compute/core/NEON/NESymm.h
deleted file mode 100644
index d6c5a7073a..0000000000
--- a/arm_compute/core/NEON/NESymm.h
+++ /dev/null
@@ -1,256 +0,0 @@
-/*
- * Copyright (c) 2019-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NESYMM_H
-#define ARM_COMPUTE_NESYMM_H
-
-#include "arm_compute/core/NEON/NEMath.h"
-#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-using qsymm8_t  = int8_t;  /**< 8 bit quantized symmetric scalar value */
-using qsymm16_t = int16_t; /**< 16 bit quantized symmetric scalar value */
-
-using qsymm16x8_t   = int16x8_t;   /**< 16 bit quantized symmetric vector with 8 elements */
-using qsymm16x8x2_t = int16x8x2_t; /**< 16 bit quantized symmetric vector with 16 elements */
-
-/** Performs final quantization step on 8 signed 16-bit elements
- *
- * @tparam is_bounded_relu Specified if a fused bounded relu should be applied
- *
- * @param[in] in_s32                       Input to be quantized.
- * @param[in] result_fixedpoint_multiplier Result multiplier parameter
- * @param[in] result_shift                 Result shift parameter
- * @param[in] min_s16                      Relu lower bound
- * @param[in] max_s16                      Relu upper bound
- *
- * @return Quantized values
- */
-template <bool is_bounded_relu>
-int16x8_t finalize_quantization_int16(int32x4x2_t &in_s32,
-                                      int          result_fixedpoint_multiplier,
-                                      int32_t      result_shift,
-                                      int16x8_t    min_s16,
-                                      int16x8_t    max_s16)
-{
-    if(result_shift < 0)
-    {
-        in_s32.val[0] = vmulq_n_s32(in_s32.val[0], (1 << -result_shift));
-        in_s32.val[1] = vmulq_n_s32(in_s32.val[1], (1 << -result_shift));
-
-        in_s32.val[0] = vqrdmulhq_n_s32(in_s32.val[0], result_fixedpoint_multiplier);
-        in_s32.val[1] = vqrdmulhq_n_s32(in_s32.val[1], result_fixedpoint_multiplier);
-    }
-    else
-    {
-        // Fixed point multiplication with vector saturating rounding doubling multiply high with scalar
-        in_s32.val[0] = vqrdmulhq_n_s32(in_s32.val[0], result_fixedpoint_multiplier);
-        in_s32.val[1] = vqrdmulhq_n_s32(in_s32.val[1], result_fixedpoint_multiplier);
-        // Round to the nearest division by a power-of-two using result_shift_s32
-        in_s32.val[0] = rounding_divide_by_pow2(in_s32.val[0], result_shift);
-        in_s32.val[1] = rounding_divide_by_pow2(in_s32.val[1], result_shift);
-    }
-
-    // Convert S32 to S16
-    int16x8_t out_s16 = vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1]));
-
-    if(is_bounded_relu)
-    {
-        out_s16 = vmaxq_s16(out_s16, min_s16);
-        out_s16 = vminq_s16(out_s16, max_s16);
-    }
-
-    return out_s16;
-}
-
-/** Performs final quantization step on single signed 16-bit element
- *
- * @tparam is_bounded_relu Specified if a fused bounded relu should be applied
- *
- * @param[in] in_value                     Input to be quantized.
- * @param[in] result_fixedpoint_multiplier Result multiplier parameter
- * @param[in] result_shift                 Result shift parameter
- * @param[in] min_s16                      Relu lower bound
- * @param[in] max_s16                      Relu upper bound
- *
- * @return Quantized values
- */
-template <bool is_bounded_relu>
-inline int16_t finalize_quantization_int16(int32_t in_value, int result_fixedpoint_multiplier,
-                                           int32_t result_shift, int16_t min_s16, int16_t max_s16)
-{
-    if(result_shift < 0)
-    {
-        const int64_t in_64 = static_cast<int64_t>(in_value) * (1 << (-result_shift)) * static_cast<int64_t>(result_fixedpoint_multiplier);
-        in_value            = static_cast<int32_t>((in_64 + (1 << 30)) >> 31);
-    }
-    else
-    {
-        // Fixed point multiplication with vector saturating rounding doubling multiply high with scalar
-        const int64_t in_64 = static_cast<int64_t>(in_value) * static_cast<int64_t>(result_fixedpoint_multiplier);
-        // Shift value by result_shift_s32
-        in_value = rounding_divide_by_pow2(static_cast<int32_t>((in_64 + (1 << 30)) >> 31), result_shift);
-    }
-
-    // Bound the result
-    int16_t out_s16 = static_cast<int16_t>(std::max<int32_t>(-32768, std::min<int32_t>(32767, in_value)));
-
-    if(is_bounded_relu)
-    {
-        out_s16 = static_cast<int16_t>(std::max(min_s16, std::min(max_s16, out_s16)));
-    }
-
-    return out_s16;
-}
-
-/** Dequantize a neon vector holding 8 16-bit quantized values.
- *
- * @param[in] qv    Input values to be dequantized.
- * @param[in] scale Quantization scale
- *
- * @return Dequantized values in a neon vector
- */
-inline float32x4x2_t vdequantize_int16(const int16x8_t &qv, float scale)
-{
-    const float32x4_t   vscale = vdupq_n_f32(scale);
-    const float32x4x2_t vdequantized_input =
-    {
-        {
-            vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(qv))), vscale),
-            vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(qv))), vscale)
-        }
-    };
-    return vdequantized_input;
-}
-
-/** Quantize a neon vector holding 8 floating point values.
- *
- * @param[in] qv    Input values to be quantized.
- * @param[in] scale Quantization scale
- *
- * @return A neon vector holding the quantized values
- */
-inline int16x8_t vquantize_int16(const float32x4x2_t &qv, float scale)
-{
-    const float32x4_t vinvscale = vdupq_n_f32(1.f / scale);
-
-    const int32x4x2_t rf =
-    {
-        {
-#ifdef __aarch64__
-            vcvtnq_s32_f32(vmulq_f32(qv.val[0], vinvscale)),
-            vcvtnq_s32_f32(vmulq_f32(qv.val[1], vinvscale))
-#else  //__aarch64__
-            vcvtq_s32_f32(vmulq_f32(qv.val[0], vinvscale)),
-            vcvtq_s32_f32(vmulq_f32(qv.val[1], vinvscale))
-#endif //__aarch64__
-        }
-    };
-    return vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1]));
-}
-
-/** Dequantize a neon vector holding 16 16-bit quantized values.
- *
- * @param[in] qv Input values to be dequantized.
- * @param[in] qi Quantization information to be used in the computation.
- *
- * @return Dequantized values in a neon vector
- */
-inline float32x4x4_t vdequantize(const int16x8x2_t &qv, const UniformQuantizationInfo &qi)
-{
-    const float         scale  = qi.scale;
-    const float32x4_t   vscale = vdupq_n_f32(scale);
-    const float32x4x4_t vdequantized_input =
-    {
-        {
-            vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(qv.val[0]))), vscale),
-            vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(qv.val[0]))), vscale),
-            vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(qv.val[1]))), vscale),
-            vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(qv.val[1]))), vscale),
-        }
-    };
-    return vdequantized_input;
-}
-
-/** Quantize a neon vector holding 16 floating point values.
- *
- * @param[in] qv Input values to be quantized.
- * @param[in] qi Quantization information to be used in the computation.
- *
- * @return A neon vector holding the quantized values
- */
-inline qsymm16x8x2_t vquantize_qsymm16(const float32x4x4_t &qv, const UniformQuantizationInfo &qi)
-{
-    const float scale = qi.scale;
-    ARM_COMPUTE_ERROR_ON(scale == 0.f);
-    const float32x4_t vinvscale = vdupq_n_f32(1.f / scale);
-    const int32x4x4_t rf =
-    {
-        {
-#ifdef __aarch64__
-            vcvtnq_s32_f32(vmulq_f32(qv.val[0], vinvscale)),
-            vcvtnq_s32_f32(vmulq_f32(qv.val[1], vinvscale)),
-            vcvtnq_s32_f32(vmulq_f32(qv.val[2], vinvscale)),
-            vcvtnq_s32_f32(vmulq_f32(qv.val[3], vinvscale)),
-#else  //__aarch64__
-            vcvtq_s32_f32(vmulq_f32(qv.val[0], vinvscale)),
-            vcvtq_s32_f32(vmulq_f32(qv.val[1], vinvscale)),
-            vcvtq_s32_f32(vmulq_f32(qv.val[2], vinvscale)),
-            vcvtq_s32_f32(vmulq_f32(qv.val[3], vinvscale)),
-#endif //__aarch64__
-        }
-    };
-    const qsymm16x8x2_t res =
-    {
-        vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])),
-        vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3])),
-    };
-
-    return res;
-}
-
-/** Multiply a neon vector using quantized multiplier and shift
- *
- * @param[in] input Input vector to mutiply values to be quantized.
- * @param[in] qmul  Quantized multipler
- * @param[in] shift Left bit shift
- *
- * @return A neon vector holding the multiplied value
- */
-inline int32x4x2_t multiply_by_quantized_multiplier_2row(int32x4x2_t input, int32_t qmul, int32_t shift)
-{
-    const auto left_shift  = shift > 0 ? shift : 0;
-    const auto right_shift = shift > 0 ? 0 : -shift;
-    const auto one_shifted = 1 << left_shift;
-
-    int32x4x2_t result;
-    result.val[0] = rounding_divide_by_pow2(vqrdmulhq_n_s32(vmulq_n_s32(input.val[0], one_shifted), qmul), right_shift);
-    result.val[1] = rounding_divide_by_pow2(vqrdmulhq_n_s32(vmulq_n_s32(input.val[1], one_shifted), qmul), right_shift);
-
-    return result;
-}
-
-} // namespace arm_compute
-#endif // ARM_COMPUTE_NESYMM_H
diff --git a/arm_compute/core/NEON/kernels/NEAbsoluteDifferenceKernel.h b/arm_compute/core/NEON/kernels/NEAbsoluteDifferenceKernel.h
deleted file mode 100644
index 7d35e40284..0000000000
--- a/arm_compute/core/NEON/kernels/NEAbsoluteDifferenceKernel.h
+++ /dev/null
@@ -1,86 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEABSOLUTEDIFFERENCEKERNEL_H
-#define ARM_COMPUTE_NEABSOLUTEDIFFERENCEKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for the absolute difference kernel
- *
- * Absolute difference is computed by:
- * @f[ output(x,y) = | input1(x,y) - input2(x,y) | @f]
- */
-class NEAbsoluteDifferenceKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEAbsoluteDifferenceKernel";
-    }
-    /** Default constructor */
-    NEAbsoluteDifferenceKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEAbsoluteDifferenceKernel(const NEAbsoluteDifferenceKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEAbsoluteDifferenceKernel &operator=(const NEAbsoluteDifferenceKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NEAbsoluteDifferenceKernel(NEAbsoluteDifferenceKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NEAbsoluteDifferenceKernel &operator=(NEAbsoluteDifferenceKernel &&) = default;
-    /** Default destructor */
-    ~NEAbsoluteDifferenceKernel() = default;
-
-    /** Set the inputs and output tensors
-     *
-     * @param[in]  input1 Source tensor. Data types supported: U8/S16
-     * @param[in]  input2 Source tensor. Data types supported: U8/S16
-     * @param[out] output Destination tensor, Data types supported: U8/S16
-     */
-    void configure(const ITensor *input1, const ITensor *input2, ITensor *output);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-
-private:
-    /** Common signature for all the specialised absolute difference functions
-     *
-     * @param[in]  input1 An input tensor. Data types supported: U8/S16.
-     * @param[in]  input2 An input tensor. Data types supported: U8/S16.
-     * @param[out] output The output tensor, Data types supported: U8 (Only if both inputs are U8), S16.
-     * @param[in]  window Region on which to execute the kernel.
-     */
-    using AbsDiffFunction = void(const ITensor *input1, const ITensor *input2, ITensor *output, const Window &window);
-
-    /** Absolute difference function to use for the particular tensor formats passed to configure() */
-    AbsDiffFunction *_func;
-    const ITensor   *_input1;
-    const ITensor   *_input2;
-    ITensor         *_output;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NEABSOLUTEDIFFERENCEKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NEAccumulateKernel.h b/arm_compute/core/NEON/kernels/NEAccumulateKernel.h
deleted file mode 100644
index 367385dd7a..0000000000
--- a/arm_compute/core/NEON/kernels/NEAccumulateKernel.h
+++ /dev/null
@@ -1,139 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEACCUMULATEKERNEL_H
-#define ARM_COMPUTE_NEACCUMULATEKERNEL_H
-
-#include "arm_compute/core/NEON/INESimpleKernel.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for the accumulate kernel
- *
- * Accumulation is computed by:
- * @f[ accum(x,y) = accum(x,y) + input(x,y) @f]
- */
-class NEAccumulateKernel : public INESimpleKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEAccumulateKernel";
-    }
-    /** Set the input and accumulation tensors
-     *
-     * @param[in]  input Source tensor. Data type supported: U8.
-     * @param[out] accum Destination tensor. Data type supported: S16.
-     */
-    void configure(const ITensor *input, ITensor *accum);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-};
-
-/** Interface for the accumulate weighted kernel
- *
- * Weighted accumulation is computed:
- * @f[ accum(x,y) = (1 - \alpha)*accum(x,y) + \alpha*input(x,y) @f]
- *
- * Where @f$ 0 \le \alpha \le 1 @f$
- * Conceptually, the rounding for this is defined as:
- * @f[ output(x,y)= uint8( (1 - \alpha) * float32( int32( output(x,y) ) ) + \alpha * float32( int32( input(x,y) ) ) ) @f]
-*/
-class NEAccumulateWeightedKernel : public INESimpleKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEAccumulateWeightedKernel";
-    }
-    /** Default constructor */
-    NEAccumulateWeightedKernel();
-    /** Set the input and accumulation tensors, and the scale value
-     *
-     * @param[in]     input Source tensor. Data type supported: U8.
-     * @param[in]     alpha Scalar value in the range [0.0f, 1.0f]
-     * @param[in,out] accum Accumulated tensor. Data type supported: U8.
-     */
-    void configure(const ITensor *input, float alpha, ITensor *accum);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-
-protected:
-    float _alpha;
-};
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-/** Interface for the accumulate weighted kernel using F16 */
-class NEAccumulateWeightedFP16Kernel : public NEAccumulateWeightedKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEAccumulateWeightedFP16Kernel";
-    }
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-};
-#else  /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-/** Interface for the accumulate weighted kernel using F16 */
-using NEAccumulateWeightedFP16Kernel = NEAccumulateWeightedKernel;
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-
-/** Interface for the accumulate squared kernel
- *
- * The accumulation of squares is computed:
- * @f[ accum(x,y) = saturate_{int16} ( (uint16) accum(x,y) + (((uint16)(input(x,y)^2)) >> (shift)) ) @f]
- *
- * Where @f$ 0 \le shift \le 15 @f$
-*/
-class NEAccumulateSquaredKernel : public INESimpleKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEAccumulateSquaredKernel";
-    }
-    /** Default constructor */
-    NEAccumulateSquaredKernel();
-    /** Set the input and accumulation tensors and the shift value.
-     *
-     * @param[in]     input Source tensor. Data type supported: U8.
-     * @param[in]     shift Shift value in the range of [0, 15]
-     * @param[in,out] accum Accumulated tensor. Data type supported: S16.
-     */
-    void configure(const ITensor *input, uint32_t shift, ITensor *accum);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-
-private:
-    uint32_t _shift;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEACCUMULATEKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NEActivationLayerKernel.h b/arm_compute/core/NEON/kernels/NEActivationLayerKernel.h
deleted file mode 100644
index 82103b988b..0000000000
--- a/arm_compute/core/NEON/kernels/NEActivationLayerKernel.h
+++ /dev/null
@@ -1,120 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEACTIVATIONLAYERKERNEL_H
-#define ARM_COMPUTE_NEACTIVATIONLAYERKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-#include "arm_compute/core/utils/misc/Traits.h"
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-#include <arm_fp16.h>
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for the activation layer kernel. */
-class NEActivationLayerKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEActivationLayerKernel";
-    }
-    /** Constructor */
-    NEActivationLayerKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEActivationLayerKernel(const NEActivationLayerKernel &) = delete;
-    /** Default move constructor */
-    NEActivationLayerKernel(NEActivationLayerKernel &&) = default;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEActivationLayerKernel &operator=(const NEActivationLayerKernel &) = delete;
-    /** Default move assignment operator */
-    NEActivationLayerKernel &operator=(NEActivationLayerKernel &&) = default;
-    /** Set the input and output tensor.
-     *
-     * @note If the output tensor is a nullptr, the activation function will be performed in-place
-     *
-     * @param[in, out] input           Source tensor. In case of @p output tensor = nullptr, this tensor will store the result
-     *                                 of the activation function. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM16/F16/F32.
-     * @param[out]     output          Destination tensor. Data type supported: same as @p input
-     * @param[in]      activation_info Activation layer information.
-     */
-    void configure(ITensor *input, ITensor *output, ActivationLayerInfo activation_info);
-    /** Static function to check if given info will lead to a valid configuration of @ref NEActivationLayerKernel
-     *
-     * @param[in] input    Source tensor info. In case of @p output tensor info = nullptr, this tensor will store the result
-     *                     of the activation function. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM16/F16/F32.
-     * @param[in] output   Destination tensor info. Data type supported: same as @p input
-     * @param[in] act_info Activation layer information.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-
-private:
-    using ActivationFunction = ActivationLayerInfo::ActivationFunction;
-    /** Common signature for all the specialised @ref NEActivationLayerKernel functions
-     *
-     * @param[in] window Region on which to execute the kernel.
-     */
-    using ActivationFunctionExecutorPtr = void (NEActivationLayerKernel::*)(const Window &window);
-    /** Function to apply an activation function on a tensor.
-     *
-     * @param[in] window Region on which to execute the kernel
-     */
-    template <ActivationLayerInfo::ActivationFunction F, typename T>
-    typename std::enable_if<arm_compute::utils::traits::is_floating_point<T>::value, void>::type
-    activation(const Window &window);
-    /** Function to apply an activation function on a tensor.
-     *
-     * @param[in] window Region on which to execute the kernel
-     */
-    template <ActivationLayerInfo::ActivationFunction F, typename T>
-    typename std::enable_if<std::is_same<T, qasymm8_t>::value, void>::type activation(const Window &window);
-    /** Function to apply an activation function on a tensor.
-     *
-     * @param[in] window Region on which to execute the kernel
-     */
-    template <ActivationLayerInfo::ActivationFunction F, typename T>
-    typename std::enable_if<std::is_same<T, qasymm8_signed_t>::value, void>::type activation(const Window &window);
-    /** Function to apply an activation function on a tensor.
-     *
-     * @param[in] window Region on which to execute the kernel
-     */
-    template <ActivationLayerInfo::ActivationFunction F, typename T>
-    typename std::enable_if<std::is_same<T, qsymm16_t>::value, void>::type activation(const Window &window);
-
-private:
-    ITensor                      *_input;
-    ITensor                      *_output;
-    ActivationFunctionExecutorPtr _func;
-    ActivationLayerInfo           _act_info;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEACTIVATIONLAYERKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h b/arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h
deleted file mode 100644
index 36d257b886..0000000000
--- a/arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h
+++ /dev/null
@@ -1,108 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEARITHMETICADDITIONKERNEL_H
-#define ARM_COMPUTE_NEARITHMETICADDITIONKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for the kernel to perform addition between two tensors */
-class NEArithmeticAdditionKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEArithmeticAdditionKernel";
-    }
-    /** Default constructor */
-    NEArithmeticAdditionKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEArithmeticAdditionKernel(const NEArithmeticAdditionKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEArithmeticAdditionKernel &operator=(const NEArithmeticAdditionKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NEArithmeticAdditionKernel(NEArithmeticAdditionKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NEArithmeticAdditionKernel &operator=(NEArithmeticAdditionKernel &&) = default;
-    /** Default destructor */
-    ~NEArithmeticAdditionKernel() = default;
-
-    /** Initialise the kernel's input, output and border mode.
-     *
-     * Valid configurations (Input1,Input2) -> Output :
-     *
-     *   - (U8,U8)           -> U8
-     *   - (U8,U8)           -> S16
-     *   - (S16,U8)          -> S16
-     *   - (U8,S16)          -> S16
-     *   - (S16,S16)         -> S16
-     *   - (F16,F16)         -> F16
-     *   - (F32,F32)         -> F32
-     *   - (QASYMM8,QASYMM8) -> QASYMM8
-     *   - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED
-     *   - (QSYMM16,QSYMM16) -> QSYMM16
-     *
-     * @param[in]  input1 An input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32
-     * @param[in]  input2 An input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32
-     * @param[out] output The output tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32.
-     * @param[in]  policy Overflow policy.
-     */
-    void configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy);
-    /** Static function to check if given info will lead to a valid configuration of @ref NEArithmeticAdditionKernel
-     *
-     * @param[in] input1 An input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32
-     * @param[in] input2 An input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32
-     * @param[in] output The output tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32.
-     * @param[in] policy Overflow policy.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-
-private:
-    /** Common signature for all the specialised add functions
-     *
-     * @param[in]  input1 An input tensor. Data types supported: U8/QASYMM8/S16/QSYMM16/F16/F32
-     * @param[in]  input2 An input tensor. Data types supported: U8/QASYMM8/S16/QSYMM16/F16/F32
-     * @param[out] output The output tensor. Data types supported: U8/QASYMM8/S16/QSYMM16/F16/F32.
-     * @param[in]  policy Overflow policy.
-     * @param[in]  window Region on which to execute the kernel.
-     */
-    using AddFunction = void(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy, const Window &window);
-    /** Add function to use for the particular tensor types passed to configure() */
-    AddFunction   *_func;
-    const ITensor *_input1;
-    const ITensor *_input2;
-    ITensor       *_output;
-    ConvertPolicy  _policy;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEARITHMETICADDITIONKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h b/arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h
deleted file mode 100644
index f75c6bfb98..0000000000
--- a/arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h
+++ /dev/null
@@ -1,109 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEARITHMETICSUBTRACTIONKERNEL_H
-#define ARM_COMPUTE_NEARITHMETICSUBTRACTIONKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for the kernel to perform subtraction between two tensors */
-class NEArithmeticSubtractionKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEArithmeticSubtractionKernel";
-    }
-    /** Default constructor */
-    NEArithmeticSubtractionKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEArithmeticSubtractionKernel(const NEArithmeticSubtractionKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEArithmeticSubtractionKernel &operator=(const NEArithmeticSubtractionKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NEArithmeticSubtractionKernel(NEArithmeticSubtractionKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NEArithmeticSubtractionKernel &operator=(NEArithmeticSubtractionKernel &&) = default;
-    /** Default destructor */
-    ~NEArithmeticSubtractionKernel() = default;
-
-    /** Initialise the kernel's input and output.
-     *
-     * Valid configurations (Input1,Input2) -> Output :
-     *
-     *   - (U8,U8)                          -> U8
-     *   - (U8,U8)                          -> S16
-     *   - (QASYMM8, QASYMM8)               -> QASYMM8
-     *   - (QASYMM8_SIGNED, QASYMM8_SIGNED) -> QASYMM8_SIGNED
-     *   - (S16,U8)                         -> S16
-     *   - (U8,S16)                         -> S16
-     *   - (S16,S16)                        -> S16
-     *   - (F16,F16)                        -> F16
-     *   - (F32,F32)                        -> F32
-     *
-     * @param[in]  input1 An input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/F16/F32
-     * @param[in]  input2 An input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/F16/F32
-     * @param[out] output The output tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/F16/F32.
-     * @param[in]  policy Overflow policy. Convert policy cannot be WRAP if datatype is quantized.
-     */
-    void configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy);
-    /** Static function to check if given info will lead to a valid configuration of @ref NEArithmeticSubtractionKernel
-     *
-     * @note Convert policy cannot be WRAP if datatype is QASYMM8
-     *
-     * @param[in] input1 An input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/F16/F32
-     * @param[in] input2 An input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/F16/F32
-     * @param[in] output The output tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/F16/F32.
-     * @param[in] policy Policy to use to handle overflow. Convert policy cannot be WRAP if datatype is quantized.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-
-private:
-    /** Common signature for all the specialised sub functions
-     *
-     * @param[in]  input1 An input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/F16/F32
-     * @param[in]  input2 An input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/F16/F32
-     * @param[out] output The output tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/F16/F32.
-     * @param[in]  window Region on which to execute the kernel.
-     * @param[in]  is_sat Flag to indicate if the policy is SATURATE.
-     */
-    using SubFunction = void(const ITensor *input1, const ITensor *input2, ITensor *output, const Window &window, bool is_sat);
-    /** Sub function to use for the particular tensor types passed to configure() */
-    SubFunction   *_func;
-    const ITensor *_input1;
-    const ITensor *_input2;
-    ITensor       *_output;
-    ConvertPolicy  _policy;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NEARITHMETICSUBTRACTIONKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NEBatchConcatenateLayerKernel.h b/arm_compute/core/NEON/kernels/NEBatchConcatenateLayerKernel.h
deleted file mode 100644
index f943744ba0..0000000000
--- a/arm_compute/core/NEON/kernels/NEBatchConcatenateLayerKernel.h
+++ /dev/null
@@ -1,91 +0,0 @@
-/*
- * Copyright (c) 2019-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef ARM_COMPUTE_NEBATCHCONCATENATEKERNEL_H
-#define ARM_COMPUTE_NEBATCHCONCATENATEKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-// Forward declarations
-class ITensor;
-
-/** Interface for the batch concatenate kernel.
- *  The input tensor will be concatenated into the output tensor.
- */
-class NEBatchConcatenateLayerKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEBatchConcatenateLayerKernel";
-    }
-    /** Default constructor */
-    NEBatchConcatenateLayerKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEBatchConcatenateLayerKernel(const NEBatchConcatenateLayerKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEBatchConcatenateLayerKernel &operator=(const NEBatchConcatenateLayerKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NEBatchConcatenateLayerKernel(NEBatchConcatenateLayerKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NEBatchConcatenateLayerKernel &operator=(NEBatchConcatenateLayerKernel &&) = default;
-    /** Default destructor */
-    ~NEBatchConcatenateLayerKernel() = default;
-    /** Initialise the kernel's inputs and output
-     *
-     * @param[in]     input        Input tensor. Data types supported: All.
-     * @param[in]     batch_offset The offset on axis # 3.
-     * @param[in,out] output       Output tensor. Data types supported: Same as @p input.
-     *
-     * @note: The output tensor's low two dimensions can't be smaller than the input one's.
-     * @note: The gaps between the two lowest dimensions of input and output need to be divisible by 2.
-     *
-     */
-    void configure(const ITensor *input, unsigned int batch_offset, ITensor *output);
-    /**  Static function to check if given info will lead to a valid configuration of @ref NEBatchConcatenateLayerKernel
-     *
-     * @param[in] input        Input tensor info. Data types supported: All.
-     * @param[in] batch_offset The offset on axis # 3.
-     * @param[in] output       Output tensor info. Data types supported: Same as @p input.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, unsigned int batch_offset, const ITensorInfo *output);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-
-private:
-    using BatchConcatFunction = void(const ITensor *in, ITensor *out, unsigned int batch_offset, const Window &window);
-
-private:
-    BatchConcatFunction *_func;
-    const ITensor       *_input;
-    ITensor             *_output;
-    unsigned int         _batch_offset;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NEBATCHCONCATENATEKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NEBatchNormalizationLayerKernel.h b/arm_compute/core/NEON/kernels/NEBatchNormalizationLayerKernel.h
deleted file mode 100644
index d59ed7baf0..0000000000
--- a/arm_compute/core/NEON/kernels/NEBatchNormalizationLayerKernel.h
+++ /dev/null
@@ -1,152 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEBATCHNORMALIZATIONLAYERKERNEL_H
-#define ARM_COMPUTE_NEBATCHNORMALIZATIONLAYERKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for the batch normalization layer kernel.
- */
-class NEBatchNormalizationLayerKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEBatchNormalizationLayerKernel";
-    }
-    /** Default constructor */
-    NEBatchNormalizationLayerKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEBatchNormalizationLayerKernel(const NEBatchNormalizationLayerKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEBatchNormalizationLayerKernel &operator=(const NEBatchNormalizationLayerKernel &) = delete;
-    /** Default Move Constructor. */
-    NEBatchNormalizationLayerKernel(NEBatchNormalizationLayerKernel &&) = default;
-    /** Default move assignment operator */
-    NEBatchNormalizationLayerKernel &operator=(NEBatchNormalizationLayerKernel &&) = default;
-    /** Default destructor */
-    ~NEBatchNormalizationLayerKernel() = default;
-    /** Set the input and output tensors.
-     *
-     * @note If the output tensor is a nullptr, the batch normalization function will be performed in-place
-     *
-     * @param[in, out] input    Source tensor. In case of @p output tensor = nullptr, this tensor will store the result.
-     *                          3 lower dimensions represent a single input with dimensions [width, height, FM].
-     *                          The rest are optional and used for representing batches. Data types supported: F16/F32.
-     * @param[out]     output   Destination tensor. Output will have the same number of dimensions as input. Data type supported: same as @p input
-     * @param[in]      mean     Mean values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
-     * @param[in]      var      Variance values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
-     * @param[in]      beta     (Optional) Beta values tensor info. 1 dimension with size equal to the feature maps [FM]. If not provided, default value for beta is 0. Data types supported: Same as @p input
-     * @param[in]      gamma    (Optional) Gamma values tensor info. 1 dimension with size equal to the feature maps [FM]. If not provided, default value for gamma is 1. Data types supported: Same as @p input
-     * @param[in]      epsilon  (Optional) Small value to avoid division with zero. Default value is 0.001f.
-     * @param[in]      act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported.
-     */
-    void configure(ITensor *input, ITensor *output, const ITensor *mean, const ITensor *var, const ITensor *beta = nullptr, const ITensor *gamma = nullptr, float epsilon = 0.001f,
-                   ActivationLayerInfo act_info = ActivationLayerInfo());
-    /** Static function to check if given info will lead to a valid configuration of @ref NEBatchNormalizationLayerKernel
-     *
-     * @param[in] input    Source tensor info. In case of @p output tensor = nullptr, this tensor will store the result.
-     *                     3 lower dimensions represent a single input with dimensions [width, height, FM].
-     *                     The rest are optional and used for representing batches. Data types supported: F16/F32.
-     * @param[in] output   Destination tensor info. Output will have the same number of dimensions as input. Data type supported: same as @p input
-     * @param[in] mean     Mean values tensor info. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
-     * @param[in] var      Variance values tensor info. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
-     * @param[in] beta     (Optional) Beta values tensor info. 1 dimension with size equal to the feature maps [FM]. If not provided, default value for beta is 0. Data types supported: Same as @p input
-     * @param[in] gamma    (Optional) Gamma values tensor info. 1 dimension with size equal to the feature maps [FM]. If not provided, default value for gamma is 1. Data types supported: Same as @p input
-     * @param[in] epsilon  (Optional) Small value to avoid division with zero. Default value is 0.001f.
-     * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output,
-                           const ITensorInfo *mean, const ITensorInfo *var,
-                           const ITensorInfo *beta = nullptr, const ITensorInfo *gamma = nullptr,
-                           float epsilon = 0.001f, ActivationLayerInfo act_info = ActivationLayerInfo());
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-
-private:
-    /** Configure execution function in case of non-fused activation **/
-    void configure_non_fused();
-    /** Configure execution function in case of fused activation **/
-    void configure_fused();
-
-    /** Template function to run batch normalization on fp16
-     *
-     * @tparam fused_activation Boolean that flags if its a fused activation or not
-     *
-     * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
-     */
-    template <bool fused_activation, typename F>
-    void batch_normalization_fp16_nchw(const Window &window);
-    /** Template function to run batch normalization on fp16 on tensors with NHWC format
-     *
-     * @tparam fused_activation Boolean that flags if its a fused activation or not
-     *
-     * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
-     */
-    template <bool fused_activation, typename F>
-    void batch_normalization_fp16_nhwc(const Window &window);
-    /** Template function to run batch normalization on fp32
-     *
-     * @tparam fused_activation Boolean that flags if its a fused activation or not
-     * @tparam F                Activation function functor to run
-     *
-     * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
-     */
-    template <bool fused_activation, typename F>
-    void batch_normalization_fp32_nchw(const Window &window);
-    /** Template function to run batch normalization on fp32 on tensors with NHWC format
-     *
-     * @tparam fused_activation Boolean that flags if its a fused activation or not
-     * @tparam F                Activation function functor to run
-     *
-     * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
-     */
-    template <bool fused_activation, typename F>
-    void batch_normalization_fp32_nhwc(const Window &window);
-    /** Common signature for all the batch normalization functions
-     *
-     * @param[in] window Region on which to execute the kernel.
-     */
-    using BatchNormFunctionPtr = void (NEBatchNormalizationLayerKernel::*)(const Window &window);
-
-private:
-    BatchNormFunctionPtr _func;
-    ITensor             *_input;
-    ITensor             *_output;
-    const ITensor       *_mean;
-    const ITensor       *_var;
-    const ITensor       *_gamma;
-    const ITensor       *_beta;
-    float                _epsilon;
-    ActivationLayerInfo  _act_info;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEBATCHNORMALIZATIONLAYERKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NEBatchToSpaceLayerKernel.h b/arm_compute/core/NEON/kernels/NEBatchToSpaceLayerKernel.h
deleted file mode 100644
index 61e47b0ea4..0000000000
--- a/arm_compute/core/NEON/kernels/NEBatchToSpaceLayerKernel.h
+++ /dev/null
@@ -1,101 +0,0 @@
-/*
- * Copyright (c) 2019-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEBATCHTOSPACELAYERKERNEL_H
-#define ARM_COMPUTE_NEBATCHTOSPACELAYERKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for the batch to space kernel */
-class NEBatchToSpaceLayerKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEBatchToSpaceLayerKernel";
-    }
-    /** Default constructor */
-    NEBatchToSpaceLayerKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEBatchToSpaceLayerKernel(const NEBatchToSpaceLayerKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEBatchToSpaceLayerKernel &operator=(const NEBatchToSpaceLayerKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NEBatchToSpaceLayerKernel(NEBatchToSpaceLayerKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NEBatchToSpaceLayerKernel &operator=(NEBatchToSpaceLayerKernel &&) = default;
-    /** Default destructor */
-    ~NEBatchToSpaceLayerKernel() = default;
-    /** Initialise the kernel's inputs and output.
-     *
-     * @param[in]  input       Tensor input. Supported tensor rank: 4. Data types supported: All.
-     * @param[in]  block_shape 1-D tensor with shape [M]. Data types supported: S32
-     * @param[out] output      Tensor output. Data types supported: same as @p input
-     */
-    void configure(const ITensor *input, const ITensor *block_shape, ITensor *output);
-    /** Initialise the kernel's inputs and output (Static block shape).
-     *
-     * @param[in]  input         Tensor input. Supported tensor rank: 4. Data types supported: All.
-     * @param[in]  block_shape_x Block shape x value.
-     * @param[in]  block_shape_y Block shape y value.
-     * @param[out] output        Tensor output. Data types supported: same as @p input
-     */
-    void configure(const ITensor *input, const int32_t block_shape_x, const int32_t block_shape_y, ITensor *output);
-    /** Static function to check if given info will lead to a valid configuration of @ref NEBatchToSpaceLayerKernel
-     *
-     * @param[in] input       Tensor input. Supported tensor rank: 4. Data types supported: All.
-     * @param[in] block_shape 1-D tensor with shape [M]. Data types supported: S32
-     * @param[in] output      Tensor output. Data types supported: same as @p input
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *output);
-    /** Static function to check if given info will lead to a valid configuration of @ref NEBatchToSpaceLayerKernel (Static block shape).
-     *
-     * @param[in] input         Tensor input. Supported tensor rank: 4. Data types supported: All.
-     * @param[in] block_shape_x Block shape x value.
-     * @param[in] block_shape_y Block shape y value.
-     * @param[in] output        Tensor output. Data types supported: same as @p input
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const int32_t block_shape_x, const int32_t block_shape_y, const ITensorInfo *output);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-
-private:
-    const ITensor *_input;       /**< Source tensor */
-    const ITensor *_block_shape; /**< Block shape tensor */
-    ITensor       *_output;      /**< Destination tensor */
-    DataLayout     _data_layout; /**< Data layout to  be used at run-time */
-
-    int32_t _block_shape_x;
-    int32_t _block_shape_y;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NEBATCHTOSPACELAYERKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NEBitwiseAndKernel.h b/arm_compute/core/NEON/kernels/NEBitwiseAndKernel.h
deleted file mode 100644
index 7a777678dc..0000000000
--- a/arm_compute/core/NEON/kernels/NEBitwiseAndKernel.h
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEBITWISEANDKERNEL_H
-#define ARM_COMPUTE_NEBITWISEANDKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for the kernel to perform bitwise AND between XY-planes of two tensors
- *
- * Result is computed by:
- * @f[ output(x,y) = input1(x,y) \land input2(x,y) @f]
- */
-class NEBitwiseAndKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEBitwiseAndKernel";
-    }
-    /** Default constructor */
-    NEBitwiseAndKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEBitwiseAndKernel(const NEBitwiseAndKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEBitwiseAndKernel &operator=(const NEBitwiseAndKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NEBitwiseAndKernel(NEBitwiseAndKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NEBitwiseAndKernel &operator=(NEBitwiseAndKernel &&) = default;
-    /** Initialise the kernel's inputs and output
-     *
-     * @param[in]  input1 An input tensor. Data type supported: U8.
-     * @param[in]  input2 An input tensor. Data type supported: U8
-     * @param[out] output Output tensor. Data type supported: U8.
-     */
-    void configure(const ITensor *input1, const ITensor *input2, ITensor *output);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-
-private:
-    const ITensor *_input1; /**< Source tensor 1 */
-    const ITensor *_input2; /**< Source tensor 2 */
-    ITensor       *_output; /**< Destination tensor */
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NEBITWISEANDKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NEBitwiseNotKernel.h b/arm_compute/core/NEON/kernels/NEBitwiseNotKernel.h
deleted file mode 100644
index 3fb8c083a8..0000000000
--- a/arm_compute/core/NEON/kernels/NEBitwiseNotKernel.h
+++ /dev/null
@@ -1,70 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEBITWISENOTKERNEL_H
-#define ARM_COMPUTE_NEBITWISENOTKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for the kernel to perform bitwise NOT operation
- *
- * Result is computed by:
- * @f[ output(x,y) = \lnot input(x,y) @f]
- */
-class NEBitwiseNotKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEBitwiseNotKernel";
-    }
-    /** Default constructor */
-    NEBitwiseNotKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEBitwiseNotKernel(const NEBitwiseNotKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEBitwiseNotKernel &operator=(const NEBitwiseNotKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NEBitwiseNotKernel(NEBitwiseNotKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NEBitwiseNotKernel &operator=(NEBitwiseNotKernel &&) = default;
-    /** Initialise the kernel's input and output
-     *
-     * @param[in]  input  An input tensor. Data type supported: U8.
-     * @param[out] output The output tensor. Data type supported: U8.
-     */
-    void configure(const ITensor *input, ITensor *output);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-
-private:
-    const ITensor *_input;  /**< Source tensor */
-    ITensor       *_output; /**< Destination tensor */
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NEBITWISENOTKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NEBitwiseOrKernel.h b/arm_compute/core/NEON/kernels/NEBitwiseOrKernel.h
deleted file mode 100644
index 5b532510ad..0000000000
--- a/arm_compute/core/NEON/kernels/NEBitwiseOrKernel.h
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEBITWISEORKERNEL_H
-#define ARM_COMPUTE_NEBITWISEORKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for the kernel to perform bitwise inclusive OR between two tensors
- *
- * Result is computed by:
- * @f[ output(x,y) = input1(x,y) \lor input2(x,y) @f]
- */
-class NEBitwiseOrKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEBitwiseOrKernel";
-    }
-    /** Default constructor */
-    NEBitwiseOrKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEBitwiseOrKernel(const NEBitwiseOrKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEBitwiseOrKernel &operator=(const NEBitwiseOrKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NEBitwiseOrKernel(NEBitwiseOrKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NEBitwiseOrKernel &operator=(NEBitwiseOrKernel &&) = default;
-    /** Initialise the kernel's inputs and output.
-     *
-     * @param[in]  input1 An input tensor. Data type supported: U8.
-     * @param[in]  input2 An input tensor. Data type supported: U8
-     * @param[out] output Output tensor. Data type supported: U8.
-     */
-    void configure(const ITensor *input1, const ITensor *input2, ITensor *output);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-
-private:
-    const ITensor *_input1; /**< Source tensor 1 */
-    const ITensor *_input2; /**< Source tensor 2 */
-    ITensor       *_output; /**< Destination tensor */
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NEBITWISEORKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NEBitwiseXorKernel.h b/arm_compute/core/NEON/kernels/NEBitwiseXorKernel.h
deleted file mode 100644
index 0d9120501b..0000000000
--- a/arm_compute/core/NEON/kernels/NEBitwiseXorKernel.h
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEBITWISEXORKERNEL_H
-#define ARM_COMPUTE_NEBITWISEXORKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for the kernel to perform bitwise exclusive OR (XOR) between two tensors
- *
- * Result is computed by:
- * @f[ output(x,y) = input1(x,y) \oplus input2(x,y) @f]
- */
-class NEBitwiseXorKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEBitwiseXorKernel";
-    }
-    /** Default constructor */
-    NEBitwiseXorKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEBitwiseXorKernel(const NEBitwiseXorKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEBitwiseXorKernel &operator=(const NEBitwiseXorKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NEBitwiseXorKernel(NEBitwiseXorKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NEBitwiseXorKernel &operator=(NEBitwiseXorKernel &&) = default;
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  input1 An input tensor. Data type supported: U8.
-     * @param[in]  input2 An input tensor. Data type supported: U8
-     * @param[out] output The output tensor. Data type supported: U8.
-     */
-    void configure(const ITensor *input1, const ITensor *input2, ITensor *output);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-
-private:
-    const ITensor *_input1; /**< Source tensor 1 */
-    const ITensor *_input2; /**< Source tensor 2 */
-    ITensor       *_output; /**< Destination tensor */
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NEBITWISEXORKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NEBoundingBoxTransformKernel.h b/arm_compute/core/NEON/kernels/NEBoundingBoxTransformKernel.h
deleted file mode 100644
index e94f228f2a..0000000000
--- a/arm_compute/core/NEON/kernels/NEBoundingBoxTransformKernel.h
+++ /dev/null
@@ -1,95 +0,0 @@
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEBOUNDINGBOXTRANSFORMKERNEL_H
-#define ARM_COMPUTE_NEBOUNDINGBOXTRANSFORMKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for the bounding box kernel */
-class NEBoundingBoxTransformKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEBoundingBoxTransformKernel";
-    }
-
-    /** Default constructor */
-    NEBoundingBoxTransformKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEBoundingBoxTransformKernel(const NEBoundingBoxTransformKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEBoundingBoxTransformKernel &operator=(const NEBoundingBoxTransformKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NEBoundingBoxTransformKernel(NEBoundingBoxTransformKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NEBoundingBoxTransformKernel &operator=(NEBoundingBoxTransformKernel &&) = default;
-    /** Default destructor */
-    ~NEBoundingBoxTransformKernel() = default;
-
-    /** Set the input and output tensors.
-     *
-     * @param[in]  boxes      Source tensor. Bounding box proposals in pixel coordinates. Size(M, 4), format [x1, y1, x2, y2]. Data types supported: QASYMM16/F16/F32.
-     * @param[out] pred_boxes Destination tensor. Pixel coordinates of the transformed bounding boxes. Size (M, 4*K), format [x1, y1, x2, y2]. Data types supported: Same as @p input
-     * @param[in]  deltas     Bounding box translations and scales. Size (M, 4*K), format [dx, dy, dw, dh], K  is the number of classes.
-     *                        Data types supported: QASYMM8 if @p input is QASYMM16, otherwise same as @p input.
-     * @param[in]  info       Contains BoundingBox operation information described in @ref BoundingBoxTransformInfo.
-     *
-     * @note Only single image prediction is supported. Height and Width (and scale) of the image will be contained in the BoundingBoxTransformInfo struct.
-     *
-     */
-    void configure(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, const BoundingBoxTransformInfo &info);
-
-    /** Static function to check if given info will lead to a valid configuration of @ref CLBoundingBoxTransform
-     *
-     * @param[in] boxes      Source tensor info. Bounding box proposals in pixel coordinates. Size(M, 4), format [x1, y1, x2, y2]. Data types supported: QASYMM16/F16/F32.
-     * @param[in] pred_boxes Destination tensor info. Pixel coordinates of the transformed bounding boxes. Size (M, 4*K), format [x1, y1, x2, y2]. Data types supported: Same as @p input
-     * @param[in] deltas     Bounding box translations and scales. Size (M, 4*K), format [dx, dy, dw, dh], K  is the number of classes.
-     *                       Data types supported: QASYMM8 if @p input is QASYMM16, otherwise same as @p input.
-     * @param[in] info       Contains BoundingBox operation information described in @ref BoundingBoxTransformInfo.
-     *
-     * @note Only single image prediction is supported. Height and Width (and scale) of the image will be contained in the BoundingBoxTransformInfo struct.
-     *
-     * @return a Status
-     */
-    static Status validate(const ITensorInfo *boxes, const ITensorInfo *pred_boxes, const ITensorInfo *deltas, const BoundingBoxTransformInfo &info);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-
-private:
-    template <typename T>
-    void internal_run(const Window &window);
-
-    const ITensor           *_boxes;
-    ITensor                 *_pred_boxes;
-    const ITensor           *_deltas;
-    BoundingBoxTransformInfo _bbinfo;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEBOUNDINGBOXTRANSFORMKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NEBox3x3Kernel.h b/arm_compute/core/NEON/kernels/NEBox3x3Kernel.h
deleted file mode 100644
index 448e33be3c..0000000000
--- a/arm_compute/core/NEON/kernels/NEBox3x3Kernel.h
+++ /dev/null
@@ -1,71 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEBOX3x3KERNEL_H
-#define ARM_COMPUTE_NEBOX3x3KERNEL_H
-
-#include "arm_compute/core/NEON/INESimpleKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** NEON kernel to perform a Box 3x3 filter */
-class NEBox3x3Kernel : public INESimpleKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEBox3x3Kernel";
-    }
-    /** Set the source, destination and border mode of the kernel
-     *
-     * @param[in]  input            Source tensor. Data type supported: U8.
-     * @param[out] output           Destination tensor. Data type supported: U8.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const ITensor *input, ITensor *output, bool border_undefined);
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-    BorderSize border_size() const override;
-};
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-/** NEON kernel to perform a Box 3x3 filter for FP16 datatype
- */
-class NEBox3x3FP16Kernel : public NEBox3x3Kernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEBox3x3FP16Kernel";
-    }
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-};
-#else  /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-/** NEON kernel to perform a Box 3x3 filter for FP16 datatype */
-using NEBox3x3FP16Kernel = NEBox3x3Kernel;
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEBOX3x3KERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NECannyEdgeKernel.h b/arm_compute/core/NEON/kernels/NECannyEdgeKernel.h
deleted file mode 100644
index 1979c5bd2b..0000000000
--- a/arm_compute/core/NEON/kernels/NECannyEdgeKernel.h
+++ /dev/null
@@ -1,189 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NECANNYEDGEKERNEL_H
-#define ARM_COMPUTE_NECANNYEDGEKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Computes magnitude and quantised phase from inputs gradients. */
-class NEGradientKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEGradientKernel";
-    }
-    /** Default constructor */
-    NEGradientKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEGradientKernel(const NEGradientKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEGradientKernel &operator=(const NEGradientKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NEGradientKernel(NEGradientKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NEGradientKernel &operator=(NEGradientKernel &&) = default;
-    /** Default destructor */
-    virtual ~NEGradientKernel() = default;
-
-    /** Initialise the kernel's sources, destinations and border mode.
-     *
-     * @note gx, gy and magnitude must all be the same size (either 16 or 32)
-     *
-     * @param[in]  gx        Source tensor - Gx component. Data type supported: S16/S32.
-     * @param[in]  gy        Source tensor - Gy component. Data type supported: same as @p gx.
-     * @param[out] magnitude Destination tensor - Magnitude. Data type supported: U16 (if the data type of @p gx is S16) / U32 (if the data type of @p gx is S32).
-     * @param[out] phase     Destination tensor - Quantized phase. Data type supported: U8.
-     * @param[in]  norm_type Normalization type. If 1, L1-Norm otherwise L2-Norm
-     */
-    virtual void configure(const ITensor *gx, const ITensor *gy, ITensor *magnitude, ITensor *phase, int32_t norm_type);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-
-protected:
-    /** Common signature for all the specialised gradient functions
-     *
-     * @param[in]  gx_ptr        Pointer to the first input tensor.
-     * @param[in]  gy_ptr        Pointer to the second input tensor.
-     * @param[out] magnitude_ptr Pointer to the first output tensor
-     * @param[out] phase_ptr     Pointer to the second output tensor
-     */
-    using GradientFunction = void(const void *__restrict gx_ptr, const void *__restrict gy_ptr, void *__restrict magnitude_ptr, void *__restrict phase_ptr);
-
-    GradientFunction *_func;      /**< Gradient function to use for the particular tensor types passed to configure() */
-    const ITensor    *_gx;        /**< Source tensor - Gx component */
-    const ITensor    *_gy;        /**< Source tensor - Gy component */
-    ITensor          *_magnitude; /**< Destination tensor - Magnitude */
-    ITensor          *_phase;     /**< Destination tensor - Quantized phase */
-};
-
-/** NEON kernel to perform Non-Maxima suppression for Canny Edge.
- *
- * @note This kernel is meant to be used alongside CannyEdge and performs a non-maxima suppression using magnitude and phase of input
- *       to characterize points as possible edges. Thus, at the end, each point will be set to EDGE, NO_EDGE or MAYBE.
- *
- * @note Hysteresis is computed in @ref NEEdgeTraceKernel
- */
-class NEEdgeNonMaxSuppressionKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEEdgeNonMaxSuppressionKernel";
-    }
-    /** Default constructor */
-    NEEdgeNonMaxSuppressionKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEEdgeNonMaxSuppressionKernel(const NEEdgeNonMaxSuppressionKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEEdgeNonMaxSuppressionKernel &operator=(const NEEdgeNonMaxSuppressionKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NEEdgeNonMaxSuppressionKernel(NEEdgeNonMaxSuppressionKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NEEdgeNonMaxSuppressionKernel &operator=(NEEdgeNonMaxSuppressionKernel &&) = default;
-    /** Default destructor */
-    ~NEEdgeNonMaxSuppressionKernel() = default;
-
-    /** Initialise the kernel's sources, destination and border mode.
-     *
-     * @param[in]  magnitude        Source tensor - Magnitude. Data type supported: U16/U32.
-     * @param[in]  phase            Source tensor - Quantized phase. Data type supported: U8.
-     * @param[out] output           Output tensor. Data type supported: U8. It will be filled with 0 for "no edge", 127 for "maybe", 255 for "edge"
-     * @param[in]  upper_thr        Upper threshold used for the hysteresis
-     * @param[in]  lower_thr        Lower threshold used for the hysteresis
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const ITensor *magnitude, const ITensor *phase, ITensor *output, int32_t upper_thr, int32_t lower_thr, bool border_undefined);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-    BorderSize border_size() const override;
-
-private:
-    /** Common signature for all the specialised non-maxima suppression functions
-     *
-     * @param[in]  magnitude_ptr Pointer to the first input tensor.
-     * @param[in]  phase_ptr     Pointer to the second input tensor.
-     * @param[out] output_ptr    Pointer to the output tensor
-     * @param[in]  stride_mag    Stride of the magnitude tensor
-     * @param[in]  upper_thr     Upper threshold used for the hysteresis
-     * @param[in]  lower_thr     Lower threshold used for the hysteresis
-     */
-    using EdgeNonMaxSupprFunction = void(const void *__restrict magnitude_ptr, const void *__restrict phase_ptr, void *__restrict output_ptr, const uint32_t stride_mag, const int32_t upper_thr,
-                                         const int32_t lower_thr);
-
-    EdgeNonMaxSupprFunction *_func;      /**< Non-Maxima suppression function to use for the particular tensor types passed to configure() */
-    const ITensor           *_magnitude; /**< Source tensor - Magnitude */
-    const ITensor           *_phase;     /**< Source tensor - Quantized phase */
-    ITensor                 *_output;    /**< Destination tensor */
-    int32_t                  _lower_thr; /**< Lower threshold used for the hysteresis */
-    int32_t                  _upper_thr; /**< Upper threshold used for the hysteresis */
-};
-
-/** NEON kernel to perform Edge tracing */
-class NEEdgeTraceKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEEdgeTraceKernel";
-    }
-    /** Default constructor */
-    NEEdgeTraceKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEEdgeTraceKernel(const NEEdgeTraceKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEEdgeTraceKernel &operator=(const NEEdgeTraceKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NEEdgeTraceKernel(NEEdgeTraceKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NEEdgeTraceKernel &operator=(NEEdgeTraceKernel &&) = default;
-    /** Default constructor */
-    ~NEEdgeTraceKernel() = default;
-
-    /** Initialise the kernel's source, destination and border mode.
-     *
-     * @param[in,out] input  Source tensor. Data type supported: U8. Must contain 0 for "no edge", 127 for "maybe", 255 for "edge"
-     * @param[in,out] output Destination tensor. Data type supported: U8. Must be initialized to 0 (No edge).
-     */
-    void configure(ITensor *input, ITensor *output);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-    BorderSize border_size() const override;
-    bool       is_parallelisable() const override;
-
-private:
-    ITensor *_input;  /**< Source tensor */
-    ITensor *_output; /**< Destination tensor */
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NECANNYEDGEKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NEChannelCombineKernel.h b/arm_compute/core/NEON/kernels/NEChannelCombineKernel.h
deleted file mode 100644
index 8f019384d9..0000000000
--- a/arm_compute/core/NEON/kernels/NEChannelCombineKernel.h
+++ /dev/null
@@ -1,129 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NECHANNELCOMBINEKERNEL_H
-#define ARM_COMPUTE_NECHANNELCOMBINEKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-#include <array>
-#include <cstdint>
-
-namespace arm_compute
-{
-class IMultiImage;
-class ITensor;
-using IImage = ITensor;
-
-/** Interface for the channel combine kernel */
-class NEChannelCombineKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEChannelCombineKernel";
-    }
-    /** Default constructor */
-    NEChannelCombineKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEChannelCombineKernel(const NEChannelCombineKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEChannelCombineKernel &operator=(const NEChannelCombineKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NEChannelCombineKernel(NEChannelCombineKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NEChannelCombineKernel &operator=(NEChannelCombineKernel &&) = default;
-    /** Default destructor */
-    ~NEChannelCombineKernel() = default;
-
-    /** Configure function's inputs and outputs.
-     *
-     * @param[in]  plane0 The 2D plane that forms channel 0. Data type supported: U8
-     * @param[in]  plane1 The 2D plane that forms channel 1. Data type supported: U8
-     * @param[in]  plane2 The 2D plane that forms channel 2. Data type supported: U8
-     * @param[in]  plane3 The 2D plane that forms channel 3. Data type supported: U8
-     * @param[out] output The single planar output tensor. Formats supported: RGB888/RGBA8888/UYVY422/YUYV422
-     */
-    void configure(const ITensor *plane0, const ITensor *plane1, const ITensor *plane2, const ITensor *plane3, ITensor *output);
-    /** Configure function's inputs and outputs.
-     *
-     * @param[in]  plane0 The 2D plane that forms channel 0. Data type supported: U8
-     * @param[in]  plane1 The 2D plane that forms channel 1. Data type supported: U8
-     * @param[in]  plane2 The 2D plane that forms channel 2. Data type supported: U8
-     * @param[out] output The multi planar output tensor. Formats supported: NV12/NV21/IYUV/YUV444
-     */
-    void configure(const IImage *plane0, const IImage *plane1, const IImage *plane2, IMultiImage *output);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-    bool is_parallelisable() const override;
-
-private:
-    /** Combine 3 planes to form a three channel single plane tensor.
-     *
-     * @param[in] win Region on which to execute the kernel.
-     */
-    void combine_3C(const Window &win);
-    /** Combine 4 planes to form a four channel single plane tensor.
-     *
-     * @param[in] win Region on which to execute the kernel.
-     */
-    void combine_4C(const Window &win);
-    /** Combine 3 planes to form a single plane YUV tensor.
-     *
-     * @param[in] win Region on which to execute the kernel.
-     */
-    template <bool is_yuyv>
-    void combine_YUV_1p(const Window &win);
-    /** Combine 3 planes to form a two plane YUV tensor.
-     *
-     * @param[in] win Region on which to execute the kernel.
-     */
-    void combine_YUV_2p(const Window &win);
-    /** Combine 3 planes to form a three plane YUV tensor.
-     *
-     * @param[in] win Region on which to execute the kernel.
-     */
-    void combine_YUV_3p(const Window &win);
-    /** Copies a full plane to the output tensor.
-     *
-     * @param[in] win Region on which to execute the kernel.
-     */
-    void copy_plane(const Window &win, uint32_t plane_id);
-    /** Common signature for all the specialised ChannelCombine functions
-     *
-     * @param[in] window Region on which to execute the kernel.
-     */
-    using ChannelCombineFunction = void (NEChannelCombineKernel::*)(const Window &window);
-    /** ChannelCombine function to use for the particular tensor types passed to configure() */
-    ChannelCombineFunction _func;
-    std::array<const ITensor *, 4> _planes;
-    ITensor     *_output;
-    IMultiImage *_output_multi;
-    std::array<uint32_t, 3> _x_subsampling;
-    std::array<uint32_t, 3> _y_subsampling;
-    unsigned int _num_elems_processed_per_iteration;
-    bool         _is_parallelizable;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NECHANNELCOMBINEKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NEChannelExtractKernel.h b/arm_compute/core/NEON/kernels/NEChannelExtractKernel.h
deleted file mode 100644
index 8d62016fe5..0000000000
--- a/arm_compute/core/NEON/kernels/NEChannelExtractKernel.h
+++ /dev/null
@@ -1,113 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NECHANNELEXTRACTKERNEL_H
-#define ARM_COMPUTE_NECHANNELEXTRACTKERNEL_H
-
-#include "arm_compute/core/NEON/INESimpleKernel.h"
-#include "arm_compute/core/Types.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-class IMultiImage;
-class ITensor;
-using IImage = ITensor;
-
-/** Interface for the channel extract kernel */
-class NEChannelExtractKernel : public INESimpleKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEChannelExtractKernel";
-    }
-    /** Default constructor */
-    NEChannelExtractKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEChannelExtractKernel(const NEChannelExtractKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEChannelExtractKernel &operator=(const NEChannelExtractKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NEChannelExtractKernel(NEChannelExtractKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NEChannelExtractKernel &operator=(NEChannelExtractKernel &&) = default;
-    /** Default destructor */
-    ~NEChannelExtractKernel() = default;
-
-    /** Set the input and output of the kernel
-     *
-     * @param[in]  input   Source tensor. Formats supported: RGB888/RGBA8888/YUYV422/UYVY422
-     * @param[in]  channel Channel to extract.
-     * @param[out] output  Destination tensor. Format supported: U8
-     */
-    void configure(const ITensor *input, Channel channel, ITensor *output);
-    /** Set the input and output of the kernel
-     *
-     * @param[in]  input   Multi-planar source image. Formats supported: NV12/NV21/IYUV/YUV444
-     * @param[in]  channel Channel to extract.
-     * @param[out] output  Single-planar destination image. Format supported: U8
-     */
-    void configure(const IMultiImage *input, Channel channel, IImage *output);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-
-private:
-    /** Extract one channel from a two channel planar tensor.
-     *
-     * @param[in] win Region on which to execute the kernel.
-     */
-    void extract_1C_from_2C_img(const Window &win);
-    /** Extract one channel from a three channel planar tensor.
-     *
-     * @param[in] win Region on which to execute the kernel.
-     */
-    void extract_1C_from_3C_img(const Window &win);
-    /** Extract one channel from a four channel planar tensor.
-     *
-     * @param[in] win Region on which to execute the kernel.
-     */
-    void extract_1C_from_4C_img(const Window &win);
-    /** Extract U/V channel from a single planar YUYV/UYVY tensor.
-     *
-     * @param[in] win Region on which to execute the kernel.
-     */
-    void extract_YUYV_uv(const Window &win);
-    /** Copies a full plane to the output tensor.
-     *
-     * @param[in] win Region on which to execute the kernel.
-     */
-    void copy_plane(const Window &win);
-    /** Common signature for all the specialised ChannelExtract functions
-     *
-     * @param[in] window Region on which to execute the kernel.
-     */
-    using ChannelExtractFunction = void (NEChannelExtractKernel::*)(const Window &window);
-    /** ChannelExtract function to use for the particular tensor types passed to configure() */
-    ChannelExtractFunction _func;
-    unsigned int           _lut_index;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NECHANNELEXTRACTKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NEChannelShuffleLayerKernel.h b/arm_compute/core/NEON/kernels/NEChannelShuffleLayerKernel.h
deleted file mode 100644
index 71659c4fcb..0000000000
--- a/arm_compute/core/NEON/kernels/NEChannelShuffleLayerKernel.h
+++ /dev/null
@@ -1,80 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NECHANNELSHUFFLELAYERKERNEL_H
-#define ARM_COMPUTE_NECHANNELSHUFFLELAYERKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-// Forward declarations
-class ITensor;
-
-/** Interface for the channel shuffle kernel */
-class NEChannelShuffleLayerKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEChannelShuffleLayerKernel";
-    }
-    /** Default constructor */
-    NEChannelShuffleLayerKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEChannelShuffleLayerKernel(const NEChannelShuffleLayerKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEChannelShuffleLayerKernel &operator=(const NEChannelShuffleLayerKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NEChannelShuffleLayerKernel(NEChannelShuffleLayerKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NEChannelShuffleLayerKernel &operator=(NEChannelShuffleLayerKernel &&) = default;
-    /** Default destructor */
-    ~NEChannelShuffleLayerKernel() = default;
-    /** Configure function's inputs and outputs.
-     *
-     * @param[in]  input      Input tensor. Data types supported: All
-     * @param[out] output     Output tensor. Data type supported: Same as @p input
-     * @param[in]  num_groups Number of groups. Must be greater than 1 and the number of channels of the tensors must be a multiple of the number of groups.
-     */
-    void configure(const ITensor *input, ITensor *output, unsigned int num_groups);
-    /** Static function to check if given info will lead to a valid configuration of @ref NEChannelShuffleLayerKernel
-     *
-     * @param[in] input      Input tensor info. Data types supported: All
-     * @param[in] output     Output tensor info. Data type supported: Same as @p input
-     * @param[in] num_groups Number of groups. Must be greater than 1 and the number of channels of the tensors must be a multiple of the number of groups.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int num_groups);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-
-private:
-    const ITensor *_input;
-    ITensor       *_output;
-    unsigned int   _num_groups;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NECHANNELSHUFFLELAYERKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NECol2ImKernel.h b/arm_compute/core/NEON/kernels/NECol2ImKernel.h
deleted file mode 100644
index 9aa1062622..0000000000
--- a/arm_compute/core/NEON/kernels/NECol2ImKernel.h
+++ /dev/null
@@ -1,115 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NECOL2IMKERNEL_H
-#define ARM_COMPUTE_NECOL2IMKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-#include "arm_compute/core/Size2D.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** NEON kernel to perform col2im reshaping.
- *
- * Rearranges each matrix column into image blocks. It's the inverse operation of @ref NEIm2ColKernel.
- *
- * For example, a vector of 9 elements can be reshaped to a block(image) of 3x3:
- *
- * @f[
- * \left( \begin{array}{ccccccccc}
- * a0 & a1 & a2 & a3 & a4 & a5 & a6 & a7 & a8 \\
- * \end{array} \right)
- * \rightarrow
- * \left( \begin{array}{ccc}
- * a0 & a1 & a2 \\
- * a3 & a4 & a5 \\
- * a6 & a7 & a8 \\
- * \end{array} \right)
- * @f]
- */
-class NECol2ImKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NECol2ImKernel";
-    }
-    /** Default constructor */
-    NECol2ImKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NECol2ImKernel(const NECol2ImKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NECol2ImKernel &operator=(const NECol2ImKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NECol2ImKernel(NECol2ImKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NECol2ImKernel &operator=(NECol2ImKernel &&) = default;
-    /** Default destructor */
-    ~NECol2ImKernel() = default;
-
-    /** Set the input and output of the kernel.
-     *
-     * @param[in]  input          The input tensor to convert. Data types supported: All
-     * @param[out] output         The output tensor. 3 lower dimensions represent a single output [width, height, OFM],
-     *                            while the rest represent batch of outputs. Data types supported: Same as @p input
-     * @param[in]  convolved_dims Output convolved dimensions.
-     */
-    void configure(const ITensor *input, ITensor *output, const Size2D &convolved_dims);
-    /** Static function to check if given info will lead to a valid configuration of @ref NECol2ImKernel
-     *
-     * @param[in] input          The input tensor to convert. Data types supported: All
-     * @param[in] output         The output tensor. 3 lower dimensions represent a single output [width, height, OFM],
-     *                           while the rest represent batch of outputs. Data types supported: Same as @p input
-     * @param[in] convolved_dims Output convolved dimensions.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &convolved_dims);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-
-private:
-    /** Template function to run the col2im
-     *
-     * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
-     */
-    template <typename T>
-    void run_col2im(const Window &window);
-
-    /** Common signature for all the specialised col2im functions
-     *
-     * @param[in] window Region on which to execute the kernel.
-     */
-    using Col2ImFunctionPtr = void (NECol2ImKernel::*)(const Window &window);
-
-    Col2ImFunctionPtr _func;
-    const ITensor    *_input;
-    ITensor          *_output;
-    Size2D            _convolved_dims;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NECOL2IMKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NEColorConvertKernel.h b/arm_compute/core/NEON/kernels/NEColorConvertKernel.h
deleted file mode 100644
index 3059288ab4..0000000000
--- a/arm_compute/core/NEON/kernels/NEColorConvertKernel.h
+++ /dev/null
@@ -1,93 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_COLORCONVERTKERNEL_H
-#define ARM_COMPUTE_COLORCONVERTKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class IMultiImage;
-class ITensor;
-using IImage = ITensor;
-
-/** Interface for the color convert kernel */
-class NEColorConvertKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEColorConvertKernel";
-    }
-    /** Default constructor */
-    NEColorConvertKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEColorConvertKernel(const NEColorConvertKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEColorConvertKernel &operator=(const NEColorConvertKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NEColorConvertKernel(NEColorConvertKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NEColorConvertKernel &operator=(NEColorConvertKernel &&) = default;
-    /** Default destructor */
-    ~NEColorConvertKernel() = default;
-
-    /** Set the input and output of the kernel
-     *
-     * @param[in]  input  Source tensor. Formats supported: RGBA8888/UYVY422/YUYV422/RGB888
-     * @param[out] output Destination tensor. Formats supported: RGB888 (if the formats of @p input are RGBA8888/UYVY422/YUYV422),
-     *                                                          RGBA8888 (if the formats of @p input are UYVY422/YUYV422/RGB888),
-     *                                                          U8 (if the format of @p input is RGB888)
-     */
-    void configure(const ITensor *input, ITensor *output);
-    /** Set the input and output of the kernel
-     *
-     * @param[in]  input  Multi-planar source image. Formats supported: NV12/NV21/IYUV
-     * @param[out] output Single-planar destination image. Formats supported: RGB888/RGBA8888
-     */
-    void configure(const IMultiImage *input, IImage *output);
-    /** Set the input and output of the kernel
-     *
-     * @param[in]  input  Single-planar source image. Formats supported: RGB888/RGBA8888/UYVY422/YUYV422
-     * @param[out] output Multi-planar destination image. Formats supported: NV12/IYUV/YUV444 (if the formats of @p input are RGB888/RGBA8888)
-     */
-    void configure(const IImage *input, IMultiImage *output);
-    /** Set the input and output of the kernel
-     *
-     * @param[in]  input  Multi-planar source image. Formats supported: NV12/NV21/IYUV
-     * @param[out] output Multi-planar destination image. Formats supported: YUV444/IYUV (if the formats of @p input are NV12/NV21)/NV12 (if the format of @p input is IYUV)
-     */
-    void configure(const IMultiImage *input, IMultiImage *output);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-
-private:
-    using ColorConvertFunction = void(const void *__restrict input_ptr, void *__restrict output_ptr, const Window &win);
-    const void           *_input;
-    void                 *_output;
-    ColorConvertFunction *_func;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_COLORCONVERTKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NEConvertFullyConnectedWeightsKernel.h b/arm_compute/core/NEON/kernels/NEConvertFullyConnectedWeightsKernel.h
deleted file mode 100644
index d45191949a..0000000000
--- a/arm_compute/core/NEON/kernels/NEConvertFullyConnectedWeightsKernel.h
+++ /dev/null
@@ -1,95 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NECONVERTFULLYCONNECTEDWEIGHTSKERNEL_H
-#define ARM_COMPUTE_NECONVERTFULLYCONNECTEDWEIGHTSKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-// Forward declarations
-class ITensor;
-
-/** Interface to convert the 2D Fully Connected weights from NCHW to NHWC or vice versa.
- *
- * @note This function can be applied to the 2D weights used by a Fully Connected layer if:
- *       - It follows a Convolution layer
- *       - The data layout used by the network does not match the one the model has been trained in.
- *
- * @note This function assumes the weights are already reshaped (transposed)
- */
-class NEConvertFullyConnectedWeightsKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEConvertFullyConnectedWeightsKernel";
-    }
-    /** Default constructor */
-    NEConvertFullyConnectedWeightsKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEConvertFullyConnectedWeightsKernel(const NEConvertFullyConnectedWeightsKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEConvertFullyConnectedWeightsKernel &operator=(const NEConvertFullyConnectedWeightsKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NEConvertFullyConnectedWeightsKernel(NEConvertFullyConnectedWeightsKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NEConvertFullyConnectedWeightsKernel &operator=(NEConvertFullyConnectedWeightsKernel &&) = default;
-    /** Default destructor */
-    ~NEConvertFullyConnectedWeightsKernel() = default;
-    /** Set the input and output tensor.
-     *
-     * @param[in]  input                Source weights tensor to convert. Must be 2 dimensional. Data types supported: All.
-     * @param[out] output               The converted weights tensor. Shape and Data Type: Same as @p input.
-     * @param[in]  original_input_shape Shape of the original input tensor (the one entering fully connected layer).
-     * @param[in]  data_layout          The data layout the weights have been trained in.
-     */
-    void configure(const ITensor *input, ITensor *output, const TensorShape &original_input_shape, DataLayout data_layout);
-    /** Static function to check if given info will lead to a valid configuration of @ref NEConvertFullyConnectedWeightsKernel
-     *
-     * @param[in] input                Source weights tensor info to convert. Must be 2 dimensional. Data types supported: All.
-     * @param[in] output               The converted weights tensor info. Shape and Data Type: Same as @p input.
-     * @param[in] original_input_shape Shape of the original input tensor (the one entering fully connected layer).
-     * @param[in] data_layout          The data layout the weights have been trained in.
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const TensorShape &original_input_shape, DataLayout data_layout);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-
-private:
-    /** Template function to run the weights conversion
-     *
-     * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
-     */
-    template <typename T>
-    void run_convert_fc_weights(const Window &window);
-
-    const ITensor *_input;
-    ITensor       *_output;
-    unsigned int   _factor1; /*  equals to the number of elements per original input plane if @p data_layout == NCHW; its number of channels otherwise */
-    unsigned int   _factor2; /*  equals to the number of elements per original input plane if @p data_layout == NHWC; its number of channels otherwise */
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NECONVERTFULLYCONNECTEDWEIGHTSKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h b/arm_compute/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h
deleted file mode 100644
index 6ec2793484..0000000000
--- a/arm_compute/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h
+++ /dev/null
@@ -1,76 +0,0 @@
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NECONVERTQUANTIZEDSIGNEDNESSKERNEL_H
-#define ARM_COMPUTE_NECONVERTQUANTIZEDSIGNEDNESSKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-// Forward declarations
-class ITensor;
-
-/** NEON kernel to convert asymmetric signed to asymmetric unsigned and vice-versa */
-class NEConvertQuantizedSignednessKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEConvertQuantizedSignednessKernel";
-    }
-    /** Default constructor */
-    NEConvertQuantizedSignednessKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers). */
-    NEConvertQuantizedSignednessKernel(const NEConvertQuantizedSignednessKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers). */
-    NEConvertQuantizedSignednessKernel &operator=(const NEConvertQuantizedSignednessKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NEConvertQuantizedSignednessKernel(NEConvertQuantizedSignednessKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NEConvertQuantizedSignednessKernel &operator=(NEConvertQuantizedSignednessKernel &&) = default;
-    /** Initialize the kernel's input, output.
-     *
-     * @param[in]  input  Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED.
-     * @param[out] output Destination tensor. Data types supported: opposite of @p input.
-     */
-    void configure(const ITensor *input, ITensor *output);
-    /** Static function to check if given info will lead to a valid configuration of @ref NEConvertQuantizedSignednessKernel
-     *
-     * @param[in] input  Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED.
-     * @param[in] output Destination tensor. Data types supported: opposite of @p input.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-
-private:
-    const ITensor *_input;
-    ITensor       *_output;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NECONVERTQUANTIZEDSIGNEDNESSKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NEConvolutionKernel.h b/arm_compute/core/NEON/kernels/NEConvolutionKernel.h
deleted file mode 100644
index 2b271de56b..0000000000
--- a/arm_compute/core/NEON/kernels/NEConvolutionKernel.h
+++ /dev/null
@@ -1,267 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NECONVOLUTIONKERNEL_H
-#define ARM_COMPUTE_NECONVOLUTIONKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-#include "arm_compute/core/NEON/INESimpleKernel.h"
-
-#include <array>
-#include <cstdint>
-#include <vector>
-
-namespace arm_compute
-{
-class ITensor;
-
-/****************************************************************************************\
- *                                    Square Convolution                                *
-\****************************************************************************************/
-
-/** Interface for the kernel to run an arbitrary size convolution on a tensor. (Currently supports 3x3, 5x5, 7x7 and 9x9).
- * The client can supply a convolution matrix \f$ C_{m,n} \f$.
- * @f{eqnarray}{
- *  k_0 &=& \frac{m}{2}  \\
- *  l_0 &=& \frac{n}{2}  \\
- *  sum &=& \sum_{k=0,l=0}^{k=m-1,l=n-1} input(x+k-k_0, y+l-l_0) C_{k,l}
- *  @f}
- *
- * @note The above equation for this function is similar to the default OpenCV Filter2D function,
- *       which actually computes a correlation and not a convolution.
- *       In case of a real convolution the convolution matrix should be flipped both horizontally and vertically.
- */
-template <unsigned int matrix_size>
-class NEConvolutionKernel : public INESimpleKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEConvolutionKernel";
-    }
-    /** Default constructor */
-    NEConvolutionKernel();
-    /** Initialise the kernel's input, output and border mode.
-     *
-     * @param[in]  input            Source tensor. Data type supported: U8.
-     * @param[out] output           Destination tensor. Data types supported: U8, S16.
-     * @param[in]  conv             Convolution matrix to apply to the input tensor.
-     * @param[in]  scale            Scale of the convolution matrix. If 0 is passed, it will be set to the sum of the coefficients of the convolution or 1 if they add up to 0.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const ITensor *input, ITensor *output, const int16_t *conv, uint32_t scale, bool border_undefined);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-    BorderSize border_size() const override;
-
-private:
-    template <typename OutputType>
-    void convolution(const Window &win);
-
-protected:
-    uint32_t _scale;                                             /**< scale of the convolution */
-    std::array<int16_t, matrix_size *matrix_size> _convolution;  /**< convolution matrix */
-};
-
-/** Interface for the kernel which applied a 3x3 convolution to a tensor.*/
-using NEConvolution3x3Kernel = NEConvolutionKernel<3>;
-/** Interface for the kernel which applied a 5x5 convolution to a tensor.*/
-using NEConvolution5x5Kernel = NEConvolutionKernel<5>;
-/** Interface for the kernel which applied a 7x7 convolution to a tensor.*/
-using NEConvolution7x7Kernel = NEConvolutionKernel<7>;
-///** Interface for the kernel which applied a 9x9 convolution to a tensor.*/
-using NEConvolution9x9Kernel = NEConvolutionKernel<9>;
-
-/****************************************************************************************\
- *                              Separable Square Convolution                            *
-\****************************************************************************************/
-
-/** Kernel for the Horizontal pass of a Separable Convolution */
-template <unsigned int matrix_size>
-class NESeparableConvolutionHorKernel : public INESimpleKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NESeparableConvolutionHorKernel";
-    }
-    /** Default constructor */
-    NESeparableConvolutionHorKernel();
-
-    /** Initialise the kernel's input, output and border mode.
-     *
-     * @param[in]  input            Source tensor. Data type supported: U8.
-     * @param[out] output           Destination tensor. Data types supported: U16, S16, S32.
-     * @param[in]  conv_row         Convolution matrix to apply to the input tensor.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const ITensor *input, ITensor *output, const int16_t *conv_row, bool border_undefined);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-    BorderSize border_size() const override;
-
-private:
-    /** Apply the object's convolution to the given window of the input tensor..
-     *
-     * @param[in] window Window to apply the convolution on.
-     */
-    template <typename OutputType>
-    void convolve(const Window &window);
-
-    std::array<int16_t, matrix_size> _conv_row; /**< Convolution coefficients */
-    BorderSize _border_size;                    /**< Border size */
-};
-
-/** Interface for the kernel which applied a 5x1 horizontal convolution to a tensor.*/
-using NESeparableConvolution5x5HorKernel = NESeparableConvolutionHorKernel<5>;
-/** Interface for the kernel which applied a 7x1 horizontal convolution to a tensor.*/
-using NESeparableConvolution7x7HorKernel = NESeparableConvolutionHorKernel<7>;
-/** Interface for the kernel which applied a 9x1 horizontal convolution to a tensor.*/
-using NESeparableConvolution9x9HorKernel = NESeparableConvolutionHorKernel<9>;
-
-/** Kernel for the Vertical pass of a Separable Convolution */
-template <unsigned int matrix_size>
-class NESeparableConvolutionVertKernel : public INESimpleKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NESeparableConvolutionVertKernel";
-    }
-    /** Default constructor */
-    NESeparableConvolutionVertKernel();
-
-    /** Initialise the kernel's input, output and border mode.
-     *
-     * @param[in]  input            Source tensor. Data type supported: U16, S16, S32.
-     * @param[out] output           Destination tensor, Data types supported: U8, S16.
-     * @param[in]  conv_col         Convolution matrix to apply to the input tensor.
-     * @param[in]  scale            Scale of the convolution matrix
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const ITensor *input, ITensor *output, const int16_t *conv_col, uint32_t scale, bool border_undefined);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-    BorderSize border_size() const override;
-
-private:
-    /** Apply the object's convolution to the given window of the input tensor.
-     *  This function is used if the intermediate values have been stored as U16.
-     *
-     * @param[in] win Window to apply the convolution on.
-     */
-    template <typename OutputType>
-    void convolution_u16(const Window &win);
-    /** Apply the object's convolution to the given window of the input tensor.
-     *  This function is used if the intermediate values have been stored as S16.
-     *
-     * @param[in] win Window to apply the convolution on.
-     */
-    template <typename OutputType>
-    void convolution_s16(const Window &win);
-    /** Apply the object's convolution to the given window of the input tensor.
-     *  This function is used if the intermediate values have been stored as S32.
-     *
-     * @param[in] win Window to apply the convolution on.
-     */
-    template <typename OutputType>
-    void convolution_s32(const Window &win);
-
-    std::array<int16_t, matrix_size> _conv_col; /**< Convolution coefficients */
-    uint32_t _scale;                            /**< Convolution's scale */
-};
-
-/** Interface for the kernel which applied a 1x5 vertical convolution to a tensor.*/
-using NESeparableConvolution5x5VertKernel = NESeparableConvolutionVertKernel<5>;
-/** Interface for the kernel which applied a 1x7 vertical convolution to a tensor.*/
-using NESeparableConvolution7x7VertKernel = NESeparableConvolutionVertKernel<7>;
-/** Interface for the kernel which applied a 1x9 vertical convolution to a tensor.*/
-using NESeparableConvolution9x9VertKernel = NESeparableConvolutionVertKernel<9>;
-
-/****************************************************************************************\
- *                                 Rectangle Convolution                                *
-\****************************************************************************************/
-
-/** Kernel for the running convolution on a rectangle matrix.
- *
- * @note Supports combinations of 3,5,7 and 9.
- */
-class NEConvolutionRectangleKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEConvolutionRectangleKernel";
-    }
-    /** Default constructor */
-    NEConvolutionRectangleKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEConvolutionRectangleKernel(NEConvolutionRectangleKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEConvolutionRectangleKernel &operator=(NEConvolutionRectangleKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NEConvolutionRectangleKernel(NEConvolutionRectangleKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NEConvolutionRectangleKernel &operator=(NEConvolutionRectangleKernel &&) = default;
-    /** Initialise the kernel's input, output and border mode.
-     *
-     * @param[in]  input            Source tensor. Data type supported: U8.
-     * @param[out] output           Destination tensor, Data types supported: U8, S16.
-     * @param[in]  conv             Convolution matrix to apply to the input tensor.
-     * @param[in]  width            Width of convolution matrix (Number of columns)
-     * @param[in]  height           Height of convolution matrix (Number of rows)
-     * @param[in]  scale            Scale of the convolution matrix. If 0 is passed, it will be set to the sum of the coefficients of the convolution or 1 if they add up to 0.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const ITensor *input, ITensor *output, const int16_t *conv, uint32_t width, uint32_t height, uint32_t scale, bool border_undefined);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-    BorderSize border_size() const override;
-
-private:
-    unsigned int get_index(uint32_t val);
-    /** Apply the object's convolution to the given window of the input tensor.
-     *
-     * @param[in] win Window to apply the convolution on.
-     */
-    template <typename OutputType, unsigned int rows, unsigned int cols>
-    void convolution(const Window &win);
-
-protected:
-    const ITensor            *_input;       /**< Input tensor */
-    ITensor                  *_output;      /**< Output tensor */
-    uint32_t                  _scale;       /**< Scale of the convolution */
-    std::vector<int16_t>      _convolution; /**< Convolution matrix */
-    BorderSize                _border_size; /**< Calculated border width */
-    uint32_t                  _func_idx;    /**< Index used to specify convolution function to be used */
-    const static unsigned int _nr_supported_sizes
-    {
-        4
-    }; /**< Number of supported permutations */
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NECONVOLUTIONKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NECopyKernel.h b/arm_compute/core/NEON/kernels/NECopyKernel.h
deleted file mode 100644
index d2dbbaef98..0000000000
--- a/arm_compute/core/NEON/kernels/NECopyKernel.h
+++ /dev/null
@@ -1,78 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NECOPYKERNEL_H
-#define ARM_COMPUTE_NECOPYKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** NEON kernel to perform a copy between two tensors */
-class NECopyKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NECopyKernel";
-    }
-    /** Default constructor */
-    NECopyKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers). */
-    NECopyKernel(const NECopyKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers). */
-    NECopyKernel &operator=(const NECopyKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NECopyKernel(NECopyKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NECopyKernel &operator=(NECopyKernel &&) = default;
-    /** Initialize the kernel's input, output.
-     *
-     * @param[in]  input   Source tensor. Data types supported: All
-     * @param[out] output  Destination tensor. Data types supported: same as @p input.
-     * @param[in]  padding (Optional) Padding to be applied to the input tensor
-     */
-    void configure(const ITensor *input, ITensor *output, const PaddingList &padding = PaddingList());
-    /** Static function to check if given info will lead to a valid configuration of @ref NECopyKernel
-     *
-     * @param[in] input   Source tensor. Data types supported: All
-     * @param[in] output  Destination tensor. Data types supported: same as @p input.
-     * @param[in] padding (Optional) Padding to be applied to the input tensor
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding = PaddingList());
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-
-private:
-    const ITensor *_input;
-    ITensor       *_output;
-    PaddingList    _padding;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NECOPYKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NECropKernel.h b/arm_compute/core/NEON/kernels/NECropKernel.h
deleted file mode 100644
index ba58ab1e58..0000000000
--- a/arm_compute/core/NEON/kernels/NECropKernel.h
+++ /dev/null
@@ -1,114 +0,0 @@
-/*
- * Copyright (c) 2019-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEON_CROP_KERNEL_H
-#define ARM_COMPUTE_NEON_CROP_KERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-// Forward declarations
-class ITensor;
-
-/** Interface for the kernel to perform tensor cropping */
-class NECropKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NECropKernel";
-    }
-    /** Default constructor */
-    NECropKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NECropKernel(const NECropKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NECropKernel &operator=(const NECropKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NECropKernel(NECropKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NECropKernel &operator=(NECropKernel &&) = default;
-    /** Default destructor */
-    ~NECropKernel() = default;
-    /** Configure kernel
-     *
-     * @note Supported tensor rank: up to 4
-     * @note Padding not supported.
-     *
-     * @param[in]  input               Source tensor. Data type supported: U8/U16/S16/U32/S32/F16/F32. Data layouts supported: NHWC.
-     * @param[in]  crop_boxes          Tensor containing all possible boxes used to crop the image, each represented by 4 normalized values.
-     *                                 Data type supported: F32
-     * @param[in]  box_ind             One dimensional tensor mapping the @p crop_box_ind to the index of the 3D image in @p input.
-     *                                 Data type supported: F32
-     * @param[out] output              Destination tensor. Data type supported: F32
-     * @param[in]  crop_box_ind        Index of the crop box to be used from @p crop_boxes. Default is 0.
-     * @param[in]  extrapolation_value Value to be used for values outside of the image. Default is 0.
-     */
-    void configure(const ITensor *input, const ITensor *crop_boxes, const ITensor *box_ind, ITensor *output, uint32_t crop_box_ind = 0, float extrapolation_value = 0);
-
-    /** Static function to check if given info will lead to a valid configuration of @ref CLStridedSliceKernel
-     *
-     * @note Supported tensor rank: up to 4
-     * @note Padding not supported.
-     *
-     * @param[in] input               Source tensor info. Data type supported: U8/U16/S16/U32/S32/F16/F32. Data layouts supported: NHWC.
-     * @param[in] crop_boxes          Tensor info for tensor containing all possible boxes used to crop the image. Data type supported: F32
-     * @param[in] box_ind             Tensor info for the one dimensional tensor mapping the @p crop_box_ind to the index of the 3D image
-     *                                in @p input. Data type supported: F32
-     * @param[in] output              Destination tensor. Data type supported: F32
-     * @param[in] crop_box_ind        Index of the crop box to be used from @p crop_boxes. Default is 0.
-     * @param[in] extrapolation_value Value to be used for values outside of the image. Default is 0.
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *crop_boxes, const ITensorInfo *box_ind, const ITensorInfo *output, uint32_t crop_box_ind = 0, float extrapolation_value = 0);
-
-    /** Configure output tensor's shape as this can only be determined at runtime. */
-    void configure_output_shape();
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-
-    /** Function to use for in bounds crop for the particular tensor types passed to configure() */
-    using InBoundsCropFunction = void(const ITensor *, const ITensor *, float *, Coordinates, int32_t, int32_t, int32_t, bool, bool);
-
-private:
-    const ITensor *_input;
-    const ITensor *_crop_boxes;
-    const ITensor *_box_ind;
-    ITensor       *_output;
-
-    Coordinates _start;
-    Coordinates _end;
-    uint32_t    _crop_box_ind;
-    float       _extrapolation_value;
-    /** The number of rows out of bounds at the start and end of output. */
-    std::array<uint32_t, 2> _rows_out_of_bounds;
-    /** The number of columns out of bounds at the start and end of output. */
-    std::array<uint32_t, 2> _cols_out_of_bounds;
-
-    NECropKernel::InBoundsCropFunction *_in_bounds_crop_function;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEON_CROP_KERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NECumulativeDistributionKernel.h b/arm_compute/core/NEON/kernels/NECumulativeDistributionKernel.h
deleted file mode 100644
index 52442c3920..0000000000
--- a/arm_compute/core/NEON/kernels/NECumulativeDistributionKernel.h
+++ /dev/null
@@ -1,83 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NECUMULATIVEDISTRIBUTIONKERNEL_H
-#define ARM_COMPUTE_NECUMULATIVEDISTRIBUTIONKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-class IDistribution1D;
-class ILut;
-class ITensor;
-using IImage = ITensor;
-
-/** Interface for the cumulative distribution (cummulative summmation) calculation kernel.
- *
- * This kernel calculates the cumulative sum of a given distribution (meaning that each output element
- * is the sum of all its previous elements including itself) and creates a lookup table with the normalized
- * pixel intensities which is used for improve the constrast of the image.
- */
-class NECumulativeDistributionKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NECumulativeDistributionKernel";
-    }
-    /** Default constructor */
-    NECumulativeDistributionKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NECumulativeDistributionKernel(const NECumulativeDistributionKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NECumulativeDistributionKernel &operator=(const NECumulativeDistributionKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NECumulativeDistributionKernel(NECumulativeDistributionKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NECumulativeDistributionKernel &operator=(NECumulativeDistributionKernel &&) = default;
-    /** Set the input and output distribution.
-     *
-     * @param[in]  input          Input image. Data type supported: U8
-     * @param[in]  distribution   Unnormalized 256-bin distribution of the input image.
-     * @param[out] cumulative_sum Cummulative distribution (Summed histogram). Should be same size as @p distribution.
-     * @param[out] output         Equalization lookup table. Should consist of 256 entries of U8 elements.
-     */
-    void configure(const IImage *input, const IDistribution1D *distribution, IDistribution1D *cumulative_sum, ILut *output);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-    bool is_parallelisable() const override;
-
-private:
-    const IImage          *_input;          /**< Input image. */
-    const IDistribution1D *_distribution;   /**< Input histogram of the input image. */
-    IDistribution1D       *_cumulative_sum; /**< The cummulative distribution. */
-    ILut                  *_output;         /**< Output with the equalization lookup table. */
-private:
-    static const uint32_t _histogram_size = 256; /**< Default histogram size of 256. */
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NECUMULATIVEDISTRIBUTIONKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NEDepthConcatenateLayerKernel.h b/arm_compute/core/NEON/kernels/NEDepthConcatenateLayerKernel.h
deleted file mode 100644
index 6690ac2236..0000000000
--- a/arm_compute/core/NEON/kernels/NEDepthConcatenateLayerKernel.h
+++ /dev/null
@@ -1,91 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef ARM_COMPUTE_NEDEPTHCONCATENATEKERNEL_H
-#define ARM_COMPUTE_NEDEPTHCONCATENATEKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-// Forward declarations
-class ITensor;
-
-/** Interface for the depth concatenate kernel.
- *  The input tensor will be concatenated into the output tensor.
- */
-class NEDepthConcatenateLayerKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEDepthConcatenateLayerKernel";
-    }
-    /** Default constructor */
-    NEDepthConcatenateLayerKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEDepthConcatenateLayerKernel(const NEDepthConcatenateLayerKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEDepthConcatenateLayerKernel &operator=(const NEDepthConcatenateLayerKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NEDepthConcatenateLayerKernel(NEDepthConcatenateLayerKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NEDepthConcatenateLayerKernel &operator=(NEDepthConcatenateLayerKernel &&) = default;
-    /** Default destructor */
-    ~NEDepthConcatenateLayerKernel() = default;
-    /** Initialise the kernel's inputs and output
-     *
-     * @param[in]     input        Input tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
-     * @param[in]     depth_offset The offset on the Z axis.
-     * @param[in,out] output       Output tensor. Data types supported: Same as @p input.
-     *
-     * @note: The output tensor's low two dimensions can't be smaller than the input one's.
-     * @note: The gaps between the two lowest dimensions of input and output need to be divisible by 2.
-     *
-     */
-    void configure(const ITensor *input, unsigned int depth_offset, ITensor *output);
-    /**  Static function to check if given info will lead to a valid configuration of @ref NEDepthConcatenateLayerKernel
-     *
-     * @param[in] input        Input tensor info. Data types supported:  QASYMM8/QASYMM8_SIGNED/F16/F32.
-     * @param[in] depth_offset The offset on the Z axis.
-     * @param[in] output       Output tensor info. Data types supported: Same as @p input.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, unsigned int depth_offset, const ITensorInfo *output);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-
-private:
-    using DepthConcatFunction = void(const ITensor *in, ITensor *out, unsigned int depth_offset, const Window &window);
-
-private:
-    DepthConcatFunction *_func;
-    const ITensor       *_input;
-    ITensor             *_output;
-    unsigned int         _depth_offset;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NEDEPTHCONCATENATEKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NEDepthConvertLayerKernel.h b/arm_compute/core/NEON/kernels/NEDepthConvertLayerKernel.h
deleted file mode 100644
index 5cda3203ed..0000000000
--- a/arm_compute/core/NEON/kernels/NEDepthConvertLayerKernel.h
+++ /dev/null
@@ -1,94 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_DEPTHCONVERTKERNEL_H
-#define ARM_COMPUTE_DEPTHCONVERTKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Depth conversion kernel
- *  This function ignores the scale and zeroPoint of quanized tensors, i.e. QASYMM8 input is treated as uint8 values.
- */
-class NEDepthConvertLayerKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEDepthConvertLayerKernel";
-    }
-    /** Default constructor*/
-    NEDepthConvertLayerKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEDepthConvertLayerKernel(const NEDepthConvertLayerKernel &) = delete;
-    /** Default move constructor */
-    NEDepthConvertLayerKernel(NEDepthConvertLayerKernel &&) = default;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEDepthConvertLayerKernel &operator=(const NEDepthConvertLayerKernel &) = delete;
-    /** Default move assignment operator */
-    NEDepthConvertLayerKernel &operator=(NEDepthConvertLayerKernel &&) = default;
-    /** Set the input and output of the kernel
-     *
-     * Valid conversions Input -> Output :
-     *
-     *   - QASYMM8_SIGNED -> S16, S32, F32, F16
-     *   - QASYMM8        -> U16, S16, S32, F32, F16
-     *   - U8             -> U16, S16, S32, F32, F16
-     *   - U16            -> U8, U32
-     *   - S16            -> QASYMM8_SIGNED, U8, S32
-     *   - BFLOAT16       -> F32
-     *   - F16            -> QASYMM8_SIGNED, QASYMM8, F32, S32, U8
-     *   - S32            -> QASYMM8_SIGNED, QASYMM8, F16, F32, U8
-     *   - F32            -> QASYMM8_SIGNED, QASYMM8, BFLOAT16, F16, S32, U8
-     *
-     * @param[in]  input  The input tensor to convert. Data types supported: QASYMM8_SIGNED/QASYMM8/U8/U16/S16/BFLOAT16/F16/F32.
-     * @param[out] output The output tensor. Data types supported: QASYMM8_SIGNED/QASYMM8/U8/U16/S16/U32/S32/BFLOAT16/F16/F32.
-     * @param[in]  policy Conversion policy.
-     * @param[in]  shift  (Optional) Value for down/up conversions. Must be 0 <= shift < 8.
-     */
-    void configure(const ITensor *input, ITensor *output, ConvertPolicy policy, uint32_t shift = 0);
-    /** Static function to check if given info will lead to a valid configuration of @ref NEDepthConvertLayerKernel
-     *
-     * @param[in] input  Source tensor info. Data types supported: QASYMM8_SIGNED/QASYMM8/U8/U16/S16/BFLOAT16/F16/F32.
-     * @param[in] output Destination tensor info. Data type supported: QASYMM8_SIGNED/QASYMM8/U8/U16/S16/U32/S32/BFLOAT16/F16/F32.
-     * @param[in] policy Conversion policy
-     * @param[in] shift  (Optional) Value for down/up conversions. Must be 0 <= shift < 8.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy, uint32_t shift = 0);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-
-private:
-    const ITensor *_input;
-    ITensor       *_output;
-    ConvertPolicy  _policy;
-    uint32_t       _shift;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEDEPTHCONVERTKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NEDepthToSpaceLayerKernel.h b/arm_compute/core/NEON/kernels/NEDepthToSpaceLayerKernel.h
deleted file mode 100644
index 0b645887ee..0000000000
--- a/arm_compute/core/NEON/kernels/NEDepthToSpaceLayerKernel.h
+++ /dev/null
@@ -1,81 +0,0 @@
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEDEPTHTOSPACELAYERKERNEL_H
-#define ARM_COMPUTE_NEDEPTHTOSPACELAYERKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-// Forward declarations
-class ITensor;
-
-/** Interface for the depth to space kernel */
-class NEDepthToSpaceLayerKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEDepthToSpaceLayerKernel";
-    }
-    /** Default constructor */
-    NEDepthToSpaceLayerKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEDepthToSpaceLayerKernel(const NEDepthToSpaceLayerKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEDepthToSpaceLayerKernel &operator=(const NEDepthToSpaceLayerKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NEDepthToSpaceLayerKernel(NEDepthToSpaceLayerKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NEDepthToSpaceLayerKernel &operator=(NEDepthToSpaceLayerKernel &&) = default;
-    /** Default destructor */
-    ~NEDepthToSpaceLayerKernel() = default;
-    /** Initialise the kernel's inputs and output.
-     *
-     * @param[in]  input       Tensor input. Supported tensor rank: 4. Data types supported: All
-     * @param[out] output      Tensor output. Data types supported: same as @p input
-     * @param[in]  block_shape Block shape x value.
-     */
-    void configure(const ITensor *input, ITensor *output, int32_t block_shape);
-    /** Static function to check if given info will lead to a valid configuration of @ref NEDepthToSpaceLayerKernel.
-     *
-     * @param[in] input       Tensor input info. Supported tensor rank: 4. Data types supported: All
-     * @param[in] output      Tensor output info. Data types supported: same as @p input
-     * @param[in] block_shape Block shape value.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-
-private:
-    const ITensor *_input;       /**< Source tensor */
-    ITensor       *_output;      /**< Destination tensor */
-    int32_t        _block_shape; /**< Block shape */
-    DataLayout     _data_layout; /**< Data layout of the operation */
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NEDEPTHTOSPACELAYERKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.h b/arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.h
deleted file mode 100644
index 227ddb4743..0000000000
--- a/arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.h
+++ /dev/null
@@ -1,96 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEDEPTHWISECONVOLUTIONKERNEL3x3_H
-#define ARM_COMPUTE_NEDEPTHWISECONVOLUTIONKERNEL3x3_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-// Forward declarations
-class ITensor;
-
-/** Interface for the kernel to run a 3x3 depthwise convolution on a tensor. */
-class NEDepthwiseConvolutionLayer3x3Kernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEDepthwiseConvolutionLayer3x3Kernel";
-    }
-    /** Default constructor */
-    NEDepthwiseConvolutionLayer3x3Kernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEDepthwiseConvolutionLayer3x3Kernel(const NEDepthwiseConvolutionLayer3x3Kernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEDepthwiseConvolutionLayer3x3Kernel &operator=(const NEDepthwiseConvolutionLayer3x3Kernel &) = delete;
-    /** Default Move Constructor. */
-    NEDepthwiseConvolutionLayer3x3Kernel(NEDepthwiseConvolutionLayer3x3Kernel &&) = default;
-    /** Default move assignment operator */
-    NEDepthwiseConvolutionLayer3x3Kernel &operator=(NEDepthwiseConvolutionLayer3x3Kernel &&) = default;
-    /** Initialize the function's source, destination, conv and border_size.
-     *
-     * @note Supported data layouts: NCHW and NHWC
-     *
-     * @param[in]  input            Source tensor. DataType supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
-     * @param[in]  weights          Weights tensor. This is a 3D tensor with dimensions [3, 3, IFM] for NCHW or [IFM, 3, 3] if NHWC data layout. Data type supported: Same as @p input.
-     * @param[out] output           Destination tensor. Data type supported: Same as @p input.
-     * @param[in]  conv_info        Padding and stride information to use for the convolution.
-     * @param[in]  depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
-     * @param[in]  dilation         (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
-     *
-     */
-    void configure(const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1, const Size2D &dilation = Size2D(1U, 1U));
-    /** Static function to check if given info will lead to a valid configuration of @ref NEDepthwiseConvolutionLayer3x3Kernel
-     *
-     * @note Supported data layouts: NCHW and NHWC
-     *
-     * @param[in] input            Source tensor info. DataType supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
-     * @param[in] weights          Weights tensor info. This is a 3D tensor with dimensions [3, 3, IFM] for NCHW or [IFM, 3, 3] if NHWC data layout. Data type supported: Same as @p input.
-     * @param[in] output           Destination tensor info. Data type supported: Same as @p input.
-     * @param[in] conv_info        Padding and stride information to use for the convolution.
-     * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
-     * @param[in] dilation         (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1,
-                           const Size2D &dilation = Size2D(1U, 1U));
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-    BorderSize border_size() const override;
-
-private:
-    BorderSize     _border_size;
-    const ITensor *_input;
-    ITensor       *_output;
-    const ITensor *_weights;
-    PadStrideInfo  _conv_info;
-    unsigned int   _num_elems_written_per_iteration;
-    unsigned int   _depth_multiplier;
-    Size2D         _dilation;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NEDEPTHWISECONVOLUTIONKERNEL3x3_H */
diff --git a/arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.h b/arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.h
deleted file mode 100644
index 9737c9932e..0000000000
--- a/arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.h
+++ /dev/null
@@ -1,129 +0,0 @@
-/*
- * Copyright (c) 2019-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEDEPTHWISECONVOLUTIONLAYERNATIVEKERNEL_H
-#define ARM_COMPUTE_NEDEPTHWISECONVOLUTIONLAYERNATIVEKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-#include "arm_compute/core/utils/misc/Requires.h"
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-#include <arm_neon.h>
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-namespace arm_compute
-{
-// Forward declarations
-class ITensor;
-
-/** Interface for the kernel to run a depthwise convolution native on a tensor. */
-class NEDepthwiseConvolutionLayerNativeKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEDepthwiseConvolutionLayerNativeKernel";
-    }
-    /** Default constructor */
-    NEDepthwiseConvolutionLayerNativeKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEDepthwiseConvolutionLayerNativeKernel(const NEDepthwiseConvolutionLayerNativeKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEDepthwiseConvolutionLayerNativeKernel &operator=(const NEDepthwiseConvolutionLayerNativeKernel &) = delete;
-    /** Default Move Constructor. */
-    NEDepthwiseConvolutionLayerNativeKernel(NEDepthwiseConvolutionLayerNativeKernel &&) = default;
-    /** Default move assignment operator */
-    NEDepthwiseConvolutionLayerNativeKernel &operator=(NEDepthwiseConvolutionLayerNativeKernel &&) = default;
-    /** Initialize the function's source, destination and parameters.
-     *
-     * @note Supported data layouts: NHWC
-     *
-     * @param[in]  input            Source tensor. DataType supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
-     * @param[in]  weights          Weights tensor. This is a 3D tensor with dimensions [IFM, W, H].
-     *                              Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8/QASYMM8_SIGNED.
-     * @param[in]  biases           Biases tensor. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed.
-     *                              Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
-     * @param[out] output           Destination tensor. Data type supported: Same as @p input.
-     * @param[in]  conv_info        Padding and stride information to use for the convolution.
-     * @param[in]  depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
-     * @param[in]  dilation         (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
-     *
-     */
-    void configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1,
-                   const Size2D &dilation = Size2D(1U, 1U));
-    /** Static function to check if given info will lead to a valid configuration of @ref NEDepthwiseConvolutionLayerNativeKernel
-     *
-     * @note Supported data layouts: NHWC
-     *
-     * @param[in] input            Source tensor info. DataType supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
-     * @param[in] weights          Weights tensor info. This is a 3D tensor with dimensions [IFM, W, H].
-     *                             Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8/QASYMM8_SIGNED.
-     * @param[in] biases           Biases tensor info. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed.
-     *                             Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
-     * @param[in] output           Destination tensor info. Data type supported: Same as @p input.
-     * @param[in] conv_info        Padding and stride information to use for the convolution.
-     * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
-     * @param[in] dilation         (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1,
-                           const Size2D &dilation = Size2D(1U, 1U));
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-    BorderSize border_size() const override;
-
-private:
-    template < typename T, typename TW, int S, typename std::enable_if < std::is_same<T, float>::value
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-                                                                         || std::is_same<T, float16_t>::value
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-                                                                         ,
-                                                                         int >::type = 0 >
-    void run_depthwise(const Window &window, bool has_biases);
-
-    template < typename T, typename TW, int S, REQUIRES_TA(std::is_same<T, uint8_t>::value || std::is_same<T, int8_t>::value) >
-    void run_depthwise(const Window &window, bool has_biases);
-
-    /** Common signature for all the specialised depthwise convolution native functions
-     *
-     * @param[in] window Region on which to execute the kernel.
-     */
-    using DepthwiseFunctionPtr = void (NEDepthwiseConvolutionLayerNativeKernel::*)(const Window &window, bool has_biases);
-
-    DepthwiseFunctionPtr _func;
-    BorderSize           _border_size;
-    const ITensor       *_input;
-    const ITensor       *_weights;
-    const ITensor       *_biases;
-    ITensor             *_output;
-    PadStrideInfo        _conv_info;
-    unsigned int         _depth_multiplier;
-    Size2D               _dilation;
-    std::vector<int>     _output_multiplier;
-    std::vector<int>     _output_shift;
-    bool                 _has_biases;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NEDEPTHWISECONVOLUTIONLAYERNATIVEKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NEDequantizationLayerKernel.h b/arm_compute/core/NEON/kernels/NEDequantizationLayerKernel.h
deleted file mode 100644
index 3792fb3bd7..0000000000
--- a/arm_compute/core/NEON/kernels/NEDequantizationLayerKernel.h
+++ /dev/null
@@ -1,76 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEDEQUANTIZATIONLAYERKERNEL_H
-#define ARM_COMPUTE_NEDEQUANTIZATIONLAYERKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for the dequantization layer kernel. */
-class NEDequantizationLayerKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEDequantizationLayerKernel";
-    }
-    /** Default constructor */
-    NEDequantizationLayerKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEDequantizationLayerKernel(const NEDequantizationLayerKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEDequantizationLayerKernel &operator=(const NEDequantizationLayerKernel &) = delete;
-    /** Default Move Constructor. */
-    NEDequantizationLayerKernel(NEDequantizationLayerKernel &&) = default;
-    /** Default move assignment operator */
-    NEDequantizationLayerKernel &operator=(NEDequantizationLayerKernel &&) = default;
-    /** Default destructor */
-    ~NEDequantizationLayerKernel() = default;
-    /** Set input, output tensors.
-     *
-     * @param[in]  input  Source tensor. Data type supported: QASYMM8/QSYMM8_PER_CHANNEL/QSYMM8/QSYMM16.
-     * @param[out] output Destination tensor with the same dimensions of input. Data type supported: F16/F32.
-     */
-    void configure(const ITensor *input, ITensor *output);
-    /** Static function to check if given info will lead to a valid configuration of @ref NEDequantizationLayerKernel
-     *
-     * @param[in] input  Input tensor info. Data types supported: QASYMM8/QSYMM8_PER_CHANNEL/QSYMM8/QSYMM16.
-     * @param[in] output Output tensor info. Data types supported: F16/F32.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-
-private:
-    const ITensor *_input;
-    ITensor       *_output;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEDEQUANTIZATIONLAYERKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NEDerivativeKernel.h b/arm_compute/core/NEON/kernels/NEDerivativeKernel.h
deleted file mode 100644
index 20aee9b5ce..0000000000
--- a/arm_compute/core/NEON/kernels/NEDerivativeKernel.h
+++ /dev/null
@@ -1,98 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEDERIVATIVEKERNEL_H
-#define ARM_COMPUTE_NEDERIVATIVEKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for the kernel to run the derivative along the X/Y directions on a tensor.
- *
- */
-class NEDerivativeKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEDerivativeKernel";
-    }
-    /** Default constructor */
-    NEDerivativeKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEDerivativeKernel(const NEDerivativeKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEDerivativeKernel &operator=(const NEDerivativeKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NEDerivativeKernel(NEDerivativeKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NEDerivativeKernel &operator=(NEDerivativeKernel &&) = default;
-    /** Initialise the kernel's sources, destination and border
-     *
-     * @note At least one of output_x or output_y must be set
-     *
-     * @param[in]  input            Source tensor. Data type supported: U8.
-     * @param[out] output_x         (Optional) Destination tensor for the X gradient. Data type supported: S16.
-     * @param[out] output_y         (Optional) Destination tensor for the Y gradient. Data type supported: S16.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const ITensor *input, ITensor *output_x, ITensor *output_y, bool border_undefined);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-    BorderSize border_size() const override;
-
-private:
-    /** Function to perform derivative along the X direction on the given window
-     *
-     * @param[in] window Region on which to execute the kernel
-     */
-    void derivative_x(const Window &window);
-    /** Function to perform derivative along the Y direction on the given window
-     *
-     * @param[in] window Region on which to execute the kernel
-     */
-    void derivative_y(const Window &window);
-    /** Function to perform derivative along the X and Y direction on the given window
-     *
-     * @param[in] window Region on which to execute the kernel
-     */
-    void derivative_xy(const Window &window);
-    /** Common signature for all the specialised derivative functions
-     *
-     * @param[in] window Region on which to execute the kernel.
-     */
-    using DerivativeFunction = void (NEDerivativeKernel::*)(const Window &window);
-    /** Derivative function to use for the particular tensor types passed to configure() */
-    DerivativeFunction _func;
-
-private:
-    const ITensor *_input;    /**< Input tensor */
-    ITensor       *_output_x; /**< Output tensor - Derivate along the X direction */
-    ITensor       *_output_y; /**< Output tensor - Derivate along the Y direction */
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NEDERIVATIVEKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NEDilateKernel.h b/arm_compute/core/NEON/kernels/NEDilateKernel.h
deleted file mode 100644
index 00a954d958..0000000000
--- a/arm_compute/core/NEON/kernels/NEDilateKernel.h
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEDILATEKERNEL_H
-#define ARM_COMPUTE_NEDILATEKERNEL_H
-
-#include "arm_compute/core/NEON/INESimpleKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for the kernel to perform boolean image dilatation */
-class NEDilateKernel : public INESimpleKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEDilateKernel";
-    }
-    /** Set the source, destination and border mode of the kernel
-     *
-     * @param[in]  input            Source tensor. Data type supported: U8
-     * @param[out] output           Destination tensor. Data type supported: U8
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const ITensor *input, ITensor *output, bool border_undefined);
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-    BorderSize border_size() const override;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEDILATEKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h b/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h
deleted file mode 100644
index 4ae283d69d..0000000000
--- a/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h
+++ /dev/null
@@ -1,100 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEDIRECTCONVOLUTIONLAYERKERNEL_H
-#define ARM_COMPUTE_NEDIRECTCONVOLUTIONLAYERKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** NEON interface for Direct Convolution Layer kernel */
-class NEDirectConvolutionLayerKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEDirectConvolutionLayerKernel";
-    }
-    /** Default constructor */
-    NEDirectConvolutionLayerKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEDirectConvolutionLayerKernel(const NEDirectConvolutionLayerKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEDirectConvolutionLayerKernel &operator=(const NEDirectConvolutionLayerKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NEDirectConvolutionLayerKernel(NEDirectConvolutionLayerKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NEDirectConvolutionLayerKernel &operator=(NEDirectConvolutionLayerKernel &&) = default;
-    /** Default destructor */
-    ~NEDirectConvolutionLayerKernel() = default;
-    /** Set the input, weights, and output tensors.
-     *
-     * @note: DirectConvolution only works in the following configurations:
-     *        1x1 convolution with stride_x = 1/2/3, stride_y = 1/2/3
-     *        3x3 convolution with stride_x = 1/2/3, stride_y = 1/2/3
-     *
-     * @param[in]  input     The input tensor to convolve. 3 lower dimensions represent a single input [width, height, IFM],
-     *                       while every optional dimension from 4 and above represent a batch of inputs. Data types supported: F16/F32.
-     * @param[in]  weights   Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
-     *                       The 3rd dimension must be the same as the input's volume 3rd dimension.
-     *                       Data type supported:Same as @p input.
-     * @param[out] output    Output tensor.
-     *                       The 3rd dimensions must be equal to the 4th dimension of the @p kernels tensor. Data types supported: F16/F32
-     * @param[in]  conv_info Contains padding and stride information described in @ref PadStrideInfo.
-     */
-    void configure(const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info);
-    /** Static function to check if given info will lead to a valid configuration of @ref NEDirectConvolutionLayerKernel
-     *
-     * @param[in] input     The input tensor to convolve. 3 lower dimensions represent a single input [width, height, IFM],
-     *                      while every optional dimension from 4 and above represent a batch of inputs. Data types supported: F16/F32.
-     * @param[in] weights   Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
-     *                      The 3rd dimension must be the same as the input's volume 3rd dimension.
-     *                      Data type supported:Same as @p input.
-     * @param[in] output    Output tensor.
-     *                      The 3rd dimensions must be equal to the 4th dimension of the @p kernels tensor. Data types supported: F16/F32
-     * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, const PadStrideInfo &conv_info);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-    BorderSize border_size() const override;
-
-private:
-    const ITensor *_input;
-    const ITensor *_weights;
-    ITensor       *_output;
-    PadStrideInfo  _conv_info;
-    BorderSize     _border_size;
-    unsigned int   _kernel_size;
-    unsigned int   _num_weight_elems_read_per_row;
-    unsigned int   _num_elems_read_per_iteration;
-    unsigned int   _num_elems_written_per_iteration;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEDIRECTCONVOLUTIONLAYERKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.h b/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.h
deleted file mode 100644
index b7632d70c4..0000000000
--- a/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.h
+++ /dev/null
@@ -1,102 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEDIRECTCONVOLUTIONLAYEROUTPUTSTAGEKERNEL_H
-#define ARM_COMPUTE_NEDIRECTCONVOLUTIONLAYEROUTPUTSTAGEKERNEL_H
-
-#include "arm_compute/core/KernelDescriptors.h"
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-/** NEON kernel to accumulate the biases, if provided, or downscale in case of quantized input.
- *
- * @note We assume bias to be shared
- * @note For quantized computations (i.e. @p input of S32 type) the output data type for auto-initialization must be passed as part
- *       of the @ref DirectConvolutionLayerOutputStageKernelInfo.
- */
-class NEDirectConvolutionLayerOutputStageKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEDirectConvolutionLayerOutputStageKernel";
-    }
-    /** Default constructor */
-    NEDirectConvolutionLayerOutputStageKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEDirectConvolutionLayerOutputStageKernel(const NEDirectConvolutionLayerOutputStageKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEDirectConvolutionLayerOutputStageKernel &operator=(const NEDirectConvolutionLayerOutputStageKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NEDirectConvolutionLayerOutputStageKernel(NEDirectConvolutionLayerOutputStageKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NEDirectConvolutionLayerOutputStageKernel &operator=(NEDirectConvolutionLayerOutputStageKernel &&) = default;
-    /** Default destructor */
-    ~NEDirectConvolutionLayerOutputStageKernel() = default;
-    /** Set the accumulate buffer and the biases of the kernel.
-     *
-     * @param[in, out] input  Input to add the bias to. If @p output is not specified then accumulation is done in-place.
-     *                        Data type supported: F16/F32/S32
-     * @param[in]      bias   (Optional) The shared bias tensor to add. It must be 1D Tensor. Data type supported: Same as @p input
-     * @param[out]     output (Optional) If the output tensor is specified the accumulation is done out-of-place. (Defaults to nullptr)
-     *                        Note that in-place computation is only supported for F16/F32. For S32 this must not be nullptr.
-     *                        Data type supported: F16/F32 or QASYMM8/QASYMM8_SIGNED if @p input is S32
-     * @param[in]      info   (Optional) DirectConvolutionLayerOutputStageKernel descriptor metadata
-     */
-    void configure(ITensor *input, const ITensor *bias = nullptr, ITensor *output = nullptr,
-                   const DirectConvolutionLayerOutputStageKernelInfo &info = DirectConvolutionLayerOutputStageKernelInfo());
-    /** Static function to check if given info will lead to a valid configuration of @ref NEDirectConvolutionLayerOutputStageKernel
-     *
-     * @param[in] input  Input to add the bias to. If @p output is not specified then accumulation is done in-place.
-     *                   Data type supported: F16/F32/S32
-     * @param[in] bias   (Optional) The shared bias tensor to add. It must be 1D Tensor. Data type supported: Same as @p input
-     * @param[in] output (Optional) If the output tensor is specified the accumulation is done out-of-place. (Defaults to nullptr)
-     *                   Note that in-place computation is only supported for F16/F32. For S32 this must not be nullptr.
-     *                   Data type supported: F16/F32 or QASYMM8/QASYMM8_SIGNED if @p input is S32
-     * @param[in] info   (Optional) DirectConvolutionLayerOutputStageKernel descriptor metadata
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *bias = nullptr, const ITensorInfo *output = nullptr,
-                           const DirectConvolutionLayerOutputStageKernelInfo &info = DirectConvolutionLayerOutputStageKernelInfo());
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-
-private:
-    using OutputStageKernel = void(ITensor *input, const ITensor *bias, const Window &window, ITensor *output,
-                                   int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift);
-
-private:
-    OutputStageKernel *_func;
-    ITensor           *_input;
-    const ITensor     *_bias;
-    ITensor           *_output;
-    int                _result_fixedpoint_multiplier;
-    int                _result_shift;
-    int                _result_offset_after_shift;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEDIRECTCONVOLUTIONLAYEROUTPUTSTAGEKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NEElementwiseOperationKernel.h b/arm_compute/core/NEON/kernels/NEElementwiseOperationKernel.h
deleted file mode 100644
index 61c25e1a2a..0000000000
--- a/arm_compute/core/NEON/kernels/NEElementwiseOperationKernel.h
+++ /dev/null
@@ -1,214 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEELEMENTWISEOPERATIONKERNEL_H
-#define ARM_COMPUTE_NEELEMENTWISEOPERATIONKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for an element-wise operation kernel
- *
- * Element-wise operation is computed by:
- * @f[ output(x,y) = OP(input1(x,y), input2(x,y))@f]
- *
- */
-class NEElementwiseOperationKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEElementwiseOperationKernel";
-    }
-    /** Default constructor */
-    NEElementwiseOperationKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEElementwiseOperationKernel(const NEElementwiseOperationKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEElementwiseOperationKernel &operator=(const NEElementwiseOperationKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NEElementwiseOperationKernel(NEElementwiseOperationKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NEElementwiseOperationKernel &operator=(NEElementwiseOperationKernel &&) = default;
-    /** Default destructor */
-    ~NEElementwiseOperationKernel() = default;
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-
-    /** Common signature for all the specialised arithmetic functions
-     *
-     * @param[in] input1 First tensor input. Data types supported: QASYMM8/S16/F16/S32/F32.
-     * @param[in] input2 Second tensor input. Data types supported: Same as @p input1.
-     * @param[in] output Output tensor. Data types supported: Dependent on subclass.
-     * @param[in] window Region on which to execute the kernel.
-     */
-    using ElementwiseFunction = void(const ITensor *input1, const ITensor *input2, ITensor *output, const Window &window);
-
-protected:
-    /** Validate the argument passed to the kernel
-     *
-     * @param[in] input1 First tensor input. Data types supported: QASYMM8/S16/F16/S32/F32.
-     * @param[in] input2 Second tensor input. Data types supported: Same as @p input1.
-     * @param[in] output Output tensor. Data types supported: Dependent on subclass.
-     */
-    static Status validate_arguments_common(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output);
-
-    /** Commmon configure function for element-wise operators with no additional options (e.g. Min, Max, SquaredDiff)
-     *
-     */
-    void configure_common(const ITensor *input1, const ITensor *input2, ITensor *output);
-
-    /** Function to use for the particular tensor types passed to configure() */
-    std::function<void(const ITensor *input1, const ITensor *input2, ITensor *output, const Window &window)> _function;
-
-    const ITensor *_input1;
-    const ITensor *_input2;
-    ITensor       *_output;
-};
-
-class NEArithmeticOperationKernel : public NEElementwiseOperationKernel
-{
-public:
-    /** Default constructor */
-    NEArithmeticOperationKernel() = default;
-
-    /** Static function to check if given info will lead to a valid configuration of @ref NEArithmeticOperationKernel
-     *
-     * @param[in] op     Arithmetic operation to be executed.
-     * @param[in] input1 First tensor input. Data types supported: QASYMM8/S16/F16/S32/F32.
-     * @param[in] input2 Second tensor input. Data types supported: Same as @p input1.
-     * @param[in] output Output tensor. Data types supported: Same as @p input1.
-     */
-    void configure(ArithmeticOperation op, const ITensor *input1, const ITensor *input2, ITensor *output);
-
-    /** Static function to check if given info will lead to a valid configuration of @ref NEArithmeticOperationKernel
-     *
-     * @param[in] op     Arithmetic operation to be executed.
-     * @param[in] input1 First tensor input info. Data types supported: QASYMM8/S16/F16/S32/F32.
-     * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1.
-     * @param[in] output Output tensor info. Data types supported: Same as @p input1.
-     *
-     * @return a Status
-     */
-    static Status validate(ArithmeticOperation op, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output);
-
-protected:
-    // Inherited methods overridden:
-    static Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output);
-};
-
-class NEDivisionOperationKernel : public NEArithmeticOperationKernel
-{
-public:
-    /** Default constructor */
-    NEDivisionOperationKernel() = default;
-
-    /** Static function to check if given info will lead to a valid configuration of @ref NEArithmeticOperationKernel
-     *
-     * @param[in] input1 First tensor input. Data types supported: F16/F32.
-     * @param[in] input2 Second tensor input. Data types supported: Same as @p input1.
-     * @param[in] output Output tensor. Data types supported: Same as @p input1.
-     */
-    void configure(const ITensor *input1, const ITensor *input2, ITensor *output);
-
-    /** Static function to check if given info will lead to a valid configuration of @ref NEArithmeticOperationKernel
-     *
-     * @param[in] input1 First tensor input info. Data types supported: F16/F32.
-     * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1.
-     * @param[in] output Output tensor info. Data types supported: Same as @p input1.
-     *
-     * @return a Status
-     */
-    static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output);
-
-protected:
-    // Inherited methods overridden:
-    static Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output);
-};
-
-class NEPowerOperationKernel : public NEArithmeticOperationKernel
-{
-public:
-    /** Default constructor */
-    NEPowerOperationKernel() = default;
-
-    /** Static function to check if given info will lead to a valid configuration of @ref NEArithmeticOperationKernel
-     *
-     * @param[in]  input1 First tensor input. Data types supported: F16/F32.
-     * @param[in]  input2 Second tensor input. Data types supported: Same as @p input1.
-     * @param[out] output Output tensor. Data types supported: Same as @p input1.
-     */
-    void configure(const ITensor *input1, const ITensor *input2, ITensor *output);
-
-    /** Static function to check if given info will lead to a valid configuration of @ref NEArithmeticOperationKernel
-     *
-     * @param[in]  input1 First tensor input info. Data types supported: F16/F32.
-     * @param[in]  input2 Second tensor input info. Data types supported: Same as @p input1.
-     * @param[out] output Output tensor info. Data types supported: Same as @p input1.
-     *
-     * @return a Status
-     */
-    static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output);
-
-protected:
-    // Inherited methods overridden:
-    static Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output);
-};
-
-class NEComparisonOperationKernel : public NEElementwiseOperationKernel
-{
-public:
-    /** Default constructor */
-    NEComparisonOperationKernel() = default;
-
-    /** Static function to check if given info will lead to a valid configuration of @ref NEComparisonOperationKernel
-     *
-     * @param[in] op     Comparison operation to be executed.
-     * @param[in] input1 First tensor input. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
-     * @param[in] input2 Second tensor input. Data types supported: Same as @p input1.
-     * @param[in] output Output tensor. Data types supported: U16/U32.
-     */
-    void configure(ComparisonOperation op, const ITensor *input1, const ITensor *input2, ITensor *output);
-
-    /** Static function to check if given info will lead to a valid configuration of @ref NEComparisonOperationKernel
-     *
-     * @param[in] op     Comparison operation to be executed.
-     * @param[in] input1 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
-     * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1.
-     * @param[in] output Output tensor info. Data types supported: U16/U32.
-     *
-     * @return a Status
-     */
-    static Status validate(ComparisonOperation op, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output);
-
-protected:
-    // Inherited methods overridden:
-    static Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output);
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NEELEMENTWISEOPERATIONKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NEElementwiseUnaryKernel.h b/arm_compute/core/NEON/kernels/NEElementwiseUnaryKernel.h
deleted file mode 100644
index 9a41cecf19..0000000000
--- a/arm_compute/core/NEON/kernels/NEElementwiseUnaryKernel.h
+++ /dev/null
@@ -1,103 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEELEMENTWISEUNARYKERNEL_H
-#define ARM_COMPUTE_NEELEMENTWISEUNARYKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for an element-wise unary operation kernel
- *
- * Element-wise operation is computed by:
- * @f[ output(x) = OP(input(x))@f]
- *
- */
-class NEElementwiseUnaryKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEElementwiseUnaryKernel";
-    }
-    /** Default constructor */
-    NEElementwiseUnaryKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEElementwiseUnaryKernel(const NEElementwiseUnaryKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEElementwiseUnaryKernel &operator=(const NEElementwiseUnaryKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NEElementwiseUnaryKernel(NEElementwiseUnaryKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NEElementwiseUnaryKernel &operator=(NEElementwiseUnaryKernel &&) = default;
-    /** Default destructor */
-    ~NEElementwiseUnaryKernel() = default;
-
-    /** Static function to check if given info will lead to a valid configuration of @ref NEElementwiseUnaryKernel
-     *
-     * @param[in] op     Arithmetic operation to be executed.
-     * @param[in] input  First tensor input. Data types supported: F16/F32.
-     * @param[in] output Output tensor. Data types supported: Same as @p input.
-     */
-    void configure(ElementWiseUnary op, const ITensor *input, ITensor *output);
-
-    /** Static function to check if given info will lead to a valid configuration of @ref NEElementwiseUnaryKernel
-     *
-     * @param[in] op     Arithmetic operation to be executed.
-     * @param[in] input  First tensor input info. Data types supported: F16/F32.
-     * @param[in] output Output tensor info. Data types supported: Same as @p input.
-     *
-     * @return a Status
-     */
-    static Status validate(ElementWiseUnary op, const ITensorInfo *input, const ITensorInfo *output);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-
-private:
-    /** Common signature for all the specialised arithmetic functions
-     *
-     * @param[in] window Region on which to execute the kernel.
-     */
-    using ElementwiseUnaryPtr = void (NEElementwiseUnaryKernel::*)(const Window &window);
-
-    /** Template function to run elementwise unary operation
-     *
-     * @tparam ScalarType Scalar datatype
-     *
-     * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
-     */
-    template <typename ScalarType>
-    void elementwise_op(const Window &window);
-
-    ElementwiseUnaryPtr _func;
-    const ITensor      *_input;
-    ITensor            *_output;
-    ElementWiseUnary    _op;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NEELEMENTWISEUNARYKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NEErodeKernel.h b/arm_compute/core/NEON/kernels/NEErodeKernel.h
deleted file mode 100644
index e3fcc2847e..0000000000
--- a/arm_compute/core/NEON/kernels/NEErodeKernel.h
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEERODEKERNEL_H
-#define ARM_COMPUTE_NEERODEKERNEL_H
-
-#include "arm_compute/core/NEON/INESimpleKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for the kernel to perform boolean image erosion */
-class NEErodeKernel : public INESimpleKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEErodeKernel";
-    }
-    /** Set the source, destination and border mode of the kernel
-     *
-     * @param[in]  input            Source tensor. Data type supported: U8
-     * @param[out] output           Destination tensor. Data type supported: U8
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const ITensor *input, ITensor *output, bool border_undefined);
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-    BorderSize border_size() const override;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEERODEKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NEFFTDigitReverseKernel.h b/arm_compute/core/NEON/kernels/NEFFTDigitReverseKernel.h
deleted file mode 100644
index ed17e3b8d5..0000000000
--- a/arm_compute/core/NEON/kernels/NEFFTDigitReverseKernel.h
+++ /dev/null
@@ -1,93 +0,0 @@
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEFFTDIGITREVERSEKERNEL_H
-#define ARM_COMPUTE_NEFFTDIGITREVERSEKERNEL_H
-
-#include "arm_compute/core/KernelDescriptors.h"
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-// Forward declarations
-class ITensor;
-
-/** Interface for the digit reverse operation kernel. */
-class NEFFTDigitReverseKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEFFTDigitReverseKernel";
-    }
-    /** Constructor */
-    NEFFTDigitReverseKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEFFTDigitReverseKernel(const NEFFTDigitReverseKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEFFTDigitReverseKernel &operator=(const NEFFTDigitReverseKernel &) = delete;
-    /** Default Move Constructor. */
-    NEFFTDigitReverseKernel(NEFFTDigitReverseKernel &&) = default;
-    /** Default move assignment operator */
-    NEFFTDigitReverseKernel &operator=(NEFFTDigitReverseKernel &&) = default;
-    /** Default destructor */
-    ~NEFFTDigitReverseKernel() = default;
-    /** Set the input and output tensors.
-     *
-     * @param[in]  input  Source tensor. Data types supported: F32. Number of channels supported: 1 (real tensor) or 2 (complex tensor).
-     * @param[out] output Destination tensor. Data type supported: same as @p input. Number of channels supported: 2 (complex tensor).
-     * @param[in]  idx    Digit reverse index tensor. Data type supported: U32
-     * @param[in]  config Kernel configuration.
-     */
-    void configure(const ITensor *input, ITensor *output, const ITensor *idx, const FFTDigitReverseKernelInfo &config);
-
-    /** Static function to check if given info will lead to a valid configuration of @ref NEFFTDigitReverseKernel
-     *
-     * @param[in] input  Source tensor info. Data types supported: F32. Number of channels supported: 1 (real tensor) or 2 (complex tensor).
-     * @param[in] output Destination tensor info. Data type supported: same as @p input. Number of channels supported: 2 (complex tensor).
-     * @param[in] idx    Digit reverse index tensor info. Data type supported: U32
-     * @param[in] config Kernel configuration
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *idx, const FFTDigitReverseKernelInfo &config);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-
-private:
-    using NEFFTDigitReverseKernelFunctionPtr = void (NEFFTDigitReverseKernel::*)(const Window &window);
-
-    template <bool is_input_complex, bool is_conj>
-    void digit_reverse_kernel_axis_0(const Window &window);
-
-    template <bool is_input_complex, bool is_conj>
-    void digit_reverse_kernel_axis_1(const Window &window);
-
-    NEFFTDigitReverseKernelFunctionPtr _func;
-    const ITensor                     *_input;
-    ITensor                           *_output;
-    const ITensor                     *_idx;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEFFTDIGITREVERSEKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NEFFTRadixStageKernel.h b/arm_compute/core/NEON/kernels/NEFFTRadixStageKernel.h
deleted file mode 100644
index 6e16fca0fb..0000000000
--- a/arm_compute/core/NEON/kernels/NEFFTRadixStageKernel.h
+++ /dev/null
@@ -1,103 +0,0 @@
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEFFTRADIXSTAGEKERNEL_H
-#define ARM_COMPUTE_NEFFTRADIXSTAGEKERNEL_H
-
-#include "arm_compute/core/KernelDescriptors.h"
-#include "arm_compute/core/NEON/INEKernel.h"
-
-#include <arm_neon.h>
-#include <set>
-
-namespace arm_compute
-{
-// Forward declarations
-class ITensor;
-
-/** Interface for the FFT kernel. */
-class NEFFTRadixStageKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEFFTRadixStageKernel";
-    }
-    /** Constructor */
-    NEFFTRadixStageKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEFFTRadixStageKernel(const NEFFTRadixStageKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEFFTRadixStageKernel &operator=(const NEFFTRadixStageKernel &) = delete;
-    /** Default Move Constructor. */
-    NEFFTRadixStageKernel(NEFFTRadixStageKernel &&) = default;
-    /** Default move assignment operator */
-    NEFFTRadixStageKernel &operator=(NEFFTRadixStageKernel &&) = default;
-    /** Default destructor */
-    ~NEFFTRadixStageKernel() = default;
-    /** Set the input and output tensors.
-     *
-     * @note If the output tensor is nullptr, the FFT will be performed in-place
-     *
-     * @param[in,out] input  Source tensor. Data types supported: F32. Number of channels supported: 2 (complex tensor).
-     * @param[out]    output Destination tensor. Data type supported: same as @p input. Number of channels supported: same as @p input.
-     * @param[in]     config FFT descriptor metadata.
-     */
-    void configure(ITensor *input, ITensor *output, const FFTRadixStageKernelInfo &config);
-    /** Static function to check if given info will lead to a valid configuration of @ref NEFFTRadixStageKernel
-     *
-     * @param[in] input  Source tensor info. Data types supported: F32. Number of channels supported: 2 (complex tensor).
-     * @param[in] output Destination tensor info. Data type supported: same as @p input. Number of channels supported: same as @p input.
-     * @param[in] config FFT descriptor metadata.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const FFTRadixStageKernelInfo &config);
-    /** Returns the radix that are support by the FFT kernel
-     *
-     * @return A set of supported radix
-     */
-    static std::set<unsigned int> supported_radix();
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-
-private:
-    ITensor     *_input;
-    ITensor     *_output;
-    bool         _run_in_place;
-    unsigned int _Nx;
-    unsigned int _axis;
-    unsigned int _radix;
-
-    void set_radix_stage_axis0(const FFTRadixStageKernelInfo &config);
-    void set_radix_stage_axis1(const FFTRadixStageKernelInfo &config);
-
-    using FFTFunctionPointerAxis0 = std::function<void(float *, float *, unsigned int, unsigned int, const float32x2_t &, unsigned int)>;
-    using FFTFunctionPointerAxis1 = std::function<void(float *, float *, unsigned int, unsigned int, const float32x2_t &, unsigned int, unsigned int)>;
-
-    FFTFunctionPointerAxis0 _func_0;
-    FFTFunctionPointerAxis1 _func_1;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEFFTRADIXSTAGEKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NEFFTScaleKernel.h b/arm_compute/core/NEON/kernels/NEFFTScaleKernel.h
deleted file mode 100644
index 72963fa56d..0000000000
--- a/arm_compute/core/NEON/kernels/NEFFTScaleKernel.h
+++ /dev/null
@@ -1,84 +0,0 @@
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEFFTSCALEKERNEL_H
-#define ARM_COMPUTE_NEFFTSCALEKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-#include "arm_compute/core/KernelDescriptors.h"
-
-namespace arm_compute
-{
-// Forward declarations
-class ITensor;
-
-/** Interface for the inverse fft scale kernel. */
-class NEFFTScaleKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEFFTScaleKernel";
-    }
-    /** Constructor */
-    NEFFTScaleKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEFFTScaleKernel(const NEFFTScaleKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEFFTScaleKernel &operator=(const NEFFTScaleKernel &) = delete;
-    /** Default Move Constructor. */
-    NEFFTScaleKernel(NEFFTScaleKernel &&) = default;
-    /** Default move assignment operator */
-    NEFFTScaleKernel &operator=(NEFFTScaleKernel &&) = default;
-    /** Default destructor */
-    ~NEFFTScaleKernel() = default;
-    /** Set the input and output tensors.
-     *
-     * @param[in,out] input  Source tensor. Data types supported: F32. Number of channels supported: 2 (complex tensor).
-     * @param[out]    output Destination tensor. Data type supported: same as @p input. Number of channels supported: 1 (real tensor) or 2 (complex tensor).
-     * @param[in]     config Kernel configuration
-     */
-    void configure(ITensor *input, ITensor *output, const FFTScaleKernelInfo &config);
-    /** Static function to check if given info will lead to a valid configuration of @ref NEFFTScaleKernel
-     *
-     * @param[in] input  Source tensor info. Data types supported: F32. Number of channels supported: 2 (complex tensor).
-     * @param[in] output Destination tensor info. Data type supported: same as @p input. Number of channels supported: 1 (real tensor) or 2 (complex tensor).
-     * @param[in] config Kernel configuration
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const FFTScaleKernelInfo &config);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-
-private:
-    ITensor *_input;
-    ITensor *_output;
-    float    _scale;
-    bool     _run_in_place;
-    bool     _is_conj;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEFFTSCALEKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NEFastCornersKernel.h b/arm_compute/core/NEON/kernels/NEFastCornersKernel.h
deleted file mode 100644
index c0196c711a..0000000000
--- a/arm_compute/core/NEON/kernels/NEFastCornersKernel.h
+++ /dev/null
@@ -1,76 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEFASTCORNERSKERNEL_H
-#define ARM_COMPUTE_NEFASTCORNERSKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-#include "arm_compute/core/Types.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-class ITensor;
-using IImage = ITensor;
-
-/** NEON kernel to perform fast corners */
-class NEFastCornersKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEFastCornersKernel";
-    }
-    /** Constructor */
-    NEFastCornersKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEFastCornersKernel(const NEFastCornersKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEFastCornersKernel &operator=(const NEFastCornersKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NEFastCornersKernel(NEFastCornersKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NEFastCornersKernel &operator=(NEFastCornersKernel &&) = default;
-    /** Initialise the kernel.
-     *
-     * @param[in]  input               Source image. Data type supported: U8.
-     * @param[out] output              Output image. Data type supported: U8.
-     * @param[in]  threshold           Threshold on difference between intensity of the central pixel and pixels on Bresenham's circle of radius 3.
-     * @param[in]  non_max_suppression True if non-maxima suppresion is applied, false otherwise.
-     * @param[in]  border_undefined    True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const IImage *input, IImage *output, uint8_t threshold, bool non_max_suppression, bool border_undefined);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-    BorderSize border_size() const override;
-
-private:
-    const IImage *_input;               /**< source image */
-    IImage       *_output;              /**< inermediate results */
-    uint8_t       _threshold;           /**< threshold on difference between intensity */
-    bool          _non_max_suppression; /** true if non-maxima suppression is applied in the next stage */
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NEFASTCORNERSKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NEFillArrayKernel.h b/arm_compute/core/NEON/kernels/NEFillArrayKernel.h
deleted file mode 100644
index e45caec34b..0000000000
--- a/arm_compute/core/NEON/kernels/NEFillArrayKernel.h
+++ /dev/null
@@ -1,77 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEFILLARRAYKERNEL_H
-#define ARM_COMPUTE_NEFILLARRAYKERNEL_H
-
-#include "arm_compute/core/IArray.h"
-#include "arm_compute/core/NEON/INEKernel.h"
-#include "arm_compute/core/Types.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-class ITensor;
-using IImage = ITensor;
-
-/** This kernel adds all texels greater than or equal to the threshold value to the keypoint array. */
-class NEFillArrayKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEFillArrayKernel";
-    }
-    /** Default contructor */
-    NEFillArrayKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEFillArrayKernel(const NEFillArrayKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEFillArrayKernel &operator=(const NEFillArrayKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NEFillArrayKernel(NEFillArrayKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NEFillArrayKernel &operator=(NEFillArrayKernel &&) = default;
-    /** Default detructor */
-    ~NEFillArrayKernel() = default;
-
-    /** Initialise the kernel.
-     *
-     * @param[in]  input     Source image. Data type supported: U8.
-     * @param[in]  threshold Texels greater than the threshold will be added to the array.
-     * @param[out] output    Arrays of keypoints to store the results.
-     */
-    void configure(const IImage *input, uint8_t threshold, IKeyPointArray *output);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-    bool is_parallelisable() const override;
-
-private:
-    const IImage   *_input;
-    IKeyPointArray *_output;
-    uint8_t         _threshold;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NEFILLARRAYKERNEL_H*/
diff --git a/arm_compute/core/NEON/kernels/NEFillBorderKernel.h b/arm_compute/core/NEON/kernels/NEFillBorderKernel.h
deleted file mode 100644
index 0c852e8232..0000000000
--- a/arm_compute/core/NEON/kernels/NEFillBorderKernel.h
+++ /dev/null
@@ -1,82 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEFILLBORDERKERNEL_H
-#define ARM_COMPUTE_NEFILLBORDERKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-#include "arm_compute/core/PixelValue.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-// Forward declarations
-class ITensor;
-
-/** Interface for the kernel to fill borders */
-class NEFillBorderKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEFillBorderKernel";
-    }
-    /** Default Constructor */
-    NEFillBorderKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEFillBorderKernel(const NEFillBorderKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEFillBorderKernel &operator=(const NEFillBorderKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NEFillBorderKernel(NEFillBorderKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NEFillBorderKernel &operator=(NEFillBorderKernel &&) = default;
-    /** Default destructor */
-    ~NEFillBorderKernel() = default;
-
-    /** Initialise the function.
-     *
-     * @note This kernel fills the borders within the XY-planes.
-     *
-     * @param[in,out] tensor                Tensor to process. Data types supported: All.
-     * @param[in]     border_size           Size of the border to fill in elements.
-     * @param[in]     border_mode           Border mode to use for the convolution.
-     * @param[in]     constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
-     *
-     */
-    void configure(ITensor *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value = PixelValue());
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-
-private:
-    void fill_replicate_single_channel(const Window &window);
-    void fill_constant_value_single_channel(const Window &window);
-
-    ITensor   *_tensor;
-    BorderSize _border_size;
-    BorderMode _mode;
-    PixelValue _constant_border_value;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEFILLBORDERKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NEFillInnerBorderKernel.h b/arm_compute/core/NEON/kernels/NEFillInnerBorderKernel.h
deleted file mode 100644
index 9c1059e606..0000000000
--- a/arm_compute/core/NEON/kernels/NEFillInnerBorderKernel.h
+++ /dev/null
@@ -1,79 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEFILLINNERBORDERKERNEL_H
-#define ARM_COMPUTE_NEFILLINNERBORDERKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-#include "arm_compute/core/PixelValue.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for the kernel to fill the interior borders */
-class NEFillInnerBorderKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEFillInnerBorderKernel";
-    }
-    /** Default constructor */
-    NEFillInnerBorderKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEFillInnerBorderKernel(const NEFillInnerBorderKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEFillInnerBorderKernel &operator=(const NEFillInnerBorderKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NEFillInnerBorderKernel(NEFillInnerBorderKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NEFillInnerBorderKernel &operator=(NEFillInnerBorderKernel &&) = default;
-    /** Default destructor */
-    ~NEFillInnerBorderKernel() = default;
-
-    /** Initialise the function.
-     *
-     * @note This kernel fills the borders within the XY-planes.
-     *
-     * @param[in,out] input                 Tensor to process. Data types supported: U8/S16/S32/F32.
-     * @param[in]     border_size           Size of the border to fill in elements.
-     * @param[in]     constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
-     *
-     */
-    void configure(ITensor *input, BorderSize border_size, const PixelValue &constant_border_value = PixelValue());
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-
-private:
-    template <typename T>
-    void fill_value_single_channel(const Window &window);
-
-    ITensor   *_tensor;
-    BorderSize _border_size;
-    PixelValue _constant_border_value;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEFILLINNERBORDERKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NEFlattenLayerKernel.h b/arm_compute/core/NEON/kernels/NEFlattenLayerKernel.h
deleted file mode 100644
index ba2f99857f..0000000000
--- a/arm_compute/core/NEON/kernels/NEFlattenLayerKernel.h
+++ /dev/null
@@ -1,81 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEFLATTENLAYERKERNEL_H
-#define ARM_COMPUTE_NEFLATTENLAYERKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for the flatten layer kernel. */
-class NEFlattenLayerKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEFlattenLayerKernel";
-    }
-    /** Default constructor */
-    NEFlattenLayerKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEFlattenLayerKernel(const NEFlattenLayerKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEFlattenLayerKernel &operator=(const NEFlattenLayerKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NEFlattenLayerKernel(NEFlattenLayerKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NEFlattenLayerKernel &operator=(NEFlattenLayerKernel &&) = default;
-    /** Default destructor */
-    ~NEFlattenLayerKernel() = default;
-
-    /** Set the input and output of the kernel.
-     *
-     * @param[in]  input  First input tensor to flatten with at least 3 dimensions.
-     *                    The dimensions above the third will be interpreted as batches. Data types supported: All
-     * @param[out] output Output tensor with shape [w*h*d, input_batches] where:
-     *                    w = width input tensor, h = height input tensor and d = depth input tensor. Data type supported: same as @p input
-     */
-    void configure(const ITensor *input, ITensor *output);
-    /** Static function to check if given info will lead to a valid configuration of @ref NEFlattenLayerKernel
-     *
-     * @param[in]  input  First input tensor to flatten with at least 3 dimensions.
-     *                    The dimensions above the third will be interpreted as batches. Data types supported: All
-     * @param[out] output Output tensor with shape [w*h*d, input_batches] where:
-     *                    w = width input tensor, h = height input tensor and d = depth input tensor. Data type supported: same as @p input
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-
-private:
-    const ITensor *_input;
-    ITensor       *_output;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEFLATTENLAYERKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NEFloorKernel.h b/arm_compute/core/NEON/kernels/NEFloorKernel.h
deleted file mode 100644
index 4cdd9f2ac0..0000000000
--- a/arm_compute/core/NEON/kernels/NEFloorKernel.h
+++ /dev/null
@@ -1,60 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEFLOORKERNEL_H
-#define ARM_COMPUTE_NEFLOORKERNEL_H
-
-#include "arm_compute/core/NEON/INESimpleKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** NEON kernel to perform a floor operation */
-class NEFloorKernel : public INESimpleKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEFloorKernel";
-    }
-    /** Set the source, destination of the kernel
-     *
-     * @param[in]  input  Source tensor. Data type supported: F16/F32.
-     * @param[out] output Destination tensor. Same as @p input
-     */
-    void configure(const ITensor *input, ITensor *output);
-    /** Static function to check if given info will lead to a valid configuration of @ref NEFloorKernel
-     *
-     * @param[in] input  Source tensor info. Data type supported: F16/F32.
-     * @param[in] output Destination tensor info. Same as @p input
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEFLOORKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NEFuseBatchNormalizationKernel.h b/arm_compute/core/NEON/kernels/NEFuseBatchNormalizationKernel.h
deleted file mode 100644
index f598530d1e..0000000000
--- a/arm_compute/core/NEON/kernels/NEFuseBatchNormalizationKernel.h
+++ /dev/null
@@ -1,116 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEFUSEBATCHNORMALIZATIONKERNEL_H
-#define ARM_COMPUTE_NEFUSEBATCHNORMALIZATIONKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-// Forward declarations
-class ITensor;
-
-/** OpenNE kernel to fuse the batch normalization node to a preceding convolution node */
-class NEFuseBatchNormalizationKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEFuseBatchNormalizationKernel";
-    }
-    /** Default constructor */
-    NEFuseBatchNormalizationKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEFuseBatchNormalizationKernel(const NEFuseBatchNormalizationKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEFuseBatchNormalizationKernel &operator=(const NEFuseBatchNormalizationKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NEFuseBatchNormalizationKernel(NEFuseBatchNormalizationKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NEFuseBatchNormalizationKernel &operator=(NEFuseBatchNormalizationKernel &&) = default;
-    /** Default destructor */
-    ~NEFuseBatchNormalizationKernel() = default;
-    /** Set the source, destination of the kernel
-     *
-     * @param[in]  input_weights Input weights tensor for convolution or depthwise convolution layer. Data type supported: F16/F32. Data layout supported: NCHW, NHWC
-     * @param[in]  bn_mean       Batch normalization layer mean tensor. Same as @p input_weights
-     * @param[in]  bn_var        Batch normalization layer variance tensor. Same as @p input_weights
-     * @param[out] fused_weights (Optional) Output fused weights tensor. It can be a nullptr in case of in-place computation. Same as @p input_weights
-     * @param[out] fused_bias    (Optional) Output fused bias tensor. It can be a nullptr in case of in-place computation and input_bias != nullptr. Same as @p input_weights
-     * @param[in]  input_bias    (Optional) Input bias tensor for convolution or depthwise convolution layer. It can be a nullptr in case the bias tensor is not required. Same as @p input_weights
-     * @param[in]  bn_beta       (Optional) Batch normalization layer beta tensor. It can be a nullptr in case the beta tensor is not required. Same as @p input_weights
-     *                           @note if nullptr, bn_beta is set to 0.0
-     * @param[in]  bn_gamma      (Optional) Batch normalization layer gamma tensor. It can be a nullptr in case the gamma tensor is not required. Same as @p input_weights
-     *                           @note if nullptr, bn_gamma is set to 1.0
-     * @param[in]  epsilon       (Optional) Batch normalization layer epsilon parameter. Defaults to 0.001f.
-     * @param[in]  fbn_type      (Optional) Fused batch normalization type. Defaults to CONVOLUTION.
-     */
-    void configure(const ITensor *input_weights, const ITensor *bn_mean, const ITensor *bn_var, ITensor *fused_weights, ITensor *fused_bias,
-                   const ITensor *input_bias = nullptr, const ITensor *bn_beta = nullptr, const ITensor *bn_gamma = nullptr,
-                   float epsilon = 0.001f, FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION);
-    /** Static function to check if given info will lead to a valid configuration of @ref NEFuseBatchNormalizationKernel
-     *
-     * @param[in] input_weights Input weights tensor info for convolution or depthwise convolution layer. Data type supported: F16/F32. Data layout supported: NCHW, NHWC
-     * @param[in] bn_mean       Batch normalization layer mean tensor info. Same as @p input_weights
-     * @param[in] bn_var        Batch normalization layer variance tensor info. Same as @p input_weights
-     * @param[in] fused_weights (Optional) Output fused weights tensor info. It can be a nullptr in case of in-place computation. Same as @p input_weights
-     * @param[in] fused_bias    (Optional) Output fused bias tensor info. It can be a nullptr in case of in-place computation and input_bias != nullptr. Same as @p input_weights
-     * @param[in] input_bias    (Optional) Input bias tensor info for convolution or depthwise convolution layer. It can be a nullptr in case the bias tensor is not required. Same as @p input_weights
-     * @param[in] bn_beta       (Optional) Batch normalization layer beta tensor info. It can be a nullptr in case the beta tensor is not required. Same as @p input_weights
-     *                          @note if nullptr, bn_beta is set to 0.0
-     * @param[in] bn_gamma      (Optional) Batch normalization layer gamma tensor info. It can be a nullptr in case the gamma tensor is not required. Same as @p input_weights
-     *                          @note if nullptr, bn_gamma is set to 1.0
-     * @param[in] epsilon       (Optional) Batch normalization layer epsilon parameter. Defaults to 0.001f.
-     * @param[in] fbn_type      (Optional) Fused batch normalization type. Defaults to CONVOLUTION.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var,
-                           const ITensorInfo *fused_weights, const ITensorInfo *fused_bias,
-                           const ITensorInfo *input_bias = nullptr, const ITensorInfo *bn_beta = nullptr, const ITensorInfo *bn_gamma = nullptr,
-                           float epsilon = 0.001f, FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-
-private:
-    const ITensor *_input_weights;
-    const ITensor *_input_bias;
-    const ITensor *_bn_mean;
-    const ITensor *_bn_var;
-    const ITensor *_bn_gamma;
-    const ITensor *_bn_beta;
-    ITensor       *_fused_weights;
-    ITensor       *_fused_bias;
-    float          _epsilon;
-    bool           _run_in_place_weights;
-    bool           _run_in_place_bias;
-
-    using FuseBatchNormFunction = void(const ITensor *input_weights, const ITensor *input_bias, ITensor *fused_weights, ITensor *fused_bias,
-                                       const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, float epsilon, const Window &window);
-
-    FuseBatchNormFunction *_func;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEFUSEBATCHNORMALIZATIONKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h b/arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h
deleted file mode 100644
index 6aa8e250a4..0000000000
--- a/arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h
+++ /dev/null
@@ -1,89 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEGEMMASSEMBLYBASE_H
-#define ARM_COMPUTE_NEGEMMASSEMBLYBASE_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Base class for GEMM NEON kernels implemented in Assembly. */
-class NEGEMMAssemblyBaseKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEGEMMAssemblyBaseKernel";
-    }
-    /** Constructor */
-    NEGEMMAssemblyBaseKernel()
-        : _input0(nullptr), _input1(nullptr), _output(nullptr), _workspace(nullptr), _alpha(1.f), _beta(0.f), _is_transposed_0(false), _is_transposed_1(false)
-    {
-    }
-
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEGEMMAssemblyBaseKernel(const NEGEMMAssemblyBaseKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEGEMMAssemblyBaseKernel &operator=(const NEGEMMAssemblyBaseKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NEGEMMAssemblyBaseKernel(NEGEMMAssemblyBaseKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NEGEMMAssemblyBaseKernel &operator=(NEGEMMAssemblyBaseKernel &&) = default;
-
-    virtual ~NEGEMMAssemblyBaseKernel() = default;
-
-    /** Initialise the kernel's input and output.
-     *
-     * The computed function is C = a * AxB + b * C.
-     *
-     * @param[in]     input0          Input tensor containing the Matrix A. Data types supported: F32
-     * @param[in]     input1          Input tensor containing the Matrix B. Data types supported: same as @p input0
-     * @param[in,out] output          Output tensor to store the result of matrix multiplication. If @p beta is not zero the values are multiplied by @p beta before the result is accumulated. Otherwise the values are overwritten by the result. Data types supported: same as @p input0.
-     * @param[out]    workspace       Space for intermediate results.
-     * @param[in]     alpha           Weight of the matrix product
-     * @param[in]     beta            Weight of the accumulation.
-     * @param[in]     is_transposed_0 (Optional)True if @p input0 is transposed else false. (Defaults to false)
-     * @param[in]     is_transposed_1 (Optional)True if @p input1 is transposed else false. (Defaults to false)
-     */
-    void configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha = 1.f, float beta = 0.f, bool is_transposed_0 = false, bool is_transposed_1 = false)
-    {
-        internal_configure(input0, input1, output, workspace, alpha, beta, is_transposed_0, is_transposed_1);
-    }
-
-protected:
-    virtual void internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool _is_transposed_0, bool _is_transposed_1) = 0;
-
-    const ITensor *_input0;
-    const ITensor *_input1;
-    ITensor       *_output;
-    ITensor       *_workspace;
-    float          _alpha;
-    float          _beta;
-    bool           _is_transposed_0;
-    bool           _is_transposed_1;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEGEMMASSEMBLYBASE_H*/
diff --git a/arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h b/arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h
deleted file mode 100644
index b6e6beab53..0000000000
--- a/arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h
+++ /dev/null
@@ -1,91 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEGEMMINTERLEAVE4x4KERNEL_H
-#define ARM_COMPUTE_NEGEMMINTERLEAVE4x4KERNEL_H
-
-#include "arm_compute/core/NEON/INESimpleKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** NEON kernel to interleave the elements of a matrix
- *
- * This function puts the values in a 4x4 block of Matrix A on the same row (Interleaved values)
- *
- * @f[
- * \left( \begin{array}{cccc}
- * a00 & a01 & a02 & a03 \\
- * a10 & a11 & a12 & a13 \\
- * a20 & a21 & a22 & a23 \\
- * a30 & a31 & a32 & a33 \\
- * \end{array} \right)
- * \rightarrow
- * \left( \begin{array}{ccccccccccccccccc}
- * a00 & a10 & a20 & a30 & a01 & a11 & a21 & a31 & a02 & a12 & a22 & a32 & a03 & a13 & a23 & a33 \\
- * \end{array} \right)
- * @f]
- *
- * After this operation, the output matrix will have the following shape: [ height * 4, ceil(width / 4.0f) ]
- */
-class NEGEMMInterleave4x4Kernel : public INESimpleKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEGEMMInterleave4x4Kernel";
-    }
-    /* Constructor */
-    NEGEMMInterleave4x4Kernel();
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  input  Input tensor. Data types supported: All
-     * @param[out] output Output tensor which stores the interleaved matrix. Data type supported: same as @p input.
-     */
-    void configure(const ITensor *input, ITensor *output);
-    /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMInterleave4x4Kernel
-     *
-     * @param[in] input  Input tensor info. Data types supported: All
-     * @param[in] output Output tensor info which stores the interleaved matrix. Data type supported: same as @p input.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-
-private:
-    /** Common signature for all the transpose functions
-     *
-     * @param[in]  input  An input tensor. Data types supported: All
-     * @param[out] output The output tensor. Data type supported: same as @p input
-     * @param[in]  window Region on which to execute the kernel.
-     */
-    using GEMMInterleaveFunction = void(const ITensor *input, ITensor *output, const Window &window);
-
-    GEMMInterleaveFunction *_func; /**< GEMM interleave function to use for the particular tensor types passed to configure() */
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEGEMMINTERLEAVE4x4KERNEL_H*/
diff --git a/arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h b/arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h
deleted file mode 100644
index 8f47c5089d..0000000000
--- a/arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h
+++ /dev/null
@@ -1,90 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYKERNEL_H
-#define ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** NEON kernel to multiply matrices
- *
- * @note @ref NEGEMMLowpMatrixMultiplyKernel low precision matrix product kernel
- *  This kernel performs the following computation:
- *
- *  -# Convert a values from int8 to int32
- *  -# Convert b values from int8 to int32
- *  -# Compute the int32 matrix product of the resulting a * b and store the result as int32
- *
- */
-class NEGEMMLowpMatrixMultiplyKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEGEMMLowpMatrixMultiplyKernel";
-    }
-    /** Constructor */
-    NEGEMMLowpMatrixMultiplyKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers)*/
-    NEGEMMLowpMatrixMultiplyKernel(const NEGEMMLowpMatrixMultiplyKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers)*/
-    NEGEMMLowpMatrixMultiplyKernel &operator=(const NEGEMMLowpMatrixMultiplyKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NEGEMMLowpMatrixMultiplyKernel(NEGEMMLowpMatrixMultiplyKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NEGEMMLowpMatrixMultiplyKernel &operator=(NEGEMMLowpMatrixMultiplyKernel &&) = default;
-    /** Initialise the kernel's input and output.
-     *
-     * The input matrices @p input0 and @p input1 must be the output of the kernels: @ref NEGEMMInterleave4x4Kernel and @ref NEGEMMTranspose1xWKernel. These two
-     * kernels change the layout of the original matrices to be more cache-friendly.
-     *
-     * @param[in]  input0 Input tensor containing the interleaved Matrix A. Data type supported: U8/QASYMM8/S8/QASYMM8_SIGNED
-     * @param[in]  input1 Input tensor containing the transposed1xW Matrix B. Data type supported: U8/QASYMM8/S8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL
-     * @param[out] output Output tensor to store the result of matrix multiplication. Data type supported: S32
-     */
-    void configure(const ITensor *input0, const ITensor *input1, ITensor *output);
-    /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpMatrixMultiplyKernel
-     *
-     * @param[in] input0 Input tensor info containing the interleaved Matrix A. Data type supported: U8/QASYMM8/S8/QASYMM8_SIGNED
-     * @param[in] input1 Input tensor info containing the transposed Matrix B. Data type supported: U8/QASYMM8/S8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL
-     * @param[in] output Output tensor info to store the result of matrix multiplication. Data type supported: S32
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-
-private:
-    const ITensor *_input0;
-    const ITensor *_input1;
-    ITensor       *_output;
-    bool           _slide_matrix_b;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYKERNEL_H*/
diff --git a/arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h b/arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h
deleted file mode 100644
index b069e4cfac..0000000000
--- a/arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h
+++ /dev/null
@@ -1,103 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEGEMMLOWPOFFSETCONTRIBUTIONKERNEL_H
-#define ARM_COMPUTE_NEGEMMLOWPOFFSETCONTRIBUTIONKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** NEON kernel used to add the offset contribution after @ref NEGEMMLowpMatrixMultiplyKernel. The computation is performed in-place
- *
- * This kernel takes a final int32 accumulator value (the output of @ref NEGEMMLowpMatrixMultiplyKernel),
- * and adds to it the offset contribution of matrix A and matrix B in-place.
- *
- * The final result is:
- *
- * mm_result[i][k] = mm_result[i][k] +
- *                   (vector_sum_col[k] * a_offset) +
- *                   (vector_sum_row[i] * b_offset) +
- *                   (a_offset * b_offset * k)
- *
- */
-class NEGEMMLowpOffsetContributionKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEGEMMLowpOffsetContributionKernel";
-    }
-    /** Constructor */
-    NEGEMMLowpOffsetContributionKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers)*/
-    NEGEMMLowpOffsetContributionKernel(const NEGEMMLowpOffsetContributionKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers)*/
-    NEGEMMLowpOffsetContributionKernel &operator=(const NEGEMMLowpOffsetContributionKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NEGEMMLowpOffsetContributionKernel(NEGEMMLowpOffsetContributionKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NEGEMMLowpOffsetContributionKernel &operator=(NEGEMMLowpOffsetContributionKernel &&) = default;
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in, out] mm_result      Input tensor containing the result of @ref NEGEMMLowpMatrixMultiplyKernel. Data type supported: S32
-     * @param[in]      vector_sum_col Input row-vector of sums of all the entries in each column of matrix B.
-     *                                Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result
-     * @param[in]      vector_sum_row Input row-vector of sums of all the entries in each row of matrix A.
-     *                                Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result
-     * @param[in]      k              Number of matrix A columns or Matrix B rows
-     * @param[in]      a_offset       Offset to be added to each element of the matrix A.
-     * @param[in]      b_offset       Offset to be added to each element of the matrix B.
-     */
-    void configure(ITensor *mm_result, const ITensor *vector_sum_col, const ITensor *vector_sum_row, int32_t k, int32_t a_offset, int32_t b_offset);
-    /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpOffsetContributionKernel
-     *
-     * @param[in] mm_result      Input tensor containing the result of @ref NEGEMMLowpMatrixMultiplyKernel. Data type supported: S32
-     * @param[in] vector_sum_col Input row-vector of sums of all the entries in each column of matrix B.
-     *                           Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result
-     * @param[in] vector_sum_row Input row-vector of sums of all the entries in each row of matrix A.
-     *                           Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result
-     * @param[in] a_offset       Offset to be added to each element of the matrix A.
-     * @param[in] b_offset       Offset to be added to each element of the matrix B.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, int32_t a_offset, int32_t b_offset);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-
-private:
-    const ITensor *_vector_sum_col;
-    const ITensor *_vector_sum_row;
-    ITensor       *_mm_result;
-    int32_t        _a_offset;
-    int32_t        _b_offset;
-    int32_t        _k_offset;
-    bool           _slide_vector_sum_col;
-};
-} // namespace arm_compute
-
-#endif /* ARM_COMPUTE_NEGEMMLOWPOFFSETCONTRIBUTIONKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h b/arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h
deleted file mode 100644
index 0dc64c9842..0000000000
--- a/arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h
+++ /dev/null
@@ -1,137 +0,0 @@
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEGEMMLOWPOFFSETCONTRIBUTIONOUTPUTSTAGEKERNEL_H
-#define ARM_COMPUTE_NEGEMMLOWPOFFSETCONTRIBUTIONOUTPUTSTAGEKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** NEON kernel used to add the offset contribution and perform the output stage after @ref NEGEMMLowpMatrixMultiplyKernel.
- *
- * The computation is performed in-place
- *
- * This kernel takes a final int32 accumulator value (the output of @ref NEGEMMLowpMatrixMultiplyKernel),
- * and adds to it the offset contribution of matrix A and matrix B in-place.
- *
- * The output stage can perform either QuantizeDownInt32ToUint8Scale or QuantizeDownInt32ToUint8ScaleByFixedPoint for Uint8.
- * The output stage can perform either QuantizeDownInt32ToInt8Scale or QuantizeDownInt32ToInt8ScaleByFixedPoint for Int8.
- *
- * For QuantizeDownInt32ToUint8Scale/QuantizeDownInt32ToInt8Scale the final result is:
- *
- * ((mm_result'[i][k] + result_offset) * result_mult_int) >> result_shift
- *
- * For QuantizeDownInt32ToUint8ScaleByFixedPoint/QuantizeDownInt32ToInt8ScaleByFixedPoint the final result is:
- *
- * (FixedPointMul(mm_result'[i][k], result_fixedpoint_multiplier) >> result_shift) + result_offset_after_shift
- *
- * where FixedPointMul(x, y) is the nearest integer to the following
- * mathematical expression, evaluated without overflow or intermediate rounding:
- *
- * (x * y) / 2^31
- *
- * and mm_result'[i][k] = mm_result[i][k] +
- *                        (vector_sum_col[k] * a_offset) +
- *                        (vector_sum_row[i] * b_offset) +
- *                        (a_offset * b_offset * k)
- */
-
-class NEGEMMLowpOffsetContributionOutputStageKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEGEMMLowpOffsetContributionOutputStageKernel";
-    }
-    /** Constructor */
-    NEGEMMLowpOffsetContributionOutputStageKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers)*/
-    NEGEMMLowpOffsetContributionOutputStageKernel(const NEGEMMLowpOffsetContributionOutputStageKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers)*/
-    NEGEMMLowpOffsetContributionOutputStageKernel &operator=(const NEGEMMLowpOffsetContributionOutputStageKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NEGEMMLowpOffsetContributionOutputStageKernel(NEGEMMLowpOffsetContributionOutputStageKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NEGEMMLowpOffsetContributionOutputStageKernel &operator=(NEGEMMLowpOffsetContributionOutputStageKernel &&) = default;
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  mm_result      Input tensor containing the result of @ref NEGEMMLowpMatrixMultiplyKernel. Data type supported: S32
-     * @param[in]  vector_sum_col Input row-vector of sums of all the entries in each column of matrix B.
-     *                            Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result
-     * @param[in]  vector_sum_row Input row-vector of sums of all the entries in each row of matrix A.
-     * @param[in]  bias           Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
-     *                            Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p mm_result.
-     * @param[out] output         Output tensor containing the final quantized result. Data type supported: QASYMM8/QASYMM8_SIGNED
-     * @param[in]  k              Number of matrix A columns or Matrix B rows
-     * @param[in]  a_offset       Offset to be added to each element of the matrix A.
-     * @param[in]  b_offset       Offset to be added to each element of the matrix B.
-     * @param[in]  output_stage   GEMMLowp output stage info, providing the type of quantization and the necessary parameters.
-     */
-    void configure(const ITensor *mm_result, const ITensor *vector_sum_col, const ITensor *vector_sum_row, const ITensor *bias, ITensor *output, int32_t k, int32_t a_offset, int32_t b_offset,
-                   GEMMLowpOutputStageInfo output_stage);
-    /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpOffsetContributionOutputStageKernel
-     *
-     * @param[in] mm_result      Input tensor info containing the result of @ref NEGEMMLowpMatrixMultiplyKernel. Data type supported: S32
-     * @param[in] vector_sum_col Tensor info for the input row-vector of sums of all the entries in each column of matrix B.
-     *                           Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result
-     * @param[in] vector_sum_row Tensor info for the input row-vector of sums of all the entries in each row of matrix A.
-     *                           Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result
-     * @param[in] bias           Biases tensor info. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
-     *                           Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p mm_result.
-     * @param[in] output         Output tensor info containing the final quantized result. Data type supported: QASYMM8/QASYMM8_SIGNED
-     * @param[in] a_offset       Offset to be added to each element of the matrix A.
-     * @param[in] b_offset       Offset to be added to each element of the matrix B.
-     * @param[in] output_stage   GEMMLowp output stage info, providing the type of quantization and the necessary parameters.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, const ITensorInfo *output, int32_t a_offset,
-                           int32_t                 b_offset,
-                           GEMMLowpOutputStageInfo output_stage);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-
-    using NEGEMMLowpOffsetContributionOutputStageFunction = std::function<void(const Window, const ITensor *, const ITensor *, const ITensor *, const ITensor *,
-                                                                               ITensor *, int32_t, int32_t, int32_t, bool, GEMMLowpOutputStageInfo)>;
-
-private:
-    /** Function to use for the particular tensors passed to configure() */
-    NEGEMMLowpOffsetContributionOutputStageFunction _function;
-    const ITensor                                  *_vector_sum_col;
-    const ITensor                                  *_vector_sum_row;
-    const ITensor                                  *_bias;
-    const ITensor                                  *_mm_result;
-    ITensor                                        *_output;
-    int32_t                                         _a_offset;
-    int32_t                                         _b_offset;
-    int32_t                                         _k_offset;
-    bool                                            _slide_vector_sum_col;
-    GEMMLowpOutputStageInfo                         _output_stage;
-};
-} // namespace arm_compute
-
-#endif /* ARM_COMPUTE_NEGEMMLOWPOFFSETCONTRIBUTIONOUTPUTSTAGEKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ScaleKernel.h b/arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ScaleKernel.h
deleted file mode 100644
index b4a1419c9b..0000000000
--- a/arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ScaleKernel.h
+++ /dev/null
@@ -1,112 +0,0 @@
-/*
- * Copyright (c) 2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEGEMMLOWPQUANTIZEDOWNINT32SCALEKERNEL_H
-#define ARM_COMPUTE_NEGEMMLOWPQUANTIZEDOWNINT32SCALEKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** NEON kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8/QASYMM8_SIGNED
- *
- * This kernel takes a final int32 accumulator value (the output of @ref NEGEMMLowpMatrixMultiplyKernel), and processes it to obtain the final QASYMM8/QASYMM8_SIGNED value.
- * The following computations will be performed by the kernel:
- *
- *  -# Add offset terms to final result
- *  -# Multiply each entry of result by result_mult_int
- *  -# Add bias to final result if bias tensor is not a nullptr
- *  -# Shift the int32 accumulator by result_shift
- *  -# Clamp the value between the specified min and max bounds
- *  -# Clamp the resulting int32 values:
- *  -#  -to the [0..255] range and cast to QASYMM8.
- *  -#  -to the [-128..127] range and cast to QASYMM8_SIGNED.
- *
- */
-class NEGEMMLowpQuantizeDownInt32ScaleKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEGEMMLowpQuantizeDownInt32ScaleKernel";
-    }
-    /** Constructor */
-    NEGEMMLowpQuantizeDownInt32ScaleKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers)*/
-    NEGEMMLowpQuantizeDownInt32ScaleKernel(const NEGEMMLowpQuantizeDownInt32ScaleKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers)*/
-    NEGEMMLowpQuantizeDownInt32ScaleKernel &operator=(const NEGEMMLowpQuantizeDownInt32ScaleKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NEGEMMLowpQuantizeDownInt32ScaleKernel(NEGEMMLowpQuantizeDownInt32ScaleKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NEGEMMLowpQuantizeDownInt32ScaleKernel &operator=(NEGEMMLowpQuantizeDownInt32ScaleKernel &&) = default;
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  input        Input tensor. Data type supported: S32
-     * @param[in]  bias         Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
-     *                          Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
-     * @param[out] output       Output tensor. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED
-     * @param[out] output_stage GEMMLowp output stage metadata.
-     */
-    void configure(const ITensor *input, const ITensor *bias, ITensor *output, const GEMMLowpOutputStageInfo *output_stage);
-    /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpQuantizeDownInt32ScaleKernel
-     *
-     * @param[in]  input        Input tensor. Data type supported: S32
-     * @param[in]  bias         Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
-     *                          Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
-     * @param[in]  output       Output tensor. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED
-     * @param[out] output_stage GEMMLowp output stage metadata.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo *output_stage);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-
-private:
-    /** Template function to run the NEGEMMLowpQuantizeDownInt32ScaleKernel
-     *
-     * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
-     */
-    template <typename T>
-    void run(const Window &window);
-
-    /** Common signature for all the specialised NEGEMMLowpQuantizeDownInt32ScaleKernel functions
-     *
-     * @param[in] window Region on which to execute the kernel.
-     */
-    using QuantizeDownFunctionPtr = void (NEGEMMLowpQuantizeDownInt32ScaleKernel::*)(const Window &window);
-
-    QuantizeDownFunctionPtr        _func;
-    const ITensor                 *_input;
-    const ITensor                 *_bias;
-    ITensor                       *_output;
-    const GEMMLowpOutputStageInfo *_output_stage;
-    bool                           _is_bounded_relu;
-};
-} // namespace arm_compute
-
-#endif /* ARM_COMPUTE_NEGEMMLOWPQUANTIZEDOWNINT32SCALEKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h b/arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h
deleted file mode 100644
index 0806bd1df5..0000000000
--- a/arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h
+++ /dev/null
@@ -1,116 +0,0 @@
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEGEMMLOWPQUANTIZEDOWNINT32TOINT16SCALEBYFIXEDPOINTKERNEL_H
-#define ARM_COMPUTE_NEGEMMLOWPQUANTIZEDOWNINT32TOINT16SCALEBYFIXEDPOINTKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** NEON kernel used to quantize down the int32 accumulator values of GEMMLowp to QSYMM16
- *
- * This kernel takes a final int32 accumulator value (the output of @ref NEGEMMLowpMatrixMultiplyKernel), and processes it to obtain the final QSYMM16 value.
- * The following computations will be performed by the kernel:
- *
- *  -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier
- *  -# Add bias to final result if bias tensor is not a nullptr
- *  -# Round to nearest division by a power-of-two using result_shift
- *  -# Clamp the value between the specified min and max bounds
- *  -# Clamp the resulting int32 values to the [-32768, 32767] range and cast to QSYMM16.
- *
- */
-class NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel";
-    }
-    /** Constructor */
-    NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers)*/
-    NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel(const NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers)*/
-    NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel &operator=(const NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel(NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel &operator=(NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel &&) = default;
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  input                        Input tensor. Data type supported: S32
-     * @param[in]  bias                         Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
-     *                                          Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
-     * @param[out] output                       Output tensor. Data type supported: Data type supported: QSYMM16
-     * @param[in]  result_fixedpoint_multiplier Fixed point value to be multiplied to each element of the input matrix when once the result_offset has been add
-     * @param[in]  result_shift                 Integer value used to round to nearest division by a power-of-two the result after the fixed point multiplication
-     * @param[in]  min                          (Optional) Min value used to saturate down the output result before converting back to QSYMM16. Defaults to 0.
-     * @param[in]  max                          (Optional) Max value used to saturate up the output result before converting back to QSYMM16.
-     *                                          Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to 0.
-     */
-    void configure(const ITensor *input, const ITensor *bias, ITensor *output, int result_fixedpoint_multiplier, int result_shift, int min = 0, int max = 0);
-    /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel
-     *
-     * @param[in] input  Input tensor info. Data type supported: S32
-     * @param[in] bias   Biases tensor info. Only shared biases supported and it can be a nullptr if the biases addition is not required.
-     *                   Biases are 1D tensor info with dimensions [OFM]. Data type supported: Same as @p input.
-     * @param[in] output Output tensor info. Data type supported: Data type supported: QSYMM16
-     * @param[in] min    (Optional) Min value used to saturate down the output result before converting back to QSYMM16. Defaults to 0.
-     * @param[in] max    (Optional) Max value used to saturate up the output result before converting back to QSYMM16,
-     *                            Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to 0.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min = 0, int max = 0);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-
-private:
-    /** Template function to run the NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel
-     *
-     * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
-     */
-    template <bool is_bounded_relu>
-    void run(const Window &window);
-
-    /** Common signature for all the specialised NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel functions
-     *
-     * @param[in] window Region on which to execute the kernel.
-     */
-    using QuantizeDownFunctionPtr = void (NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::*)(const Window &window);
-
-    QuantizeDownFunctionPtr _func;
-    const ITensor          *_input;
-    const ITensor          *_bias;
-    ITensor                *_output;
-    int                     _result_fixedpoint_multiplier;
-    int                     _result_shift;
-    int                     _min;
-    int                     _max;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NEGEMMLOWPQUANTIZEDOWNINT32TOINT16SCALEBYFIXEDPOINTKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h b/arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h
deleted file mode 100644
index 2b3657c728..0000000000
--- a/arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h
+++ /dev/null
@@ -1,119 +0,0 @@
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEGEMMLOWPQUANTIZEDOWNINT32TOINT8SCALEBYFIXEDPOINTKERNEL_H
-#define ARM_COMPUTE_NEGEMMLOWPQUANTIZEDOWNINT32TOINT8SCALEBYFIXEDPOINTKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** NEON kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8_SIGNED
- *
- * This kernel takes a final int32 accumulator value (the output of @ref NEGEMMLowpMatrixMultiplyKernel), and processes it to obtain the final QASYMM8_SIGNED value.
- * The following computations will be performed by the kernel:
- *
- *  -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier
- *  -# Add bias to final result if bias tensor is not a nullptr
- *  -# Round to nearest division by a power-of-two using result_shift
- *  -# Add offset to each result
- *  -# Clamp the value between the specified min and max bounds
- *  -# Clamp the resulting int32 values to the [-128..127] range and cast to QASYMM8_SIGNED.
- *
- */
-class NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel";
-    }
-    /** Constructor */
-    NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers)*/
-    NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel(const NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers)*/
-    NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel &operator=(const NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel(NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel &operator=(NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel &&) = default;
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  input                        Input tensor. Data type supported: S32
-     * @param[in]  bias                         Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
-     *                                          Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
-     * @param[out] output                       Output tensor. Data type supported: Data type supported: QASYMM8_SIGNED
-     * @param[in]  result_fixedpoint_multiplier Fixed point value to be multiplied to each element of the input matrix when once the result_offset has been add
-     * @param[in]  result_shift                 Integer value used to round to nearest division by a power-of-two the result after the fixed point multiplication
-     * @param[in]  result_offset_after_shift    Offset to be applied to result before converting it back to QASYMM8_SIGNED
-     * @param[in]  min                          (Optional) Min value used to saturate down the output result before converting back to QASYMM8_SIGNED
-     * @param[in]  max                          (Optional) Max value used to saturate up the output result before converting back to QASYMM8_SIGNED,
-     *                                          Along with @p min, this value can be used to implement "rectified linear unit" activation functions
-     */
-    void configure(const ITensor *input, const ITensor *bias, ITensor *output, int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift, int min = 0, int max = 0);
-    /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel
-     *
-     * @param[in] input  Input tensor. Data type supported: S32
-     * @param[in] bias   Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
-     *                   Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
-     * @param[in] output Output tensor. Data type supported: Data type supported: QASYMM8_SIGNED
-     * @param[in] min    (Optional) Min value used to saturate down the output result before converting back to QASYMM8_SIGNED
-     * @param[in] max    (Optional) Max value used to saturate up the output result before converting back to QASYMM8_SIGNED,
-     *                            Along with @p min, this value can be used to implement "rectified linear unit" activation functions
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min = 0, int max = 0);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-
-private:
-    /** Template function to run the NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel
-     *
-     * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
-     */
-    template <bool is_bounded_relu>
-    void run(const Window &window);
-
-    /** Common signature for all the specialised NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel functions
-     *
-     * @param[in] window Region on which to execute the kernel.
-     */
-    using QuantizeDownFunctionPtr = void (NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::*)(const Window &window);
-
-    QuantizeDownFunctionPtr _func;
-    const ITensor          *_input;
-    const ITensor          *_bias;
-    ITensor                *_output;
-    int                     _result_fixedpoint_multiplier;
-    int                     _result_shift;
-    int                     _result_offset_after_shift;
-    int                     _min;
-    int                     _max;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NEGEMMLOWPQUANTIZEDOWNINT32TOINT8SCALEBYFIXEDPOINTKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h b/arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h
deleted file mode 100644
index 2f099a3ebb..0000000000
--- a/arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h
+++ /dev/null
@@ -1,119 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEGEMMLOWPQUANTIZEDOWNINT32TOUINT8SCALEBYFIXEDPOINTKERNEL_H
-#define ARM_COMPUTE_NEGEMMLOWPQUANTIZEDOWNINT32TOUINT8SCALEBYFIXEDPOINTKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** NEON kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8
- *
- * This kernel takes a final int32 accumulator value (the output of @ref NEGEMMLowpMatrixMultiplyKernel), and processes it to obtain the final QASYMM8 value.
- * The following computations will be performed by the kernel:
- *
- *  -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier
- *  -# Add bias to final result if bias tensor is not a nullptr
- *  -# Round to nearest division by a power-of-two using result_shift
- *  -# Add offset to each result
- *  -# Clamp the value between the specified min and max bounds
- *  -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8.
- *
- */
-class NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel";
-    }
-    /** Constructor */
-    NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers)*/
-    NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel(const NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers)*/
-    NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel &operator=(const NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel(NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel &operator=(NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel &&) = default;
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  input                        Input tensor. Data type supported: S32
-     * @param[in]  bias                         Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
-     *                                          Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
-     * @param[out] output                       Output tensor. Data type supported: Data type supported: QASYMM8
-     * @param[in]  result_fixedpoint_multiplier Fixed point value to be multiplied to each element of the input matrix when once the result_offset has been add
-     * @param[in]  result_shift                 Integer value used to round to nearest division by a power-of-two the result after the fixed point multiplication
-     * @param[in]  result_offset_after_shift    Offset to be applied to result before converting it back to QASYMM8
-     * @param[in]  min                          (Optional) Min value used to saturate down the output result before converting back to QASYMM8
-     * @param[in]  max                          (Optional) Max value used to saturate up the output result before converting back to QASYMM8,
-     *                                          Along with @p min, this value can be used to implement "rectified linear unit" activation functions
-     */
-    void configure(const ITensor *input, const ITensor *bias, ITensor *output, int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift, int min = 0, int max = 0);
-    /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel
-     *
-     * @param[in] input  Input tensor. Data type supported: S32
-     * @param[in] bias   Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
-     *                   Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
-     * @param[in] output Output tensor. Data type supported: Data type supported: QASYMM8
-     * @param[in] min    (Optional) Min value used to saturate down the output result before converting back to QASYMM8
-     * @param[in] max    (Optional) Max value used to saturate up the output result before converting back to QASYMM8,
-     *                            Along with @p min, this value can be used to implement "rectified linear unit" activation functions
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min = 0, int max = 0);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-
-private:
-    /** Template function to run the NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel
-     *
-     * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
-     */
-    template <bool is_bounded_relu>
-    void run(const Window &window);
-
-    /** Common signature for all the specialised NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel functions
-     *
-     * @param[in] window Region on which to execute the kernel.
-     */
-    using QuantizeDownFunctionPtr = void (NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::*)(const Window &window);
-
-    QuantizeDownFunctionPtr _func;
-    const ITensor          *_input;
-    const ITensor          *_bias;
-    ITensor                *_output;
-    int                     _result_fixedpoint_multiplier;
-    int                     _result_shift;
-    int                     _result_offset_after_shift;
-    int                     _min;
-    int                     _max;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NEGEMMLOWPQUANTIZEDOWNINT32TOUINT8SCALEBYFIXEDPOINTKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h b/arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h
deleted file mode 100644
index 1e472f5252..0000000000
--- a/arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h
+++ /dev/null
@@ -1,171 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEGEMMLOWREDUCTIONKERNEL_H
-#define ARM_COMPUTE_NEGEMMLOWREDUCTIONKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-// Forward declarations
-class ITensor;
-struct GEMMLowpReductionKernelInfo;
-
-/** Common interface for all NEON reduction kernels */
-class INEGEMMLowpReductionKernel : public INEKernel
-{
-public:
-    /** Constructor */
-    INEGEMMLowpReductionKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers)*/
-    INEGEMMLowpReductionKernel(const INEGEMMLowpReductionKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers)*/
-    INEGEMMLowpReductionKernel &operator=(const INEGEMMLowpReductionKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    INEGEMMLowpReductionKernel(INEGEMMLowpReductionKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    INEGEMMLowpReductionKernel &operator=(INEGEMMLowpReductionKernel &&) = default;
-
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  input  Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED
-     * @param[out] output Output row-vector of sums of all the entries in each row/col of input tensor. Data type supported: S32
-     * @param[in]  info   Kernel metadata:
-     *                    - k            Number of matrix columns/rows depending on the type of reduction.
-     *                    - is_reshaped  True if the matrix has been reshaped.
-     *                    - scalar       Scalar value to multiply each reduced column/row by.
-     *                    - mul_byscalar True if each reduced column/row must be multiplied by a scalar value.
-     */
-    virtual void configure(const ITensor *input, ITensor *output, const GEMMLowpReductionKernelInfo &info) = 0;
-
-protected:
-    const ITensor *_input;
-    ITensor       *_output;
-    int32_t        _k;
-    bool           _is_reshaped;
-    int32_t        _scalar;
-    bool           _mul_by_scalar;
-};
-
-/** NEON kernel used to compute the row-vectors of sums of all the entries in each row of Matrix A.
- *
- * @note This stage is needed to handle the offset of matrix product
- *       https://github.com/google/gemmlowp/blob/master/doc/low-precision.md
- */
-class NEGEMMLowpMatrixAReductionKernel : public INEGEMMLowpReductionKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEGEMMLowpMatrixAReductionKernel";
-    }
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  mtx_a          Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED
-     * @param[out] vector_sum_row Output row-vector of sums of all the entries in each row of mtx_a. Data type supported: S32
-     * @param[in]  info           Kernel metadata:
-     *                            - k            (num_mtx_a_cols) Number of matrix A columns
-     *                            - is_reshaped  (is_interleaved4x4) True if the matrix A has been interleaved4x4
-     *                            - scalar       Scalar value to multiply each reduced row by.
-     *                            - mul_byscalar True if each reduced column must be multiplied by a scalar value.
-     */
-    void configure(const ITensor *mtx_a, ITensor *vector_sum_row, const GEMMLowpReductionKernelInfo &info) override;
-    /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpMatrixAReductionKernel
-     *
-     * @param[in] mtx_a          Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED
-     * @param[in] vector_sum_row Output row-vector of sums of all the entries in each row of mtx_a. Data type supported: S32
-     * @param[in] info           Kernel metadata:
-     *                           - k            (num_mtx_a_cols) Number of matrix A columns
-     *                           - is_reshaped  (is_interleaved4x4) True if the matrix A has been interleaved4x4
-     *                           - scalar       Scalar value to multiply each reduced row by.
-     *                           - mul_byscalar True if each reduced column must be multiplied by a scalar value.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *mtx_a, const ITensorInfo *vector_sum_row, const GEMMLowpReductionKernelInfo &info);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-
-private:
-    /** Execution of the reduction kernel specialized on the input type
-     *
-     * @param[in] window Execution window
-     */
-    template <typename T>
-    void run_internal(const Window &window);
-};
-
-/** NEON kernel used to compute the row-vectors of sums of all the entries in each column of Matrix B.
- *
- * @note This stage is needed to handle the offset of matrix product
- *       https://github.com/google/gemmlowp/blob/master/doc/low-precision.md
- */
-class NEGEMMLowpMatrixBReductionKernel : public INEGEMMLowpReductionKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEGEMMLowpMatrixBReductionKernel";
-    }
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  mtx_b          Input tensor. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED
-     * @param[out] vector_sum_col Output row-vector of sums of all the entries in each column of mtx_b. Data type supported: S32
-     * @param[in]  info           Kernel metadata:
-     *                            - k            (num_mtx_b_rows) Number of matrix B rows.
-     *                            - is_reshaped  (is_transposed1xW) True if the input tensor is transposed 1xW.
-     *                            - scalar       Scalar value to multiply each reduced row by.
-     *                            - mul_byscalar True if each reduced row must be multiplied by a scalar value.
-     */
-    void configure(const ITensor *mtx_b, ITensor *vector_sum_col, const GEMMLowpReductionKernelInfo &info) override;
-    /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpMatrixBReductionKernel
-     *
-     * @param[in] mtx_b          Input tensor. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED
-     * @param[in] vector_sum_col Output row-vector of sums of all the entries in each column of mtx_b. Data type supported: S32
-     * @param[in] info           Kernel metadata:
-     *                           - k            (num_mtx_b_rows) Number of matrix B rows.
-     *                           - is_reshaped  (is_transposed1xW) True if the input tensor is transposed 1xW.
-     *                           - scalar       Scalar value to multiply each reduced row by.
-     *                           - mul_byscalar True if each reduced row must be multiplied by a scalar value.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *mtx_b, const ITensorInfo *vector_sum_col, const GEMMLowpReductionKernelInfo &info);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-
-private:
-    /** Execution of the reduction kernel specialized on the input type
-     *
-     * @param[in] window Execution window
-     * @param[in] info   Thread-related information
-     */
-    template <typename T>
-    void run_internal(const Window &window, const ThreadInfo &info);
-};
-} // namespace arm_compute
-
-#endif /* ARM_COMPUTE_NEGEMMLOWREDUCTIONKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h b/arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h
deleted file mode 100644
index a3ba57e4ab..0000000000
--- a/arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h
+++ /dev/null
@@ -1,75 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEGEMMMATRIXACCUMULATEBIASESKERNEL_H
-#define ARM_COMPUTE_NEGEMMMATRIXACCUMULATEBIASESKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-/** NEON kernel to add a bias to each row of the input tensor */
-class NEGEMMMatrixAccumulateBiasesKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEGEMMMatrixAccumulateBiasesKernel";
-    }
-    /** Default constructor */
-    NEGEMMMatrixAccumulateBiasesKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEGEMMMatrixAccumulateBiasesKernel(const NEGEMMMatrixAccumulateBiasesKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEGEMMMatrixAccumulateBiasesKernel &operator=(const NEGEMMMatrixAccumulateBiasesKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NEGEMMMatrixAccumulateBiasesKernel(NEGEMMMatrixAccumulateBiasesKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NEGEMMMatrixAccumulateBiasesKernel &operator=(NEGEMMMatrixAccumulateBiasesKernel &&) = default;
-    /** Default destructor */
-    ~NEGEMMMatrixAccumulateBiasesKernel() = default;
-    /** Set the accumulate buffer and the biases of the kernel.
-     *
-     * @param[in, out] accum  The accumulate tensor to convert. Data type supported: F32
-     * @param[in]      biases The shared biases tensor to append. It must be 1D Tensor. Data type supported: Same as @p input
-     */
-    void configure(ITensor *accum, const ITensor *biases);
-    /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMMatrixAccumulateBiasesKernel
-     *
-     * @param[in] accum  The accumulate tensor to convert. Data type supported: F32
-     * @param[in] biases The shared biases tensor to append. It must be 1D Tensor. Data type supported: Same as @p input
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *accum, const ITensorInfo *biases);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-
-private:
-    ITensor       *_accum;
-    const ITensor *_biases;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEGEMMMATRIXACCUMULATEBIASESKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h b/arm_compute/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h
deleted file mode 100644
index e528c59d8f..0000000000
--- a/arm_compute/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h
+++ /dev/null
@@ -1,96 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEGEMMMATRIXADDITIONKERNEL_H
-#define ARM_COMPUTE_NEGEMMMATRIXADDITIONKERNEL_H
-
-#include "arm_compute/core/NEON/INESimpleKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** NEON kernel to perform the in-place matrix addition between 2 matrices taking into account that the second matrix might be weighted by a scalar value beta:
- *
- * @note [ MTX_OUT = MTX_0 + beta * MTX_1 ] with MTX_0 and MTX_1 of the same size
- *
- * @note This stage is used to finalize the GEMM result and it is computed if and only if beta != 0.0. In case this kernel is used for finalizing GEMM result, we have:
- *        - MTX_0 = A * B * alpha, where MTX_0 is the output of @ref NEGEMMMatrixMultiplyKernel
- *        - MTX_1 = C
- */
-class NEGEMMMatrixAdditionKernel : public INESimpleKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEGEMMMatrixAdditionKernel";
-    }
-    /** Constructor */
-    NEGEMMMatrixAdditionKernel();
-    /** Prevent instances of this class from being copied */
-    NEGEMMMatrixAdditionKernel(const NEGEMMMatrixAdditionKernel &) = delete;
-    /** Prevent instances of this class from being copied */
-    NEGEMMMatrixAdditionKernel &operator=(const NEGEMMMatrixAdditionKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NEGEMMMatrixAdditionKernel(NEGEMMMatrixAdditionKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NEGEMMMatrixAdditionKernel &operator=(NEGEMMMatrixAdditionKernel &&) = default;
-    /** Initialise the kernel's input and output.
-     *
-     * @note The input and output tensor must have the same dimensions
-     *
-     * @param[in]      input  Input tensor (Matrix C). Data types supported: F16/F32
-     * @param[in, out] output Output tensor. If this kernel is used to finalize the GEMM result, output contains the result obtained by the kernel @ref NEGEMMMatrixMultiplyKernel. Data type supported: the same as @p input.
-     * @param[in]      beta   Weight of matrix C
-     */
-    void configure(const ITensor *input, ITensor *output, float beta);
-    /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMMatrixAdditionKernel.
-     *
-     * @note The input and output tensor must have the same dimensions
-     *
-     * @param[in] input  Input tensor info (Matrix C). Data types supported: F16/F32
-     * @param[in] output Output tensor info. If this kernel is used to finalize the GEMM result, output contains the result obtained by the kernel @ref NEGEMMMatrixMultiplyKernel. Data type supported: the same as @p input.
-     * @param[in] beta   Weight of matrix C
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, float beta);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-
-private:
-    /** Common signature for all the matrix addition functions
-     *
-     * @param[in]  input  An input tensor. Data types supported: F16/F32
-     * @param[out] output The output tensor. Data type supported: same as @p input
-     * @param[in]  window Region on which to execute the kernel.
-     * @param[in]  beta   Weight of matrix C
-     */
-    using MatrixAdditionFunction = void(const ITensor *input, ITensor *output, const Window &window, float beta);
-    /** Matrix addition function to use for the particular tensor types passed to configure() */
-    MatrixAdditionFunction *_func;
-    float                   _beta;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NEGEMMMATRIXADDITIONKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h b/arm_compute/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h
deleted file mode 100644
index 841e08d0ef..0000000000
--- a/arm_compute/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h
+++ /dev/null
@@ -1,94 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEGEMMMATRIXMULTIPLYKERNEL_H
-#define ARM_COMPUTE_NEGEMMMATRIXMULTIPLYKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** NEON kernel to multiply two input matrices "A" and "B". All elements of the output matrix/vector will be multiplied by alpha after the matrix multiplication
- *
- * @note If the output tensor is a matrix, the implementation assumes that the input tensors @p input0 and @p input1 are both matrices and reshaped respectively with @ref NEGEMMInterleave4x4Kernel" and @ref NEGEMMTranspose1xWKernel
- * @note If the output tensor is a vector and the data type is F32, the implementation assumes that the first input tensor @p input0 is a vector and the second input tensor @p input1 a matrix. The implementation also assumes that both tensors have not been reshaped
- *
- */
-class NEGEMMMatrixMultiplyKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEGEMMMatrixMultiplyKernel";
-    }
-    /** Constructor */
-    NEGEMMMatrixMultiplyKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEGEMMMatrixMultiplyKernel(const NEGEMMMatrixMultiplyKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEGEMMMatrixMultiplyKernel &operator=(const NEGEMMMatrixMultiplyKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NEGEMMMatrixMultiplyKernel(NEGEMMMatrixMultiplyKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NEGEMMMatrixMultiplyKernel &operator=(NEGEMMMatrixMultiplyKernel &&) = default;
-    /** Initialise the kernel's input and output.
-     *
-     * @note If the output tensor is a matrix, the input matrices @p input0 and @p input1 should be the output of the kernels: @ref NEGEMMInterleave4x4Kernel and @ref NEGEMMTranspose1xWKernel
-     *       These two kernels change the layout of the original matrices to be more cache-friendly.
-     *
-     * @param[in]  input0         Input tensor containing the interleaved Matrix A or the vector A. Data types supported: F16/F32
-     * @param[in]  input1         Input tensor containing the transposed Matrix B if the first input tensor A is not a vector.
-     *                            If the output tensor is a vector, input1 must contain the matrix B not reshaped. Data type supported: same as @p input0
-     * @param[out] output         Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0.
-     * @param[in]  alpha          Weight of the matrix product
-     * @param[in]  is_interleaved (Optional) True if input0 and input1 have been reshaped respectively using @ref NEGEMMInterleave4x4Kernel and @ref NEGEMMTranspose1xWKernel
-     * @param[in]  reshape_info   (Optional) GEMM reshape info. If is_interleaved_transposed = true, this object must contain the information to understand how the matrix A and matrix B have been reshaped
-     */
-    void configure(const ITensor *input0, const ITensor *input1, ITensor *output, float alpha, bool is_interleaved, const GEMMReshapeInfo &reshape_info = GEMMReshapeInfo());
-    /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMMatrixMultiplyKernel
-     *
-     * @param[in] input0         Input tensor containing the interleaved Matrix A or the vector A. Data types supported: F16/F32
-     * @param[in] input1         Input tensor containing the transposed Matrix B if the first input tensor A is not a vector.
-     *                           If the output tensor is a vector, input1 must contain the matrix B not reshaped. Data type supported: same as @p input0
-     * @param[in] output         Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0.
-     * @param[in] alpha          Weight of the matrix product
-     * @param[in] is_interleaved (Optional) True if input0 and input1 have been reshaped respectively using @ref NEGEMMInterleave4x4Kernel and @ref NEGEMMTranspose1xWKernel
-     * @param[in] reshape_info   (Optional) GEMM reshape info. If is_interleaved_transposed = true, this object must contain the information to understand how the matrix A and matrix B have been reshaped
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, float alpha, bool is_interleaved, const GEMMReshapeInfo &reshape_info);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-
-private:
-    const ITensor *_input0;
-    const ITensor *_input1;
-    ITensor       *_output;
-    float          _alpha;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEGEMMMATRIXMULTIPLYKERNEL_H*/
diff --git a/arm_compute/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.h b/arm_compute/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.h
deleted file mode 100644
index f5635dd58c..0000000000
--- a/arm_compute/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.h
+++ /dev/null
@@ -1,99 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEGEMMMATRIXVECTORMULTIPLYKERNEL_H_
-#define ARM_COMPUTE_NEGEMMMATRIXVECTORMULTIPLYKERNEL_H_
-
-#include "arm_compute/core/NEON/INESimpleKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for the GEMM matrix vector multiply kernel. **/
-class NEGEMMMatrixVectorMultiplyKernel : public INESimpleKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEGEMMMatrixVectorMultiplyKernel";
-    }
-    /** Default constructor */
-    NEGEMMMatrixVectorMultiplyKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEGEMMMatrixVectorMultiplyKernel(const NEGEMMMatrixVectorMultiplyKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEGEMMMatrixVectorMultiplyKernel &operator=(const NEGEMMMatrixVectorMultiplyKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NEGEMMMatrixVectorMultiplyKernel(NEGEMMMatrixVectorMultiplyKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NEGEMMMatrixVectorMultiplyKernel &operator=(NEGEMMMatrixVectorMultiplyKernel &&) = default;
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  input0 First Input tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32
-     * @param[in]  input1 Second Input tensor. Data types supported: same as @p input.
-     * @param[out] output Output tensor which stores the interleaved matrix. Data type supported: same as @p input, S32 for QASYMM8/QASYMM8_SIGNED input.
-     */
-    void configure(const ITensor *input0, const ITensor *input1, ITensor *output);
-
-    /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMMatrixVectorMultiplyKernel
-     *
-     * @param[in] input0 First Input tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32
-     * @param[in] input1 Second Input tensor. Data types supported: same as @p input.
-     * @param[in] output Output tensor which stores the interleaved matrix. Data type supported: same as @p input, S32 for QASYMM8/QASYMM8_SIGNED input.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-    BorderSize border_size() const override;
-
-private:
-    /** Template function to run the matrix vector multiplication
-     *
-     * @tparam I0 Input 0 type
-     * @tparam I1 Input 1 type
-     * @tparam O  Output type
-     *
-     * @param[in] window_in  Input region. (Must be a valid region of the window returned by window()).
-     * @param[in] window_w   Weights region. (Must be a valid region of the window returned by window()).
-     * @param[in] window_out Output region.(Must be a valid region of the window returned by window()).
-     */
-    template <typename I0, typename I1, typename O>
-    void matrix_vector_multiply(const Window &window_in, const Window &window_w, const Window &window_out);
-    /** Common signature for all the specialised matrix vector multiplication functions */
-    using GEMMMatrixVectorMultiplyFunctionPtr = void (NEGEMMMatrixVectorMultiplyKernel::*)(const Window &window_in,
-                                                                                           const Window &window_w,
-                                                                                           const Window &window_out);
-
-private:
-    GEMMMatrixVectorMultiplyFunctionPtr _func;
-    const ITensor                      *_input0;
-    const ITensor                      *_input1;
-    ITensor                            *_output;
-    BorderSize                          _border_size;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEGEMMMATRIXVECTORMULTIPLYKERNEL_H_*/
diff --git a/arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h b/arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h
deleted file mode 100644
index 967a1b73dc..0000000000
--- a/arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h
+++ /dev/null
@@ -1,95 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEGEMMTRANSPOSE1xWKERNEL_H
-#define ARM_COMPUTE_NEGEMMTRANSPOSE1xWKERNEL_H
-
-#include "arm_compute/core/NEON/INESimpleKernel.h"
-
-namespace arm_compute
-{
-// Forward declarations
-class ITensor;
-
-/** NEON kernel which transposes the elements of a matrix in chunks of 1xW, where W is equal to (16 / element size of the tensor)
- *
- * Following an example of how the transposition1xW works when the input data is F32
- *
- * @f[
- * \left( \begin{array}{cccc}
- * a00 & a01 & a02 & a03 \\
- * a10 & a11 & a12 & a13 \\
- * a20 & a21 & a22 & a23 \\
- * a30 & a31 & a32 & a33 \\
- * \end{array} \right)
- * \rightarrow
- * \left( \begin{array}{ccccccccccccccccc}
- * a00 & a01 & a02 & a03 & a10 & a11 & a12 & a13 & a20 & a21 & a22 & a23 & a30 & a31 & a32 & a33 \\
- * \end{array} \right)
- * @f]
- *
- * Following an example of how the transposition1xW works when the input data type is F16
- *
- * @f[
- * \left( \begin{array}{cccccccc}
- * a00 & a01 & a02 & a03 & a04 & a05 & a06 & a07 \\
- * a10 & a11 & a12 & a13 & a14 & a15 & a16 & a17 \\
- * a20 & a21 & a22 & a23 & a24 & a25 & a26 & a27 \\
- * a30 & a31 & a32 & a33 & a34 & a35 & a36 & a37 \\
- * \end{array} \right)
- * \rightarrow
- * \left( \begin{array}{cccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc}
- * a00 & a01 & a02 & a03 & a04 & a05 & a06 & a07 & a10 & a11 & a12 & a13 & a14 & a15 & a16 & a17 & a20 & a21 & a22 & a23 & a24 & a25 & a26 & a27 & a30 & a31 & a32 & a33 & a34 & a35 & a36 & a37\\
- * \end{array} \right)
- * @f]
- *
- * @note The output matrix will have the following shape: [ height * W, ceil(width / W) ], where W = (16 / element size of the tensor)
- *
- */
-class NEGEMMTranspose1xWKernel : public INESimpleKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEGEMMTranspose1xWKernel";
-    }
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  input  Input tensor. Data types supported: All
-     * @param[out] output Output tensor. Data type supported: same as @p input.
-     */
-    void configure(const ITensor *input, ITensor *output);
-    /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMTranspose1xWKernel
-     *
-     * @param[in] input  Input tensor info. Data types supported: All
-     * @param[in] output Output tensor info. Data type supported: same as @p input.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEGEMMTRANSPOSE1xWKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NEGatherKernel.h b/arm_compute/core/NEON/kernels/NEGatherKernel.h
deleted file mode 100644
index bfef40b53b..0000000000
--- a/arm_compute/core/NEON/kernels/NEGatherKernel.h
+++ /dev/null
@@ -1,113 +0,0 @@
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef ARM_COMPUTE_NEGATHERKERNEL_H
-#define ARM_COMPUTE_NEGATHERKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-// Forward declarations
-class ITensor;
-
-/** Kernel to perform other operation on NEON */
-class NEGatherKernel : public INEKernel
-{
-public:
-    /** Default constructor. */
-    NEGatherKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers). */
-    NEGatherKernel(const NEGatherKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers). */
-    NEGatherKernel &operator=(const NEGatherKernel &) = delete;
-    /** Allow instances of this class to be moved. */
-    NEGatherKernel(NEGatherKernel &&) = default;
-    /** Allow instances of this class to be moved. */
-    NEGatherKernel &operator=(NEGatherKernel &&) = default;
-    /** Default detructor */
-    ~NEGatherKernel() = default;
-
-    /** Name of the kernel
-     *
-     * @return Kernel name
-     */
-    const char *name() const override
-    {
-        return "NEGatherKernel";
-    }
-    /** Initialise the kernel's inputs and outputs
-     *
-     * @param[in]  input   Source tensor. Supported tensor rank: up to 4. Data type supported: All
-     * @param[in]  indices Indices tensor. Supported tensor rank: up to 1. Must be one of the following type: U32/S32. Each value Must be in range [0, input.shape[@p axis])
-     * @param[out] output  Destination tensor. Data type supported: Same as @p input
-     * @param[in]  axis    (Optional) The axis in @p input to gather @p indices from. Negative values wrap around. Defaults to 0
-     */
-    void configure(const ITensor *input, const ITensor *indices, ITensor *output, int axis = 0);
-    /** Static function to check if given info will lead to a valid configuration of @ref NEGatherKernel
-     *
-     * @param[in] input   Source tensor info. Supported tensor rank: up to 4. Data type supported: All
-     * @param[in] indices Indices tensor info. Supported tensor rank: up to 1. Must be one of the following type: U32/S32. Each value Must be in range [0, input.shape[@p axis])
-     * @param[in] output  Destination tensor info. Data type supported: Same as @p input
-     * @param[in] axis    (Optional) The axis in @p input to gather @p indices from. Negative values wrap around. Defaults to 0
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, int axis);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-
-private:
-    /** Implementation of the gather operation for 0 axis.
-     *
-     * For gather on the 0 axis an element by element copy is performed.
-     *
-     * @param[in] window Region on which to execute the kernel. (Must be a region of the window returned by window())
-     * @param[in] info   Info about executing thread and CPU.
-     */
-    template <typename U>
-    void gather_0_axis(const Window &window, const ThreadInfo &info);
-
-    /** Implementation of the gather operation.
-     *
-     * For 1<=axis a row-wise copy is taking place.
-     *
-     * @param[in] window Region on which to execute the kernel. (Must be a region of the window returned by window())
-     * @param[in] info   Info about executing thread and CPU.
-     */
-    template <typename U>
-    void gather_n_axis(const Window &window, const ThreadInfo &info);
-
-    using kernel_ptr = void (NEGatherKernel::*)(const Window &window, const ThreadInfo &info);
-
-    const ITensor *_input;
-    const ITensor *_indices;
-    int            _axis;
-    ITensor       *_output;
-    kernel_ptr     _func;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NEGATHERKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NEGaussian3x3Kernel.h b/arm_compute/core/NEON/kernels/NEGaussian3x3Kernel.h
deleted file mode 100644
index fa92eef1b7..0000000000
--- a/arm_compute/core/NEON/kernels/NEGaussian3x3Kernel.h
+++ /dev/null
@@ -1,54 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEGAUSSIAN3x3KERNEL_H
-#define ARM_COMPUTE_NEGAUSSIAN3x3KERNEL_H
-
-#include "arm_compute/core/NEON/INESimpleKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** NEON kernel to perform a Gaussian 3x3 filter */
-class NEGaussian3x3Kernel : public INESimpleKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEGaussian3x3Kernel";
-    }
-    /** Set the source, destination and border mode of the kernel
-     *
-     * @param[in]  input            Source tensor. Data type supported: U8
-     * @param[out] output           Destination tensor. Data type supported: S16
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const ITensor *input, ITensor *output, bool border_undefined);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-    BorderSize border_size() const override;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEGAUSSIAN3x3KERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NEGaussian5x5Kernel.h b/arm_compute/core/NEON/kernels/NEGaussian5x5Kernel.h
deleted file mode 100644
index 5e63e5136f..0000000000
--- a/arm_compute/core/NEON/kernels/NEGaussian5x5Kernel.h
+++ /dev/null
@@ -1,81 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEGAUSSIAN5x5KERNEL_H
-#define ARM_COMPUTE_NEGAUSSIAN5x5KERNEL_H
-
-#include "arm_compute/core/NEON/INESimpleKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** NEON kernel to perform a Gaussian 5x5 filter (horizontal pass) */
-class NEGaussian5x5HorKernel : public INESimpleKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEGaussian5x5HorKernel";
-    }
-    /** Default constructor */
-    NEGaussian5x5HorKernel();
-
-    /** Initialise the kernel's source, destination and border mode.
-     *
-     * @param[in]  input            Source tensor. Data type supported: U8.
-     * @param[out] output           Destination tensor. Data type supported: S16.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const ITensor *input, ITensor *output, bool border_undefined);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-    BorderSize border_size() const override;
-
-private:
-    BorderSize _border_size;
-};
-
-/** NEON kernel to perform a Gaussian 5x5 filter (vertical pass) */
-class NEGaussian5x5VertKernel : public INESimpleKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEGaussian5x5VertKernel";
-    }
-    /** Initialise the kernel's source, destination and border mode.
-     *
-     * @param[in]  input            Source tensor. Data type supported: S16.
-     * @param[out] output           Destination tensor, Data type supported: U8.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const ITensor *input, ITensor *output, bool border_undefined);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-    BorderSize border_size() const override;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEGAUSSIAN5x5KERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NEGaussianPyramidKernel.h b/arm_compute/core/NEON/kernels/NEGaussianPyramidKernel.h
deleted file mode 100644
index 4700325b5f..0000000000
--- a/arm_compute/core/NEON/kernels/NEGaussianPyramidKernel.h
+++ /dev/null
@@ -1,105 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEGAUSSIANPYRAMIDKERNEL_H
-#define ARM_COMPUTE_NEGAUSSIANPYRAMIDKERNEL_H
-
-#include "arm_compute/core/NEON/INESimpleKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** NEON kernel to perform a GaussianPyramid (horizontal pass) */
-class NEGaussianPyramidHorKernel : public INESimpleKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEGaussianPyramidHorKernel";
-    }
-    /** Default constructor */
-    NEGaussianPyramidHorKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEGaussianPyramidHorKernel(NEGaussianPyramidHorKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEGaussianPyramidHorKernel &operator=(NEGaussianPyramidHorKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NEGaussianPyramidHorKernel(NEGaussianPyramidHorKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NEGaussianPyramidHorKernel &operator=(NEGaussianPyramidHorKernel &&) = default;
-    /** Default destructor */
-    ~NEGaussianPyramidHorKernel() = default;
-
-    /** Initialise the kernel's source, destination and border mode.
-     *
-     * @param[in]  input  Source tensor. Data type supported: U8.
-     * @param[out] output Destination tensor. Output should have half the input width. Data type supported: S16.
-     */
-    void configure(const ITensor *input, ITensor *output);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-    BorderSize border_size() const override;
-
-private:
-    int _l2_load_offset;
-};
-
-/** NEON kernel to perform a GaussianPyramid (vertical pass) */
-class NEGaussianPyramidVertKernel : public INESimpleKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEGaussianPyramidVertKernel";
-    }
-    /** Default constructor */
-    NEGaussianPyramidVertKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEGaussianPyramidVertKernel(NEGaussianPyramidVertKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEGaussianPyramidVertKernel &operator=(NEGaussianPyramidVertKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NEGaussianPyramidVertKernel(NEGaussianPyramidVertKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NEGaussianPyramidVertKernel &operator=(NEGaussianPyramidVertKernel &&) = default;
-    /** Default destructor */
-    ~NEGaussianPyramidVertKernel() = default;
-
-    /** Initialise the kernel's source, destination and border mode.
-     *
-     * @param[in]  input  Source tensor. Data type supported: S16.
-     * @param[out] output Destination tensor. Output should have half the input height. Data type supported: U8.
-     */
-    void configure(const ITensor *input, ITensor *output);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-    BorderSize border_size() const override;
-
-private:
-    int _t2_load_offset;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEGAUSSIANPYRAMIDKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NEGenerateProposalsLayerKernel.h b/arm_compute/core/NEON/kernels/NEGenerateProposalsLayerKernel.h
deleted file mode 100644
index 382ce54518..0000000000
--- a/arm_compute/core/NEON/kernels/NEGenerateProposalsLayerKernel.h
+++ /dev/null
@@ -1,85 +0,0 @@
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEGENERATEPROPOSALSLAYERKERNEL_H
-#define ARM_COMPUTE_NEGENERATEPROPOSALSLAYERKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for Compute All Anchors kernel */
-class NEComputeAllAnchorsKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEComputeAllAnchorsKernel";
-    }
-
-    /** Default constructor */
-    NEComputeAllAnchorsKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEComputeAllAnchorsKernel(const NEComputeAllAnchorsKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEComputeAllAnchorsKernel &operator=(const NEComputeAllAnchorsKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NEComputeAllAnchorsKernel(NEComputeAllAnchorsKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NEComputeAllAnchorsKernel &operator=(NEComputeAllAnchorsKernel &&) = default;
-    /** Default destructor */
-    ~NEComputeAllAnchorsKernel() = default;
-
-    /** Set the input and output tensors.
-     *
-     * @param[in]  anchors     Source tensor. Original set of anchors of size (4, A), where A is the number of anchors. Data types supported: QSYMM16/F16/F32
-     * @param[out] all_anchors Destination tensor. Destination anchors of size (4, H*W*A) where H and W are the height and width of the feature map and A is the number of anchors. Data types supported: Same as @p input
-     * @param[in]  info        Contains Compute Anchors operation information described in @ref ComputeAnchorsInfo
-     *
-     */
-    void configure(const ITensor *anchors, ITensor *all_anchors, const ComputeAnchorsInfo &info);
-
-    /** Static function to check if given info will lead to a valid configuration of @ref NEComputeAllAnchorsKernel
-     *
-     * @param[in] anchors     Source tensor info. Original set of anchors of size (4, A), where A is the number of anchors. Data types supported: QSYMM16/F16/F32
-     * @param[in] all_anchors Destination tensor info. Destination anchors of size (4, H*W*A) where H and W are the height and width of the feature map and A is the number of anchors. Data types supported: Same as @p input
-     * @param[in] info        Contains Compute Anchors operation information described in @ref ComputeAnchorsInfo
-     *
-     * @return a Status
-     */
-    static Status validate(const ITensorInfo *anchors, const ITensorInfo *all_anchors, const ComputeAnchorsInfo &info);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-
-private:
-    template <typename T>
-    void internal_run(const Window &window);
-
-    const ITensor     *_anchors;
-    ITensor           *_all_anchors;
-    ComputeAnchorsInfo _anchors_info;
-};
-} // arm_compute
-#endif // ARM_COMPUTE_NEGENERATEPROPOSALSLAYERKERNEL_H
diff --git a/arm_compute/core/NEON/kernels/NEHOGDescriptorKernel.h b/arm_compute/core/NEON/kernels/NEHOGDescriptorKernel.h
deleted file mode 100644
index edb2da58e2..0000000000
--- a/arm_compute/core/NEON/kernels/NEHOGDescriptorKernel.h
+++ /dev/null
@@ -1,149 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEHOGDESCRIPTORKERNEL_H
-#define ARM_COMPUTE_NEHOGDESCRIPTORKERNEL_H
-
-#include "arm_compute/core/IHOG.h"
-#include "arm_compute/core/NEON/INEKernel.h"
-#include "arm_compute/core/Size2D.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** NEON kernel to perform HOG Orientation Binning */
-class NEHOGOrientationBinningKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEHOGOrientationBinningKernel";
-    }
-    /** Default constructor */
-    NEHOGOrientationBinningKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEHOGOrientationBinningKernel(const NEHOGOrientationBinningKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEHOGOrientationBinningKernel &operator=(const NEHOGOrientationBinningKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NEHOGOrientationBinningKernel(NEHOGOrientationBinningKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NEHOGOrientationBinningKernel &operator=(NEHOGOrientationBinningKernel &&) = default;
-    /** Default destructor */
-    ~NEHOGOrientationBinningKernel() = default;
-
-    /**  Initialise the kernel's inputs, output and HOG's metadata
-     *
-     * @param[in]  input_magnitude Input tensor which stores the magnitude of the gradient for each pixel. Data type supported: S16.
-     * @param[in]  input_phase     Input tensor which stores the phase of the gradient for each pixel. Data type supported: U8
-     * @param[out] output          Output tensor which stores the local HOG for each cell. Data type supported: F32. Number of channels supported: equal to the number of histogram bins per cell
-     * @param[in]  hog_info        HOG's metadata
-     */
-    void configure(const ITensor *input_magnitude, const ITensor *input_phase, ITensor *output, const HOGInfo *hog_info);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-
-private:
-    /** Common signature for all the specialised block normalization functions
-     *
-     * @param[in]  mag_row_ptr   Pointer to the first row of the cell in the magnitude tensor
-     * @param[in]  phase_row_ptr Pointer to the first row of the cell in the phase tensor
-     * @param[out] output_ptr    Pointer to the output cell of hog space tensor
-     * @param[in]  mag_stride    Stride of the magnitude tensor
-     * @param[in]  phase_stride  Stride of the phase tensor
-     * @param[in]  cell_width    Width of the cell
-     * @param[in]  cell_height   Height of the cell
-     * @param[in]  num_bins      Number of bins for each cell
-     * @param[in]  phase_scale   Scale factor to apply to the phase in order to calculate the histogram index
-     */
-    using OrientBinFunc = void(const int16_t *__restrict mag_row_ptr, const uint8_t *__restrict phase_row_ptr, float *__restrict output_ptr, size_t mag_stride, size_t phase_stride, size_t cell_width,
-                               size_t cell_height, size_t num_bins, float phase_scale);
-    /** Orientation binning function to use for the particular cell width passed to configure() */
-    OrientBinFunc *_func;
-    const ITensor *_input_magnitude;
-    const ITensor *_input_phase;
-    ITensor       *_output;
-    size_t         _cell_width;
-    size_t         _cell_height;
-    size_t         _num_bins;
-    float          _phase_scale;
-};
-
-/** NEON kernel to perform HOG block normalization */
-class NEHOGBlockNormalizationKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEHOGBlockNormalizationKernel";
-    }
-    /** Default constructor */
-    NEHOGBlockNormalizationKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEHOGBlockNormalizationKernel(const NEHOGBlockNormalizationKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEHOGBlockNormalizationKernel &operator=(const NEHOGBlockNormalizationKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NEHOGBlockNormalizationKernel(NEHOGBlockNormalizationKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NEHOGBlockNormalizationKernel &operator=(NEHOGBlockNormalizationKernel &&) = default;
-    /** Default destructor */
-    ~NEHOGBlockNormalizationKernel() = default;
-
-    /** Initialise the kernel's input, output and HOG's metadata
-     *
-     * @param[in]  input    Input tensor which stores the local HOG for each cell. Data type supported: F32. Number of channels supported: equal to the number of histogram bins per cell
-     * @param[out] output   Output tensor which stores the normalised blocks. Data type supported: F32. Number of channels supported: equal to the number of histogram bins per block
-     * @param[in]  hog_info HOG's metadata
-     */
-    void configure(const ITensor *input, ITensor *output, const HOGInfo *hog_info);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-
-private:
-    /** Common signature for all the specialised block normalization functions
-     *
-     * @param[in]  input_row_ptr              Pointer to the first row of the block in the input hog space tensor
-     * @param[out] output_ptr                 Pointer to the output block of the hog normalized space
-     * @param[in]  input_stride               Stride of the input hog space tensor
-     * @param[in]  num_cells_per_block_height Number of cells per block along the Y direction
-     * @param[in]  num_bins_block_x           Number of bins per block along the X direction
-     * @param[in]  num_bins_block             Number of total bins per block
-     * @param[in]  l2_hyst_threshold          Threshold to use for l2 hysteresis normalization
-     */
-    using BlockNormFunc = void(const float *input_row_ptr, float *output_ptr, size_t input_stride, size_t num_cells_per_block_height, size_t num_bins_block_x, size_t num_bins_block,
-                               float l2_hyst_threshold);
-    /** Block normalization function to use for the particular normalization type passed to configure() */
-    BlockNormFunc *_func;
-    const ITensor *_input;
-    ITensor       *_output;
-    Size2D         _num_cells_per_block;
-    Size2D         _num_cells_per_block_stride;
-    size_t         _num_bins;
-    float          _l2_hyst_threshold;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NEHOGDESCRIPTORKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NEHOGDetectorKernel.h b/arm_compute/core/NEON/kernels/NEHOGDetectorKernel.h
deleted file mode 100644
index acb35923d4..0000000000
--- a/arm_compute/core/NEON/kernels/NEHOGDetectorKernel.h
+++ /dev/null
@@ -1,89 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEHOGDETECTORKERNEL_H
-#define ARM_COMPUTE_NEHOGDETECTORKERNEL_H
-
-#include "arm_compute/core/IArray.h"
-#include "arm_compute/core/IHOG.h"
-#include "arm_compute/core/NEON/INEKernel.h"
-#include "support/Mutex.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** NEON kernel to perform HOG detector kernel using linear SVM */
-class NEHOGDetectorKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEHOGDetectorKernel";
-    }
-    /** Default constructor */
-    NEHOGDetectorKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEHOGDetectorKernel(const NEHOGDetectorKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEHOGDetectorKernel &operator=(const NEHOGDetectorKernel &) = delete;
-    /** Prevent instances of this class from being moved (As this class contains non movable objects) */
-    NEHOGDetectorKernel(NEHOGDetectorKernel &&) = delete;
-    /** Prevent instances of this class from being moved (As this class contains non movable objects) */
-    NEHOGDetectorKernel &operator=(NEHOGDetectorKernel &&) = delete;
-    /** Default destructor */
-    ~NEHOGDetectorKernel() = default;
-
-    /** Initialise the kernel's input, HOG data-object, detection window, the stride of the detection window, the threshold and index of the object to detect
-     *
-     * @param[in]  input                   Input tensor which stores the HOG descriptor obtained with @ref NEHOGOrientationBinningKernel. Data type supported: F32. Number of channels supported: equal to the number of histogram bins per block
-     * @param[in]  hog                     HOG data object used by @ref NEHOGOrientationBinningKernel and  @ref NEHOGBlockNormalizationKernel
-     * @param[out] detection_windows       Array of @ref DetectionWindow. This array stores all the detected objects
-     * @param[in]  detection_window_stride Distance in pixels between 2 consecutive detection windows in x and y directions.
-     *                                     It must be multiple of the hog->info()->block_stride()
-     * @param[in]  threshold               (Optional) Threshold for the distance between features and SVM classifying plane
-     * @param[in]  idx_class               (Optional) Index of the class used for evaluating which class the detection window belongs to
-     */
-    void configure(const ITensor *input, const IHOG *hog, IDetectionWindowArray *detection_windows, const Size2D &detection_window_stride, float threshold = 0.0f, uint16_t idx_class = 0);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-
-private:
-    const ITensor         *_input;
-    IDetectionWindowArray *_detection_windows;
-    const float           *_hog_descriptor;
-    float                  _bias;
-    float                  _threshold;
-    uint16_t               _idx_class;
-    size_t                 _num_bins_per_descriptor_x;
-    size_t                 _num_blocks_per_descriptor_y;
-    size_t                 _block_stride_width;
-    size_t                 _block_stride_height;
-    size_t                 _detection_window_width;
-    size_t                 _detection_window_height;
-    size_t                 _max_num_detection_windows;
-    arm_compute::Mutex     _mutex;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NEHOGDETECTORKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NEHarrisCornersKernel.h b/arm_compute/core/NEON/kernels/NEHarrisCornersKernel.h
deleted file mode 100644
index a77fe16ac2..0000000000
--- a/arm_compute/core/NEON/kernels/NEHarrisCornersKernel.h
+++ /dev/null
@@ -1,105 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEHARRISCORNERSKERNEL_H
-#define ARM_COMPUTE_NEHARRISCORNERSKERNEL_H
-
-#include "arm_compute/core/CPP/kernels/CPPCornerCandidatesKernel.h"
-#include "arm_compute/core/CPP/kernels/CPPSortEuclideanDistanceKernel.h"
-#include "arm_compute/core/IArray.h"
-#include "arm_compute/core/NEON/INEKernel.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-class ITensor;
-using IImage = ITensor;
-
-/** Common interface for all Harris Score kernels */
-class INEHarrisScoreKernel : public INEKernel
-{
-public:
-    /** Default constructor */
-    INEHarrisScoreKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    INEHarrisScoreKernel(const INEHarrisScoreKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    INEHarrisScoreKernel &operator=(const INEHarrisScoreKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    INEHarrisScoreKernel(INEHarrisScoreKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    INEHarrisScoreKernel &operator=(INEHarrisScoreKernel &&) = default;
-    /** Default destructor */
-    ~INEHarrisScoreKernel() = default;
-
-public:
-    /** Setup the kernel parameters
-     *
-     * @param[in]  input1           Source image (gradient X). Data types supported: S16/S32
-     * @param[in]  input2           Source image (gradient Y). Data types supported: same as @ input1
-     * @param[out] output           Destination image (harris score). Data types supported: F32
-     * @param[in]  norm_factor      Normalization factor to use accordingly with the gradient size (Must be different from 0)
-     * @param[in]  strength_thresh  Minimum threshold with which to eliminate Harris Corner scores (computed using the normalized Sobel kernel).
-     * @param[in]  sensitivity      Sensitivity threshold k from the Harris-Stephens equation
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    virtual void configure(const IImage *input1, const IImage *input2, IImage *output, float norm_factor, float strength_thresh, float sensitivity, bool border_undefined) = 0;
-
-protected:
-    const IImage *_input1;          /**< Source image - Gx component */
-    const IImage *_input2;          /**< Source image - Gy component */
-    IImage       *_output;          /**< Source image - Harris score */
-    float         _sensitivity;     /**< Sensitivity value */
-    float         _strength_thresh; /**< Threshold value */
-    float         _norm_factor;     /**< Normalization factor */
-    BorderSize    _border_size;     /**< Border size */
-};
-
-/** Template NEON kernel to perform Harris Score.
- *  The implementation supports 3, 5, and 7 for the block_size
- */
-template <int32_t block_size>
-class NEHarrisScoreKernel : public INEHarrisScoreKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEHarrisScoreKernel";
-    }
-    /** Default constructor */
-    NEHarrisScoreKernel();
-    // Inherited methods overridden:
-    void configure(const IImage *input1, const IImage *input2, IImage *output, float norm_factor, float strength_thresh, float sensitivity, bool border_undefined) override;
-    BorderSize border_size() const override;
-    void run(const Window &window, const ThreadInfo &info) override;
-
-private:
-    /** Common signature for all the specialised harris score functions */
-    using HarrisScoreFunction = void(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int32_t input_stride,
-                                     float norm_factor, float sensitivity, float strength_thresh);
-    /** Harris Score function to use for the particular image types passed to configure() */
-    HarrisScoreFunction *_func;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NEHARRISCORNERSKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NEHeightConcatenateLayerKernel.h b/arm_compute/core/NEON/kernels/NEHeightConcatenateLayerKernel.h
deleted file mode 100644
index be81f2e963..0000000000
--- a/arm_compute/core/NEON/kernels/NEHeightConcatenateLayerKernel.h
+++ /dev/null
@@ -1,85 +0,0 @@
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef ARM_COMPUTE_NEHEIGHTCONCATENATELAYERKERNEL_H
-#define ARM_COMPUTE_NEHEIGHTCONCATENATELAYERKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-// Forward declarations
-class ITensor;
-
-/** Interface for the height concatenate kernel.
- *  The input tensor will be concatenated into the output tensor.
- */
-class NEHeightConcatenateLayerKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEHeightConcatenateLayerKernel";
-    }
-    /** Default constructor */
-    NEHeightConcatenateLayerKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEHeightConcatenateLayerKernel(const NEHeightConcatenateLayerKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEHeightConcatenateLayerKernel &operator=(const NEHeightConcatenateLayerKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NEHeightConcatenateLayerKernel(NEHeightConcatenateLayerKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NEHeightConcatenateLayerKernel &operator=(NEHeightConcatenateLayerKernel &&) = default;
-    /** Default destructor */
-    ~NEHeightConcatenateLayerKernel() = default;
-    /** Initialise the kernel's inputs and output
-     *
-     * @param[in]     input         Input tensor. Data types supported: All
-     * @param[in]     height_offset The starting offset on the Y axis for the output tensor.
-     * @param[in,out] output        Output tensor. Data types supported: Same as @p input.
-     *
-     */
-    void configure(const ITensor *input, unsigned int height_offset, ITensor *output);
-    /**  Static function to check if given info will lead to a valid configuration of @ref NEHeightConcatenateLayerKernel
-     *
-     * @param[in] input         Input tensor info. Data types supported: All
-     * @param[in] height_offset The starting offset on the Y axis for the output tensor.
-     * @param[in] output        Output tensor info. Data types supported: Same as @p input.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, unsigned int height_offset, const ITensorInfo *output);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-
-private:
-    const ITensor *_input;
-    ITensor       *_output;
-    unsigned int   _height_offset;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NEHEIGHTCONCATENATELAYERKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NEHistogramKernel.h b/arm_compute/core/NEON/kernels/NEHistogramKernel.h
deleted file mode 100644
index b1dd105676..0000000000
--- a/arm_compute/core/NEON/kernels/NEHistogramKernel.h
+++ /dev/null
@@ -1,135 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEHISTOGRAMKERNEL_H
-#define ARM_COMPUTE_NEHISTOGRAMKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-#include "support/Mutex.h"
-
-#include <cstddef>
-#include <cstdint>
-
-namespace arm_compute
-{
-class IDistribution1D;
-class ITensor;
-using IImage = ITensor;
-
-/** Interface for the histogram kernel */
-class NEHistogramKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEHistogramKernel";
-    }
-    /** Default constructor */
-    NEHistogramKernel();
-    /** Default destructor */
-    ~NEHistogramKernel() = default;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEHistogramKernel(const NEHistogramKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEHistogramKernel &operator=(const NEHistogramKernel &) = delete;
-    /** Prevent instances of this class from being moved (As this class contains non movable objects) */
-    NEHistogramKernel(NEHistogramKernel &&) = delete;
-    /** Prevent instances of this class from being moved (As this class contains non movable objects) */
-    NEHistogramKernel &operator=(NEHistogramKernel &&) = delete;
-
-    /** Set the input image and the distribution output.
-     *
-     * @param[in]     input      Source image. Data type supported: U8.
-     * @param[out]    output     Destination distribution.
-     * @param[in,out] local_hist Array that the threads use to save their local histograms.
-     *                           It's size should be equal to (number_of_threads * num_bins),
-     *                           and the Window::thread_id() is used to determine the part of the array
-     *                           used by each thread.
-     * @param[out]    window_lut LUT with pre-calculated possible window values.
-     *                           The size of the LUT should be equal to max_range_size and it will be filled
-     *                           during the configure stage, while it re-used in every run, therefore can be
-     *                           safely shared among threads.
-     */
-    void configure(const IImage *input, IDistribution1D *output, uint32_t *local_hist, uint32_t *window_lut);
-    /** Set the input image and the distribution output.
-     *
-     * @note Used for histogram of fixed size equal to 256
-     *
-     * @param[in]  input  Source image. Data type supported: U8.
-     * @param[out] output Destination distribution which must be of 256 bins..
-     */
-    void configure(const IImage *input, IDistribution1D *output);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-
-private:
-    /** Function to merge multiple partial histograms.
-     *
-     * @param[out] global_hist Pointer to the final histogram.
-     * @param[in]  local_hist  Pointer to the partial histograms.
-     * @param[in]  bins        Number of bins.
-     */
-    void merge_histogram(uint32_t *global_hist, const uint32_t *local_hist, size_t bins);
-    /** Function to merge multiple minimum values of partial histograms.
-     *
-     * @param[out] global_min Pointer to the global min value.
-     * @param[in]  local_min  Local min value.
-     */
-    void merge_min(uint8_t *global_min, const uint8_t &local_min);
-    /** Function to perform histogram on the given window
-     *
-     * @param[in] win  Region on which to execute the kernel
-     * @param[in] info Info about the executing thread
-     */
-    void histogram_U8(Window win, const ThreadInfo &info);
-    /** Function to perform histogram on the given window where histogram is
-     *         of fixed size 256 without ranges and offsets.
-     *
-     * @param[in] win  Region on which to execute the kernel
-     * @param[in] info Info about the executing thread
-     */
-    void histogram_fixed_U8(Window win, const ThreadInfo &info);
-    /** Pre-calculate the pixel windowing for every possible pixel
-     *
-     * Calculate (V - offset) * numBins / range where V is every possible pixel value.
-     *
-     * @note We currently support U8 image thus possible pixel values are between 0 and 255
-     */
-    void calculate_window_lut() const;
-    /** Common signature for all the specialised Histogram functions
-     *
-     * @param[in] window Region on which to execute the kernel.
-     */
-    using HistogramFunctionPtr = void (NEHistogramKernel::*)(Window window, const ThreadInfo &info);
-
-    HistogramFunctionPtr          _func; ///< Histogram function to use for the particular image types passed to configure()
-    const IImage                 *_input;
-    IDistribution1D              *_output;
-    uint32_t                     *_local_hist;
-    uint32_t                     *_window_lut;
-    arm_compute::Mutex            _hist_mtx;
-    static constexpr unsigned int _max_range_size{ 256 }; ///< 256 possible pixel values as we handle only U8 images
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEHISTOGRAMKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NEIm2ColKernel.h b/arm_compute/core/NEON/kernels/NEIm2ColKernel.h
deleted file mode 100644
index 1c358b379d..0000000000
--- a/arm_compute/core/NEON/kernels/NEIm2ColKernel.h
+++ /dev/null
@@ -1,139 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEIM2COLKERNEL_H
-#define ARM_COMPUTE_NEIM2COLKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-class Size2D;
-
-/** Interface for the im2col reshape kernel.
- *
- * Rearranges image blocks into columns. It is used to strip out each convolution block to a single column.
- * It is used to transform a convolution to a plain matrix multiplication.
- *
- * For example taking into account the image below and assuming 3x3 image blocks with stride of 1 we have:
- *
- * @f[
- * \left( \begin{array}{cccc}
- * a00 & a01 & a02 & a03 \\
- * a10 & a11 & a12 & a13 \\
- * a20 & a21 & a22 & a23 \\
- * a30 & a31 & a32 & a33 \\
- * \end{array} \right)
- * \rightarrow
- * \left( \begin{array}{ccccccccc}
- * a00 & a01 & a02 & a10 & a11 & a12 & a20 & a21 & a22 \\
- * a01 & a02 & a03 & a11 & a12 & a13 & a21 & a22 & a23 \\
- * a10 & a11 & a12 & a20 & a21 & a22 & a30 & a31 & a32 \\
- * a11 & a12 & a13 & a21 & a22 & a23 & a31 & a32 & a33 \\
- * \end{array} \right)
- * @f]
- */
-class NEIm2ColKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEIm2ColKernel";
-    }
-    /** Default constructor */
-    NEIm2ColKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEIm2ColKernel(const NEIm2ColKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEIm2ColKernel &operator=(const NEIm2ColKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NEIm2ColKernel(NEIm2ColKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NEIm2ColKernel &operator=(NEIm2ColKernel &&) = default;
-    /** Default destructor */
-    ~NEIm2ColKernel() = default;
-
-    /** Set the input and output of the kernel.
-     *
-     * @param[in]  input       The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM],
-     *                         while every optional dimension from 4 and above represent a batch of inputs.
-     *                         Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32
-     *                         Note: QASYMM8 works only for has_bias = false
-     * @param[out] output      The output tensor. Data types supported: Same as @p input
-     * @param[in]  kernel_dims The kernel dimensions (width and height).
-     * @param[in]  conv_info   Contains padding and stride information described in @ref PadStrideInfo.
-     * @param[in]  has_bias    In case biases are provided expands the matrix with 1.
-     * @param[in]  dilation    (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
-     * @param[in]  num_groups  (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is not supported
-     */
-    void configure(const ITensor *input, ITensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info,
-                   bool has_bias, const Size2D &dilation = Size2D(1U, 1U), unsigned int num_groups = 1);
-    /** Static function to check if given info will lead to a valid configuration of @ref NEIm2ColKernel
-     *
-     * @param[in] input       The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM],
-     *                        while every optional dimension from 4 and above represent a batch of inputs.
-     *                        Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32
-     *                        Note: QASYMM8 works only for has_bias = false
-     * @param[in] output      The output tensor. Data types supported: Same as @p input
-     * @param[in] kernel_dims The kernel dimensions (width and height).
-     * @param[in] conv_info   Contains padding and stride information described in @ref PadStrideInfo.
-     * @param[in] has_bias    In case biases are provided expands the matrix with 1.
-     * @param[in] dilation    (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
-     * @param[in] num_groups  (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is not supported
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info,
-                           bool has_bias, const Size2D &dilation = Size2D(1U, 1U), unsigned int num_groups = 1);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-
-private:
-    /** Template function to run im2col
-     *
-     * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
-     */
-    template <typename T, bool has_pads, bool is_nchw>
-    void run_im2col(const Window &window);
-
-    /** Common signature for all the specialised im2col functions
-     *
-     * @param[in] window Region on which to execute the kernel.
-     */
-    using Im2ColFunctionPtr = void (NEIm2ColKernel::*)(const Window &window);
-
-    Im2ColFunctionPtr _func;
-    const ITensor    *_input;
-    ITensor          *_output;
-    std::pair<unsigned int, unsigned int> _convolved_dims;
-    PadStrideInfo _conv_info;
-    unsigned int  _kernel_width;
-    unsigned int  _kernel_height;
-    bool          _has_bias;
-    Size2D        _dilation;
-    DataLayout    _data_layout;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEIM2COLKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NEInstanceNormalizationLayerKernel.h b/arm_compute/core/NEON/kernels/NEInstanceNormalizationLayerKernel.h
deleted file mode 100644
index 7c14e409c6..0000000000
--- a/arm_compute/core/NEON/kernels/NEInstanceNormalizationLayerKernel.h
+++ /dev/null
@@ -1,98 +0,0 @@
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEINSTANCENORMALIZATIONLAYERKERNEL_H
-#define ARM_COMPUTE_NEINSTANCENORMALIZATIONLAYERKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for performing an instance normalization */
-class NEInstanceNormalizationLayerKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEInstanceNormalizationLayerKernel";
-    }
-    /** Default constructor */
-    NEInstanceNormalizationLayerKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEInstanceNormalizationLayerKernel(const NEInstanceNormalizationLayerKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEInstanceNormalizationLayerKernel &operator=(const NEInstanceNormalizationLayerKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NEInstanceNormalizationLayerKernel(NEInstanceNormalizationLayerKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NEInstanceNormalizationLayerKernel &operator=(NEInstanceNormalizationLayerKernel &&) = default;
-    /** Default destructor */
-    ~NEInstanceNormalizationLayerKernel() = default;
-    /** Set the input and output tensors.
-     *
-     * @param[in, out] input   Source tensor. Data types supported: F16/F32. Data layout supported: NCHW
-     *                         In case of @p output tensor = nullptr this tensor will store the result of the normalization.
-     * @param[out]     output  Destination tensor. Data types and data layouts supported: same as @p input.
-     * @param[in]      gamma   (Optional) The scale scalar value applied to the normalized tensor. Defaults to 1.0
-     * @param[in]      beta    (Optional) The offset scalar value applied to the normalized tensor. Defaults to 0.0
-     * @param[in]      epsilon (Optional) Lower bound value for the normalization. Defaults to 1e-12
-     */
-    void configure(ITensor *input, ITensor *output, float gamma = 1.0f, float beta = 0.0f, float epsilon = 1e-12f);
-
-    /** Static function to check if given info will lead to a valid configuration of @ref NEInstanceNormalizationLayer.
-     *
-     * @param[in] input   Source tensor info. Data types supported: F16/F32. Data layout supported: NCHW
-     * @param[in] output  Destination tensor info. Data types and data layouts supported: same as @p input.
-     * @param[in] gamma   (Optional) The scale scalar value applied to the normalized tensor. Defaults to 1.0
-     * @param[in] beta    (Optional) The offset scalar value applied to the normalized tensor. Defaults to 0.0
-     * @param[in] epsilon (Optional) Lower bound value for the normalization. Defaults to 1e-12
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, float gamma = 1.0f, float beta = 0.0f, float epsilon = 1e-12f);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-
-private:
-    /** Common signature for all the specialized instance normalization functions
-     *
-     * @param[in, out] input   An input tensor. In case of @p output tensor = nullptr this tensor will store the result of the normalization.
-     * @param[out]     output  The output tensor.
-     * @param[in]      gamma   The scale scalar value applied to the normalized tensor. Defaults to 1.0
-     * @param[in]      beta    The offset scalar value applied to the normalized tensor. Defaults to 0.0
-     * @param[in]      epsilon Lower bound value for the normalization. Defaults to 1e-12
-     */
-    using NormalizationFunction = void(ITensor *input, ITensor *output, float gamma, float beta, float epsilon, const Window &window);
-
-    NormalizationFunction *_func;
-    ITensor               *_input;
-    ITensor               *_output;
-    float                  _gamma;
-    float                  _beta;
-    float                  _epsilon;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEINSTANCENORMALIZATIONLAYERKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NEIntegralImageKernel.h b/arm_compute/core/NEON/kernels/NEIntegralImageKernel.h
deleted file mode 100644
index 77ae7b9efa..0000000000
--- a/arm_compute/core/NEON/kernels/NEIntegralImageKernel.h
+++ /dev/null
@@ -1,54 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEINTEGRALIMAGEKERNEL_H
-#define ARM_COMPUTE_NEINTEGRALIMAGEKERNEL_H
-
-#include "arm_compute/core/NEON/INESimpleKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Kernel to perform an image integral on an image */
-class NEIntegralImageKernel : public INESimpleKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEIntegralImageKernel";
-    }
-    /** Set the source, destination and border mode of the kernel
-     *
-     * @param[in]  input  Source tensor. Data type supported: U8
-     * @param[out] output Destination tensor. Data type supported: U32
-     */
-    void configure(const ITensor *input, ITensor *output);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-    BorderSize border_size() const override;
-    bool       is_parallelisable() const override;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEINTEGRALIMAGEKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NEL2NormalizeLayerKernel.h b/arm_compute/core/NEON/kernels/NEL2NormalizeLayerKernel.h
deleted file mode 100644
index 3937bf0163..0000000000
--- a/arm_compute/core/NEON/kernels/NEL2NormalizeLayerKernel.h
+++ /dev/null
@@ -1,90 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEL2NORMALIZELAYERKERNEL_H
-#define ARM_COMPUTE_NEL2NORMALIZELAYERKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for performing a L2 normalize on a given axis given the square sum of it in this axis */
-class NEL2NormalizeLayerKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEL2NormalizeLayerKernel";
-    }
-    /** Default constructor */
-    NEL2NormalizeLayerKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEL2NormalizeLayerKernel(const NEL2NormalizeLayerKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEL2NormalizeLayerKernel &operator=(const NEL2NormalizeLayerKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NEL2NormalizeLayerKernel(NEL2NormalizeLayerKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NEL2NormalizeLayerKernel &operator=(NEL2NormalizeLayerKernel &&) = default;
-    /** Default destructor */
-    ~NEL2NormalizeLayerKernel() = default;
-    /** Set the input and output tensors.
-     *
-     * @param[in]  input   Source tensor. Data types supported: F16/F32.
-     * @param[in]  sum     Sum values tensor. Data types supported: same as @p input.
-     *                     Sum will have the same number of dimensions as input.
-     * @param[out] output  Destination tensor. Data types and data layouts supported: same as @p input.
-     *                     Output will have the same number of dimensions as input.
-     * @param[in]  axis    Axis along which to reduce. Negative values wrap around. Maximum supported actual reduction axis : 2
-     * @param[in]  epsilon Lower bound value for the normalization.
-     */
-    void configure(const ITensor *input, const ITensor *sum, ITensor *output, int axis, float epsilon);
-
-    /** Static function to check if given info will lead to a valid configuration of @ref NEL2NormalizeLayerKernel.
-     *
-     * @param[in] input   Source tensor info. Data types supported: F16/F32.
-     * @param[in] sum     Sum values tensor info. Data types supported: same as @p input.
-     *                    Sum will have the same number of dimensions as input.
-     * @param[in] output  Destination tensor info. Data types and data layouts supported: same as @p input.
-     *                    Output will have the same number of dimensions as input.
-     * @param[in] axis    Axis along which to reduce. Negative values wrap around. Maximum supported actual reduction axis : 2
-     * @param[in] epsilon Lower bound value for the normalization.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-
-private:
-    const ITensor *_input;
-    const ITensor *_sum;
-    ITensor       *_output;
-    unsigned int   _actual_axis;
-    float          _epsilon;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEL2NORMALIZELAYERKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NELKTrackerKernel.h b/arm_compute/core/NEON/kernels/NELKTrackerKernel.h
deleted file mode 100644
index cf99bbe691..0000000000
--- a/arm_compute/core/NEON/kernels/NELKTrackerKernel.h
+++ /dev/null
@@ -1,149 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_LKTRACKERKERNEL_H
-#define ARM_COMPUTE_LKTRACKERKERNEL_H
-
-#include "arm_compute/core/IArray.h"
-#include "arm_compute/core/NEON/INEKernel.h"
-#include "arm_compute/core/Types.h"
-
-#include <cstddef>
-#include <cstdint>
-#include <tuple>
-#include <utility>
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Internal keypoint class for Lucas-Kanade Optical Flow */
-struct NELKInternalKeypoint
-{
-    float x{ 0.f };                 /**< x coordinate of the keypoint */
-    float y{ 0.f };                 /**< y coordinate of the keypoint */
-    bool  tracking_status{ false }; /**< the tracking status of the keypoint */
-};
-
-/** Interface for NEON Array of Internal Key Points. */
-using INELKInternalKeypointArray = IArray<NELKInternalKeypoint>;
-
-/** Interface for the Lucas-Kanade tracker kernel */
-class NELKTrackerKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NELKTrackerKernel";
-    }
-    /** Default constructor */
-    NELKTrackerKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NELKTrackerKernel(const NELKTrackerKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NELKTrackerKernel &operator=(const NELKTrackerKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NELKTrackerKernel(NELKTrackerKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NELKTrackerKernel &operator=(NELKTrackerKernel &&) = default;
-    /** Default destructor */
-    ~NELKTrackerKernel() = default;
-
-    /** Initialise the kernel input and output
-     *
-     * @param[in]      input_old            Pointer to the input old tensor. Data type supported: U8
-     * @param[in]      input_new            Pointer to the input new tensor. Data type supported. U8
-     * @param[in]      old_scharr_gx        Pointer to the input scharr X tensor. Data type supported: S16
-     * @param[in]      old_scharr_gy        Pointer to the input scharr Y tensor. Data type supported: S16
-     * @param[in]      old_points           Pointer to the IKeyPointArray storing old key points
-     * @param[in]      new_points_estimates Pointer to the IKeyPointArray storing new estimates key points
-     * @param[out]     new_points           Pointer to the IKeyPointArray storing new key points
-     * @param[in, out] old_points_internal  Pointer to the array of NELKInternalKeypoint for old points
-     * @param[out]     new_points_internal  Pointer to the array of NELKInternalKeypoint for new points
-     * @param[in]      termination          The criteria to terminate the search of each keypoint.
-     * @param[in]      use_initial_estimate The flag to indicate whether the initial estimated position should be used
-     * @param[in]      epsilon              The error for terminating the algorithm
-     * @param[in]      num_iterations       The maximum number of iterations before terminate the algorithm
-     * @param[in]      window_dimension     The size of the window on which to perform the algorithm
-     * @param[in]      level                The pyramid level
-     * @param[in]      num_levels           The number of pyramid levels
-     * @param[in]      pyramid_scale        Scale factor used for generating the pyramid
-     */
-    void configure(const ITensor *input_old, const ITensor *input_new, const ITensor *old_scharr_gx, const ITensor *old_scharr_gy,
-                   const IKeyPointArray *old_points, const IKeyPointArray *new_points_estimates, IKeyPointArray *new_points,
-                   INELKInternalKeypointArray *old_points_internal, INELKInternalKeypointArray *new_points_internal,
-                   Termination termination, bool use_initial_estimate, float epsilon, unsigned int num_iterations, size_t window_dimension,
-                   size_t level, size_t num_levels, float pyramid_scale);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-    BorderSize border_size() const override;
-
-private:
-    /** Initialise the array of keypoints in the provide range
-     *
-     * @param[in] start Index of first element in the keypoints array to be initialised
-     * @param[in] end   Index after last elelemnt in the keypoints array to be initialised
-     */
-    void init_keypoints(int start, int end);
-    /** Compute the structure tensor A^T * A based on the scharr gradients I_x and I_y
-     *
-     * @param[in]  keypoint    Keypoint for which gradients are computed
-     * @param[out] bilinear_ix Intermediate interpolated data for X gradient
-     * @param[out] bilinear_iy Intermediate interpolated data for Y gradient
-     *
-     * @return Values A11, A12, A22
-     */
-    std::tuple<int, int, int> compute_spatial_gradient_matrix(const NELKInternalKeypoint &keypoint, int32_t *bilinear_ix, int32_t *bilinear_iy);
-    /** Compute the vector A^T * b, i.e. -sum(I_d * I_t) for d in {x,y}
-     *
-     * @param[in] old_keypoint Old keypoint for which gradient is computed
-     * @param[in] new_keypoint New keypoint for which gradient is computed
-     * @param[in] bilinear_ix  Intermediate interpolated data for X gradient
-     * @param[in] bilinear_iy  Intermediate interpolated data for Y gradient
-     *
-     * @return Values b1, b2
-     */
-    std::pair<int, int> compute_image_mismatch_vector(const NELKInternalKeypoint &old_keypoint, const NELKInternalKeypoint &new_keypoint, const int32_t *bilinear_ix, const int32_t *bilinear_iy);
-
-    const ITensor              *_input_old;
-    const ITensor              *_input_new;
-    const ITensor              *_old_scharr_gx;
-    const ITensor              *_old_scharr_gy;
-    IKeyPointArray             *_new_points;
-    const IKeyPointArray       *_new_points_estimates;
-    const IKeyPointArray       *_old_points;
-    INELKInternalKeypointArray *_old_points_internal;
-    INELKInternalKeypointArray *_new_points_internal;
-    Termination                 _termination;
-    bool                        _use_initial_estimate;
-    float                       _pyramid_scale;
-    float                       _epsilon;
-    unsigned int                _num_iterations;
-    int                         _window_dimension;
-    unsigned int                _level;
-    unsigned int                _num_levels;
-    ValidRegion                 _valid_region;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NELKTRACKERKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.h b/arm_compute/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.h
deleted file mode 100644
index ad2a161296..0000000000
--- a/arm_compute/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.h
+++ /dev/null
@@ -1,77 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NELOCALLYCONNECTEDMATRIXMULTIPLYKERNEL_H
-#define ARM_COMPUTE_NELOCALLYCONNECTEDMATRIXMULTIPLYKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** NEON kernel to multiply each row of first tensor with low 2 dimensions of second tensor. */
-class NELocallyConnectedMatrixMultiplyKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NELocallyConnectedMatrixMultiplyKernel";
-    }
-    /** Default constructor */
-    NELocallyConnectedMatrixMultiplyKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NELocallyConnectedMatrixMultiplyKernel(const NELocallyConnectedMatrixMultiplyKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NELocallyConnectedMatrixMultiplyKernel &operator=(const NELocallyConnectedMatrixMultiplyKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NELocallyConnectedMatrixMultiplyKernel(NELocallyConnectedMatrixMultiplyKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NELocallyConnectedMatrixMultiplyKernel &operator=(NELocallyConnectedMatrixMultiplyKernel &&) = default;
-    /** Initialise the kernel's input and output
-     *
-     * @param[in]  input0 First input tensor. Data types supported: F16, F32
-     * @param[in]  input1 Second input tensor containing the Matrix B. Data type supported: same as @p input0
-     * @param[out] output Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0
-     */
-    void configure(const ITensor *input0, const ITensor *input1, ITensor *output);
-    /** Static function to check if given info will lead to a valid configuration of @ref NELocallyConnectedMatrixMultiplyKernel
-     *
-     * @param[in] input0 First input tensor info. Data types supported: F16, F32
-     * @param[in] input1 Second input tensor info. Data type supported: same as @p input0
-     * @param[in] output Output tensor info. Data type supported: same as @p input0
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-
-private:
-    const ITensor *_input0;
-    const ITensor *_input1;
-    ITensor       *_output;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NELOCALLYCONNECTEDMATRIXMULTIPLYKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NEMagnitudePhaseKernel.h b/arm_compute/core/NEON/kernels/NEMagnitudePhaseKernel.h
deleted file mode 100644
index 7ad5bf0d99..0000000000
--- a/arm_compute/core/NEON/kernels/NEMagnitudePhaseKernel.h
+++ /dev/null
@@ -1,101 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEMAGNITUDEPHASEKERNEL_H
-#define ARM_COMPUTE_NEMAGNITUDEPHASEKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Template interface for the kernel to compute magnitude and phase */
-template <MagnitudeType mag_type, PhaseType phase_type>
-class NEMagnitudePhaseKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEMagnitudePhaseKernel";
-    }
-    /** Default constructor */
-    NEMagnitudePhaseKernel();
-    /** Destructor */
-    ~NEMagnitudePhaseKernel() = default;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEMagnitudePhaseKernel(const NEMagnitudePhaseKernel &) = delete;
-    /** Default move constructor */
-    NEMagnitudePhaseKernel(NEMagnitudePhaseKernel &&) = default;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEMagnitudePhaseKernel &operator=(const NEMagnitudePhaseKernel &) = delete;
-    /** Default move assignment operator */
-    NEMagnitudePhaseKernel &operator=(NEMagnitudePhaseKernel &&) = default;
-
-    /** Initialise the kernel's input, output.
-     *
-     * @note At least one of out1 or out2 must be set
-     *
-     * @param[in]  gx        Gradient X tensor. Data type supported: S16.
-     * @param[in]  gy        Gradient Y tensor. Data type supported: S16.
-     * @param[out] magnitude (Optional) The output tensor - Magnitude. Data type supported: S16.
-     * @param[out] phase     (Optional) The output tensor - Phase. Data type supported: U8.
-     */
-    void configure(const ITensor *gx, const ITensor *gy, ITensor *magnitude, ITensor *phase);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-
-private:
-    /** Function to perform magnitude on the given window
-     *
-     * @param[in] window Region on which to execute the kernel
-     */
-    void magnitude(const Window &window);
-    /** Function to perform phase on the given window
-     *
-     * @param[in] window Region on which to execute the kernel
-     */
-    void phase(const Window &window);
-    /** Function to perform magnitude and phase on the given window
-     *
-     * @param[in] window Region on which to execute the kernel
-     */
-    void magnitude_phase(const Window &window);
-
-private:
-    /** Common signature for all the specialised MagnitudePhase functions
-     *
-     * @param[in] window Region on which to execute the kernel.
-     */
-    using MagnitudePhaseFunctionPtr = void (NEMagnitudePhaseKernel::*)(const Window &window);
-    /** MagnitudePhase function to use for the particular formats passed to configure() */
-    MagnitudePhaseFunctionPtr _func;
-    const ITensor            *_gx;        /**< Input gradient X */
-    const ITensor            *_gy;        /**< Input gradient Y */
-    ITensor                  *_magnitude; /**< Output - Magnitude */
-    ITensor                  *_phase;     /**< Output - Phase */
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NEMAGNITUDEPHASEKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NEMeanStdDevKernel.h b/arm_compute/core/NEON/kernels/NEMeanStdDevKernel.h
deleted file mode 100644
index 2197e3cfbe..0000000000
--- a/arm_compute/core/NEON/kernels/NEMeanStdDevKernel.h
+++ /dev/null
@@ -1,83 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEMEANSTDDEVKERNEL_H
-#define ARM_COMPUTE_NEMEANSTDDEVKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-#include "support/Mutex.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-class ITensor;
-using IImage = ITensor;
-
-/** Interface for the kernel to calculate mean and standard deviation of input image pixels. */
-class NEMeanStdDevKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEMeanStdDevKernel";
-    }
-    /** Default constructor */
-    NEMeanStdDevKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEMeanStdDevKernel(const NEMeanStdDevKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEMeanStdDevKernel &operator=(const NEMeanStdDevKernel &) = delete;
-    /** Prevent instances of this class from being moved (As this class contains non movable objects) */
-    NEMeanStdDevKernel(NEMeanStdDevKernel &&) = delete;
-    /** Prevent instances of this class from being moved (As this class contains non movable objects) */
-    NEMeanStdDevKernel &operator=(NEMeanStdDevKernel &&) = delete;
-    /** Default destructor */
-    ~NEMeanStdDevKernel() = default;
-
-    /** Initialise the kernel's input and outputs.
-     *
-     * @param[in]  input              Input image. Data type supported: U8.
-     * @param[out] mean               Input average pixel value.
-     * @param[out] global_sum         Keeps global sum of pixel values.
-     * @param[out] stddev             (Optional) Output standard deviation of pixel values.
-     * @param[out] global_sum_squared (Optional if stddev is not set, required if stddev is set) Keeps global sum of squared pixel values.
-     */
-    void configure(const IImage *input, float *mean, uint64_t *global_sum, float *stddev = nullptr, uint64_t *global_sum_squared = nullptr);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-
-    BorderSize border_size() const override;
-
-private:
-    const IImage      *_input;
-    float             *_mean;
-    float             *_stddev;
-    uint64_t          *_global_sum;
-    uint64_t          *_global_sum_squared;
-    arm_compute::Mutex _mtx;
-    BorderSize         _border_size;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NEMEANSTDDEVKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NEMeanStdDevNormalizationKernel.h b/arm_compute/core/NEON/kernels/NEMeanStdDevNormalizationKernel.h
deleted file mode 100644
index dc0455cc4c..0000000000
--- a/arm_compute/core/NEON/kernels/NEMeanStdDevNormalizationKernel.h
+++ /dev/null
@@ -1,98 +0,0 @@
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEMEANSTDDEVNORMALIZATIONKERNEL_H
-#define ARM_COMPUTE_NEMEANSTDDEVNORMALIZATIONKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-#include <arm_fp16.h>
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for the kernel to normalize the input 2D tensor across the first dimension with respect to mean and standard deviation of the same dimension. */
-class NEMeanStdDevNormalizationKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEMeanStdDevNormalizationKernel";
-    }
-    /** Default constructor */
-    NEMeanStdDevNormalizationKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEMeanStdDevNormalizationKernel(const NEMeanStdDevNormalizationKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEMeanStdDevNormalizationKernel &operator=(const NEMeanStdDevNormalizationKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NEMeanStdDevNormalizationKernel(NEMeanStdDevNormalizationKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NEMeanStdDevNormalizationKernel &operator=(NEMeanStdDevNormalizationKernel &&) = default;
-    /** Default destructor */
-    ~NEMeanStdDevNormalizationKernel() = default;
-    /** Initialise the kernel's input and outputs.
-     *
-     * @note If the output tensor is a nullptr, the normalization will be performed in-place.
-     *
-     * @param[in, out] input   Source tensor with 2 dimensions. In case of @p output tensor = nullptr,
-     *                         this tensor will store the result of the normalization. Data types supported: F16/F32.
-     * @param[out]     output  (Optional) Destination tensor. It can be nullptr in case of in-place computation. Data type supported: same as @p input
-     * @param[in]      epsilon (Optional) Small float to avoid division by zero in case of zero standard deviation. Defaults to 1e-8.
-     */
-    void configure(ITensor *input, ITensor *output = nullptr, float epsilon = 1e-8f);
-    /** Static function to check if given info will lead to a valid configuration of @ref NEMeanStdDevNormalizationKernel
-     *
-     * @param[in] input   Source tensor info with 2 dimensions. In case of @p output tensor info = nullptr,
-     *                    this tensor will store the result of the normalization. Data types supported: F16/F32.
-     * @param[in] output  (Optional) Destination tensor info. It can be nullptr in case of in-place computation. Data type supported: same as @p input
-     * @param[in] epsilon (Optional) Small float to avoid division by zero in case of zero standard deviation. Defaults to 1e-8.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output = nullptr, float epsilon = 1e-8f);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-
-private:
-    /** Normalizes the input with respect to mean and standard deviation.
-     *
-     * @param[in] window Region on which to execute the kernel.
-     */
-    template <typename ScalarType, int size>
-    void mean_stddev_normalization(const Window &window);
-
-    ITensor *_input;
-    ITensor *_output;
-    float    _epsilon;
-
-    using MeanStdDevNormFunction = void (NEMeanStdDevNormalizationKernel::*)(const Window &window);
-
-    MeanStdDevNormFunction _func;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NEMEANSTDDEVNORMALIZATIONKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NEMedian3x3Kernel.h b/arm_compute/core/NEON/kernels/NEMedian3x3Kernel.h
deleted file mode 100644
index 3e86860f79..0000000000
--- a/arm_compute/core/NEON/kernels/NEMedian3x3Kernel.h
+++ /dev/null
@@ -1,54 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEMEDIAN3x3KERNEL_H
-#define ARM_COMPUTE_NEMEDIAN3x3KERNEL_H
-
-#include "arm_compute/core/NEON/INESimpleKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Kernel to perform a median filter on a tensor */
-class NEMedian3x3Kernel : public INESimpleKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEMedian3x3Kernel";
-    }
-    /** Set the source, destination and border mode of the kernel
-     *
-     * @param[in]  input            Source tensor. Data type supported: U8
-     * @param[out] output           Destination tensor. Data type supported: U8
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const ITensor *input, ITensor *output, bool border_undefined);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-    BorderSize border_size() const override;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEMEDIAN3x3KERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NEMemsetKernel.h b/arm_compute/core/NEON/kernels/NEMemsetKernel.h
deleted file mode 100644
index b4bcd11b82..0000000000
--- a/arm_compute/core/NEON/kernels/NEMemsetKernel.h
+++ /dev/null
@@ -1,71 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEMEMSETKERNEL_H
-#define ARM_COMPUTE_NEMEMSETKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-#include "arm_compute/core/PixelValue.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-// Forward declarations
-class ITensor;
-
-/** Interface for filling the planes of a tensor */
-class NEMemsetKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEMemsetKernel";
-    }
-    /** Default constructor */
-    NEMemsetKernel();
-    /** Default destructor */
-    ~NEMemsetKernel() = default;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEMemsetKernel(const NEMemsetKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEMemsetKernel &operator=(const NEMemsetKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NEMemsetKernel(NEMemsetKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NEMemsetKernel &operator=(NEMemsetKernel &&) = default;
-    /** Initialise the kernel's tensor and filling value
-     *
-     * @param[in,out] tensor         Input tensor to fill. Supported data types: All
-     * @param[in]     constant_value The value used to fill the planes of the tensor
-     */
-    void configure(ITensor *tensor, const PixelValue &constant_value);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-
-private:
-    ITensor   *_tensor;
-    PixelValue _constant_value;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEMEMSETKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NEMinMaxLayerKernel.h b/arm_compute/core/NEON/kernels/NEMinMaxLayerKernel.h
deleted file mode 100644
index 445e12af03..0000000000
--- a/arm_compute/core/NEON/kernels/NEMinMaxLayerKernel.h
+++ /dev/null
@@ -1,90 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef ARM_COMPUTE_NEMINMAXLAYERKERNEL_H
-#define ARM_COMPUTE_NEMINMAXLAYERKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-#include "support/Mutex.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for the kernel to perform min max search on a 3D tensor. */
-class NEMinMaxLayerKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEMinMaxLayerKernel";
-    }
-    /** Default constructor */
-    NEMinMaxLayerKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEMinMaxLayerKernel(const NEMinMaxLayerKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEMinMaxLayerKernel &operator=(const NEMinMaxLayerKernel &) = delete;
-    /** Prevent instances of this class from being moved (As this class contains non movable objects) */
-    NEMinMaxLayerKernel(NEMinMaxLayerKernel &&) = delete;
-    /** Prevent instances of this class from being moved (As this class contains non movable objects) */
-    NEMinMaxLayerKernel &operator=(NEMinMaxLayerKernel &&) = delete;
-    /** Default destructor */
-    ~NEMinMaxLayerKernel() = default;
-
-    /** Initialise the kernel's input and outputs.
-     *
-     * @note output[0] = minimum
-     * @note output[1] = maximum
-     *
-     * @param[in]  input  Input tensor with at least 3 dimensions. The dimensions over the third will be interpreted as batches. Data type supported: F32.
-     * @param[out] output Output tensor with shape [2, batches, ...] which stores the minimum and maximum value for each 3D input tensor.
-     *                    The dimensions over the second must match the batched dimensions of the input tensor. Data types supported: F32
-     */
-    void configure(const ITensor *input, ITensor *output);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLMinMaxLayerKernel
-     *
-     * @param[in] input  Input tensor info.  Data types supported: F32.
-     * @param[in] output Output tensor info with shape [2, batches, ...] which stores the minimum and maximum values for each 3D input tensor.
-     *                   The dimensions over the second must match the batched dimensions of the input tensor. Data types supported: F32.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output);
-    /** Resets global minimum and maximum. */
-    void reset();
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-
-private:
-    void update_min_max(float *out_ptr, float min, float max);
-    const ITensor     *_input;
-    ITensor           *_output;
-    arm_compute::Mutex _mtx;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NEMINMAXLAYERKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NEMinMaxLocationKernel.h b/arm_compute/core/NEON/kernels/NEMinMaxLocationKernel.h
deleted file mode 100644
index 597a093d70..0000000000
--- a/arm_compute/core/NEON/kernels/NEMinMaxLocationKernel.h
+++ /dev/null
@@ -1,171 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEMINMAXLOCATIONKERNEL_H
-#define ARM_COMPUTE_NEMINMAXLOCATIONKERNEL_H
-
-#include "arm_compute/core/IArray.h"
-#include "arm_compute/core/NEON/INEKernel.h"
-#include "support/Mutex.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-class ITensor;
-using IImage = ITensor;
-
-/** Interface for the kernel to perform min max search on an image. */
-class NEMinMaxKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEMinMaxKernel";
-    }
-    /** Default constructor */
-    NEMinMaxKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEMinMaxKernel(const NEMinMaxKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEMinMaxKernel &operator=(const NEMinMaxKernel &) = delete;
-    /** Prevent instances of this class from being moved (As this class contains non movable objects) */
-    NEMinMaxKernel(NEMinMaxKernel &&) = delete;
-    /** Prevent instances of this class from being moved (As this class contains non movable objects) */
-    NEMinMaxKernel &operator=(NEMinMaxKernel &&) = delete;
-    /** Default destructor */
-    ~NEMinMaxKernel() = default;
-
-    /** Initialise the kernel's input and outputs.
-     *
-     * @param[in]  input Input Image. Data types supported: U8/S16/F32.
-     * @param[out] min   Minimum value of image. Data types supported: S32 if input type is U8/S16, F32 if input type is F32.
-     * @param[out] max   Maximum value of image. Data types supported: S32 if input type is U8/S16, F32 if input type is F32.
-     */
-    void configure(const IImage *input, void *min, void *max);
-    /** Resets global minimum and maximum. */
-    void reset();
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-
-private:
-    /** Performs the min/max algorithm on U8 images on a given window.
-     *
-     * @param win The window to run the algorithm on.
-     */
-    void minmax_U8(Window win);
-    /** Performs the min/max algorithm on S16 images on a given window.
-     *
-     * @param win The window to run the algorithm on.
-     */
-    void minmax_S16(Window win);
-    /** Performs the min/max algorithm on F32 images on a given window.
-     *
-     * @param win The window to run the algorithm on.
-     */
-    void minmax_F32(Window win);
-    /** Common signature for all the specialised MinMax functions
-     *
-     * @param[in] window Region on which to execute the kernel.
-     */
-    using MinMaxFunction = void (NEMinMaxKernel::*)(Window window);
-    /** MinMax function to use for the particular image types passed to configure() */
-    MinMaxFunction _func;
-    /** Helper to update min/max values **/
-    template <typename T>
-    void update_min_max(T min, T max);
-
-    const IImage      *_input; /**< Input image. */
-    void              *_min;   /**< Minimum value. */
-    void              *_max;   /**< Maximum value. */
-    arm_compute::Mutex _mtx;   /**< Mutex used for result reduction. */
-};
-
-/** Interface for the kernel to find min max locations of an image. */
-class NEMinMaxLocationKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEMinMaxLocationKernel";
-    }
-    /** Default constructor */
-    NEMinMaxLocationKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEMinMaxLocationKernel(const NEMinMaxLocationKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEMinMaxLocationKernel &operator=(const NEMinMaxLocationKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NEMinMaxLocationKernel(NEMinMaxLocationKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NEMinMaxLocationKernel &operator=(NEMinMaxLocationKernel &&) = default;
-    /** Default destructor */
-    ~NEMinMaxLocationKernel() = default;
-
-    /** Initialise the kernel's input and outputs.
-     *
-     * @param[in]  input     Input Image. Data types supported: U8/S16/F32.
-     * @param[out] min       Minimum value of image. Data types supported: S32 if input type is U8/S16, F32 if input type is F32.
-     * @param[out] max       Maximum value of image. Data types supported: S32 if input type is U8/S16, F32 if input type is F32.
-     * @param[out] min_loc   Array of minimum value locations.
-     * @param[out] max_loc   Array of maximum value locations.
-     * @param[out] min_count Number of minimum value encounters.
-     * @param[out] max_count Number of maximum value encounters.
-     */
-    void configure(const IImage *input, void *min, void *max,
-                   ICoordinates2DArray *min_loc = nullptr, ICoordinates2DArray *max_loc = nullptr,
-                   uint32_t *min_count = nullptr, uint32_t *max_count = nullptr);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-    bool is_parallelisable() const override;
-
-private:
-    /** Performs the min/max location algorithm on T type images on a given window.
-     *
-     * @param win The window to run the algorithm on.
-     */
-    template <class T, bool count_min, bool count_max, bool loc_min, bool loc_max>
-    void minmax_loc(const Window &win);
-    /** Common signature for all the specialised MinMaxLoc functions
-     *
-     * @param[in] window Region on which to execute the kernel.
-     */
-    using MinMaxLocFunction = void (NEMinMaxLocationKernel::*)(const Window &window);
-    /** MinMaxLoc function to use for the particular image types passed to configure() */
-    MinMaxLocFunction _func;
-    /** Helper to create a function pointer table for the parameterized MinMaxLocation functions. */
-    template <class T, typename>
-    struct create_func_table;
-
-    const IImage        *_input;     /**< Input image. */
-    void                *_min;       /**< Minimum value. */
-    void                *_max;       /**< Maximum value. */
-    uint32_t            *_min_count; /**< Count of minimum value encounters. */
-    uint32_t            *_max_count; /**< Count of maximum value encounters. */
-    ICoordinates2DArray *_min_loc;   /**< Locations of minimum values. */
-    ICoordinates2DArray *_max_loc;   /**< Locations of maximum values. */
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEMINMAXLOCATIONKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NENonLinearFilterKernel.h b/arm_compute/core/NEON/kernels/NENonLinearFilterKernel.h
deleted file mode 100644
index 43594bacbf..0000000000
--- a/arm_compute/core/NEON/kernels/NENonLinearFilterKernel.h
+++ /dev/null
@@ -1,151 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NENONLINEARFILTERKERNEL_H
-#define ARM_COMPUTE_NENONLINEARFILTERKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-#include "arm_compute/core/Types.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for the kernel to apply a non-linear filter */
-class NENonLinearFilterKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NENonLinearFilterKernel";
-    }
-    /** Default constructor */
-    NENonLinearFilterKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NENonLinearFilterKernel(NENonLinearFilterKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NENonLinearFilterKernel &operator=(NENonLinearFilterKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NENonLinearFilterKernel(NENonLinearFilterKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NENonLinearFilterKernel &operator=(NENonLinearFilterKernel &&) = default;
-    /** Set the source, destination and border mode of the kernel
-     *
-     * @param[in]  input            Source tensor. Data type supported: U8
-     * @param[out] output           Destination tensor. Data type supported: U8
-     * @param[in]  function         Non linear function to perform
-     * @param[in]  mask_size        Mask size. Supported sizes: 3, 5
-     * @param[in]  pattern          Mask pattern
-     * @param[in]  mask             The given mask. Will be used only if pattern is specified to PATTERN_OTHER
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const ITensor *input, ITensor *output, NonLinearFilterFunction function, unsigned int mask_size, MatrixPattern pattern, const uint8_t *mask, bool border_undefined);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-    BorderSize border_size() const override;
-
-private:
-    /** Fill mask with the corresponding given pattern.
-     *
-     * @param[in,out] mask    Mask to be filled according to pattern
-     * @param[in]     cols    Columns (width) of mask
-     * @param[in]     rows    Rows (height) of mask
-     * @param[in]     pattern Pattern to fill the mask according to
-     */
-    void fill_mask(uint8_t *mask, int cols, int rows, MatrixPattern pattern);
-    /** Apply a median filter when given mask pattern is defined as box.
-     *
-     * @param[in] win Window to apply the filter on.
-     */
-    template <int mask_w, int mask_h>
-    void median_filter_box(const Window &win);
-    /** Apply a min filter when given mask pattern is defined as box.
-     *
-     * @param[in] win Window to apply the filter on.
-     */
-    template <int mask_w, int mask_h>
-    void min_filter_box(const Window &win);
-    /** Apply a max filter when given mask pattern is defined as box.
-     *
-     * @param[in] win Window to apply the filter on.
-     */
-    template <int mask_w, int mask_h>
-    void max_filter_box(const Window &win);
-    /** Apply a median filter when given mask pattern is defined as cross.
-     *
-     * @param[in] win Window to apply the filter on.
-     */
-    template <int mask_w, int mask_h>
-    void median_filter_cross(const Window &win);
-    /** Apply a min filter when given mask pattern is defined as cross.
-     *
-     * @param[in] win Window to apply the filter on.
-     */
-    template <int mask_w, int mask_h>
-    void min_filter_cross(const Window &win);
-    /** Apply a max filter when given mask pattern is defined as cross.
-     *
-     * @param[in] win Window to apply the filter on.
-     */
-    template <int mask_w, int mask_h>
-    void max_filter_cross(const Window &win);
-    /** Apply a median filter when given mask pattern is defined as disk.
-     *
-     * @param[in] win Window to apply the filter on.
-     */
-    template <int mask_w, int mask_h>
-    void median_filter_disk(const Window &win);
-    /** Apply a min filter when given mask pattern is defined as disk.
-     *
-     * @param[in] win Window to apply the filter on.
-     */
-    template <int mask_w, int mask_h>
-    void min_filter_disk(const Window &win);
-    /** Apply a max filter when given mask pattern is defined as disk.
-     *
-     * @param[in] win Window to apply the filter on.
-     */
-    template <int mask_w, int mask_h>
-    void max_filter_disk(const Window &win);
-    /** Apply a non-linear filter when given mask has user-defined pattern.
-     *
-     * @param[in] win Window to apply the filter on.
-     */
-    template <int mask_w, int mask_h>
-    void non_linear_filter_generic(const Window &win);
-
-private:
-    unsigned int            _border_width;
-    const ITensor          *_input;
-    ITensor                *_output;
-    const uint8_t          *_mask;
-    MatrixPattern           _pattern;
-    NonLinearFilterFunction _function;
-    unsigned int            _func_idx;
-    BorderSize              _border_size;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NENONLINEARFILTERKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h b/arm_compute/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h
deleted file mode 100644
index e2ddec9a33..0000000000
--- a/arm_compute/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h
+++ /dev/null
@@ -1,108 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NENONMAXIMASUPPRESSION3x3KERNEL_H
-#define ARM_COMPUTE_NENONMAXIMASUPPRESSION3x3KERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface to perform Non-Maxima suppression over a 3x3 window using NEON
- *
- * @note Used by @ref NEFastCorners and @ref NEHarrisCorners
- */
-class NENonMaximaSuppression3x3Kernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NENonMaximaSuppression3x3Kernel";
-    }
-    /** Default constructor */
-    NENonMaximaSuppression3x3Kernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NENonMaximaSuppression3x3Kernel(const NENonMaximaSuppression3x3Kernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NENonMaximaSuppression3x3Kernel &operator=(const NENonMaximaSuppression3x3Kernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NENonMaximaSuppression3x3Kernel(NENonMaximaSuppression3x3Kernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NENonMaximaSuppression3x3Kernel &operator=(NENonMaximaSuppression3x3Kernel &&) = default;
-    /** Default destructor */
-    ~NENonMaximaSuppression3x3Kernel() = default;
-
-    /** Initialise the kernel's sources, destinations and border mode.
-     *
-     * @param[in]  input            Source tensor. Data types supported: U8/F32
-     * @param[out] output           Destination tensor. Data types supported: same as @p input
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const ITensor *input, ITensor *output, bool border_undefined);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-    BorderSize border_size() const override;
-
-protected:
-    /** Common signature for all the specialised non-maxima suppression 3x3 functions
-     *
-     * @param[in]  input_ptr    Pointer to the input tensor.
-     * @param[out] output_ptr   Pointer to the output tensor
-     * @param[in]  input_stride Stride of the input tensor
-     */
-    using NonMaxSuppr3x3Function = void(const void *__restrict input_ptr, void *__restrict output_ptr, const uint32_t input_stride);
-
-    NonMaxSuppr3x3Function *_func;   /**< Non-Maxima suppression function to use for the particular tensor types passed to configure() */
-    const ITensor          *_input;  /**< Source tensor */
-    ITensor                *_output; /**< Destination tensor */
-};
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-/** NEON kernel to perform Non-Maxima suppression 3x3 with intermediate results in FP16 if the input data type is FP32
- */
-class NENonMaximaSuppression3x3FP16Kernel : public NENonMaximaSuppression3x3Kernel
-{
-public:
-    const char *name() const override
-    {
-        return "NENonMaximaSuppression3x3FP16Kernel";
-    }
-    /** Initialise the kernel's sources, destinations and border mode.
-     *
-     * @param[in]  input            Source tensor. Data types supported: U8/F32.
-     * @param[out] output           Destination tensor. Data types supported: same as @p input
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const ITensor *input, ITensor *output, bool border_undefined);
-};
-#else  /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-/** NEON kernel to perform Non-Maxima suppression 3x3 with intermediate results in FP16 if the input data type is FP32 */
-using NENonMaximaSuppression3x3FP16Kernel = NENonMaximaSuppression3x3Kernel;
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-} // namespace arm_compute
-#endif /* _ARM_COMPUTE_NENONMAXIMASUPPRESSION3x3KERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h b/arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h
deleted file mode 100644
index 4727164d00..0000000000
--- a/arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h
+++ /dev/null
@@ -1,110 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NENORMALIZATIONLAYERKERNEL_H
-#define ARM_COMPUTE_NENORMALIZATIONLAYERKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for the normalization layer kernel.
- */
-class NENormalizationLayerKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NENormalizationLayerKernel";
-    }
-    /** Default constructor */
-    NENormalizationLayerKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NENormalizationLayerKernel(const NENormalizationLayerKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NENormalizationLayerKernel &operator=(const NENormalizationLayerKernel &) = delete;
-    /** Default Move Constructor. */
-    NENormalizationLayerKernel(NENormalizationLayerKernel &&) = default;
-    /** Default move assignment operator */
-    NENormalizationLayerKernel &operator=(NENormalizationLayerKernel &&) = default;
-    /** Default destructor */
-    ~NENormalizationLayerKernel() = default;
-    /** Set the input and output tensors.
-     *
-     * @param[in]  input         Source tensor. 3 lower dims represent a single input with dimensions [width, height, IFM],
-     *                           and an optional 4th dimension for batch of inputs. Data types supported: FP16/F32. Data layouts supported: NCHW/NHWC.
-     * @param[in]  input_squared Source with each element has been squared. 3 lower dims represent a single input with dimensions [width, height, IFM],
-     *                           Data type and layout supported: same as @p input.
-     * @param[out] output        Destination tensor. Output will have the same number of dimensions as input. Data type and layout supported: same as @p input.
-     * @param[in]  norm_info     Normalization layer information like the normalization type, normalization size and other parameters.
-     */
-    void configure(const ITensor *input, const ITensor *input_squared, ITensor *output, NormalizationLayerInfo norm_info);
-    /** Static function to check if given info will lead to a valid configuration of @ref NENormalizationLayerKernel
-     *
-     * @param[in] input         Source tensor. 3 lower dims represent a single input with dimensions [width, height, IFM],
-     *                          and an optional 4th dimension for batch of inputs. Data types supported: FP16/F32. Data layouts supported: NCHW/NHWC.
-     * @param[in] input_squared Source with each element has been squared. 3 lower dims represent a single input with dimensions [width, height, IFM],
-     *                          Data type and layout supported: same as @p input.
-     * @param[in] output        Destination tensor. Output will have the same number of dimensions as input. Data type and layout supported: same as @p input.
-     * @param[in] norm_info     Normalization layer information like the normalization type, normalization size and other parameters.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *input_squared, const ITensorInfo *output, NormalizationLayerInfo norm_info);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-    BorderSize border_size() const override;
-
-private:
-    /** Function to perform normalization depending on the given template
-     *  dimension. The second template parameter specifies whether the
-     *  normalization has to be 1D or 2D.
-     *
-     * @note Only supported normalizations are:
-     *  - 1D over X or Z
-     *  - 2D over X and Y
-     *
-     * @param[in] window Region on which to execute the kernel.
-     */
-    template <typename T, unsigned int S, unsigned int dim, bool do_2D_norm>
-    void normalize_float(const Window &window);
-
-    /** Common signature for all the specialised normalization functions
-     *
-     * @param[in] window Region on which to execute the kernel.
-     */
-    using NormalizationFunction = void (NENormalizationLayerKernel::*)(const Window &window);
-
-private:
-    NormalizationFunction  _func;
-    const ITensor         *_input;
-    const ITensor         *_input_squared;
-    ITensor               *_output;
-    NormalizationLayerInfo _norm_info;
-    BorderSize             _border_size;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NENORMALIZATIONLAYERKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NEPadLayerKernel.h b/arm_compute/core/NEON/kernels/NEPadLayerKernel.h
deleted file mode 100644
index 4cbefbd1e3..0000000000
--- a/arm_compute/core/NEON/kernels/NEPadLayerKernel.h
+++ /dev/null
@@ -1,113 +0,0 @@
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEPADLAYERKERNEL_H
-#define ARM_COMPUTE_NEPADLAYERKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** NEON kernel to add padding to a tensor
- *
- * Add padding given padding information
- */
-class NEPadLayerKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEPadLayerKernel";
-    }
-    /** Default constructor */
-    NEPadLayerKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEPadLayerKernel(const NEPadLayerKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEPadLayerKernel &operator=(const NEPadLayerKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NEPadLayerKernel(NEPadLayerKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NEPadLayerKernel &operator=(NEPadLayerKernel &&) = default;
-    /** Default destructor */
-    ~NEPadLayerKernel() = default;
-
-    /** Initialize the function
-     *
-     * @param[in]  input          Source tensor. Data types supported: All.
-     * @param[out] output         Output tensor. Data type supported: same as @p input
-     * @param[in]  padding        The padding for each spatial dimension of the input tensor. The pair padding[i]
-     *                            specifies the front and the end padding in the i-th dimension.
-     * @param[in]  constant_value (Optional) Constant value to be used for the padding
-     * @param[in]  mode           (Optional) Controls whether the padding should be filled with @p constant_value using CONSTANT.
-     *                           Only CONSTANT padding mode is currently supported
-     */
-    void configure(ITensor *input, ITensor *output, const PaddingList &padding, const PixelValue constant_value = PixelValue(), const PaddingMode mode = PaddingMode::CONSTANT);
-    /**  Static function to check if given info will lead to a valid configuration of @ref NEPadLayer.
-     *
-     * @param[in] input          Source tensor info. Data types supported: All.
-     * @param[in] output         Output tensor info. Data type supported: same as @p input
-     * @param[in] padding        The padding for each spatial dimension of the input tensor. The pair padding[i]
-     *                           specifies the front and the end padding in the i-th dimension.
-     * @param[in] constant_value (Optional) Constant value to be used for the padding
-     * @param[in] mode           (Optional) Controls whether the padding should be filled with @p constant_value using CONSTANT.
-     *                           Only CONSTANT padding mode is currently supported
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, const PixelValue constant_value = PixelValue(), const PaddingMode mode = PaddingMode::CONSTANT);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-
-private:
-    /** Template function to run the padding function with constant padding
-     *
-     * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
-     */
-    template <typename T>
-    void run_pad_constant(const Window &window);
-
-    /** Function to run the padding function with constant padding for 3D input and 1D, 2D, 3D padding
-     *
-     * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
-     */
-    void run_pad_constant_uint8_3Dinput_3Dpad(const Window &window);
-
-    /** Common signature for all the specialised permute functions
-     *
-     * @param[in] window Region on which to execute the kernel.
-     */
-    using PadFunctionPtr = void (NEPadLayerKernel::*)(const Window &window);
-
-    PadFunctionPtr _func;
-    const ITensor *_input;
-    ITensor       *_output;
-    PaddingList    _padding;
-    PixelValue     _constant_value;
-    PaddingMode    _mode;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEPADLAYERKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NEPermuteKernel.h b/arm_compute/core/NEON/kernels/NEPermuteKernel.h
deleted file mode 100644
index 89dc4e6fc7..0000000000
--- a/arm_compute/core/NEON/kernels/NEPermuteKernel.h
+++ /dev/null
@@ -1,102 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEPERMUTEKERNEL_H
-#define ARM_COMPUTE_NEPERMUTEKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-// Forward declarations
-class ITensor;
-
-/** NEON kernel to perform tensor permutation.
- *
- * Permutes given a permutation vector
- */
-class NEPermuteKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEPermuteKernel";
-    }
-    /** Default constructor */
-    NEPermuteKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEPermuteKernel(const NEPermuteKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEPermuteKernel &operator=(const NEPermuteKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NEPermuteKernel(NEPermuteKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NEPermuteKernel &operator=(NEPermuteKernel &&) = default;
-    /** Default destructor */
-    ~NEPermuteKernel() = default;
-
-    /** Set the input and output of the kernel.
-     *
-     * @note Arbitrary permutation vectors are supported with rank not greater than 4
-     *
-     * @param[in]  input  The input tensor to permute. Data types supported: All
-     * @param[out] output The output tensor. Data types supported: Same as @p input
-     * @param[in]  perm   Permutation vector
-     */
-    void configure(const ITensor *input, ITensor *output, const PermutationVector &perm);
-    /** Static function to check if given info will lead to a valid configuration of @ref CPPPermuteKernel
-     *
-     * @note Arbitrary permutation vectors are supported with rank not greater than 4
-     *
-     * @param[in] input  The input tensor to permute. Data types supported: All
-     * @param[in] output The output tensor. Data types supported: Same as @p input
-     * @param[in] perm   Permutation vector
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const PermutationVector &perm);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-
-private:
-    /** Template function to run the permute
-     *
-     * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
-     */
-    template <typename T>
-    void run_permute(const Window &window);
-
-    /** Common signature for all the specialised permute functions
-     *
-     * @param[in] window Region on which to execute the kernel.
-     */
-    using PermuteFunctionPtr = void (NEPermuteKernel::*)(const Window &window);
-
-    PermuteFunctionPtr _func;
-    const ITensor     *_input;
-    ITensor           *_output;
-    PermutationVector  _perm;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEPERMUTEKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h b/arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h
deleted file mode 100644
index 1a9dd6be2e..0000000000
--- a/arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h
+++ /dev/null
@@ -1,195 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEPIXELWISEMULTIPLICATIONKERNEL_H
-#define ARM_COMPUTE_NEPIXELWISEMULTIPLICATIONKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for the kernel to perform addition between two tensors */
-class NEPixelWiseMultiplicationKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEPixelWiseMultiplicationKernel";
-    }
-    /** Default constructor */
-    NEPixelWiseMultiplicationKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEPixelWiseMultiplicationKernel(const NEPixelWiseMultiplicationKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEPixelWiseMultiplicationKernel &operator=(const NEPixelWiseMultiplicationKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NEPixelWiseMultiplicationKernel(NEPixelWiseMultiplicationKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NEPixelWiseMultiplicationKernel &operator=(NEPixelWiseMultiplicationKernel &&) = default;
-    /** Default destructor */
-    ~NEPixelWiseMultiplicationKernel() = default;
-    /** Initialise the kernel's input, output and border mode.
-     *
-     * @note For @p scale equal to 1/255 only round to nearest even (implemented as round half up) is supported.
-     *       For all other scale values only round to zero (implemented as round towards minus infinity) is supported.
-     *
-     * @param[in]  input1          An input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32
-     * @param[in]  input2          An input tensor. Data types supported: U8, QASYMM8 (only if @p input1 is QASYMM8), QASYMM8_SIGNED (only if @p input1 is QASYMM8_SIGNED), S16, QSYMM16 (only if @p input1 is QSYMM16), F16 (only if @p input1 is F16), F32 (only if @p input1 is F32).
-     * @param[out] output          Output tensor. Data types supported:
-     *                             - U8, only if both inputs are U8.
-     *                             - QASYMM8, only if both inputs are QASYMM8.
-     *                             - QASYMM8_SIGNED, only if @p input1 is QASYMM8_SIGNED.
-     *                             - S16.
-     *                             - QSYMM16, only if both inputs are QSYMM16.
-     *                             - S32, only if both inputs are QSYMM16.
-     *                             - F16, only if @p input1 is F16.
-     *                             - F32, only if both inputs are F32.
-     * @param[in]  scale           Scale to apply after multiplication.
-     *                             Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15.
-     * @param[in]  overflow_policy Overflow policy. ConvertPolicy cannot be WRAP if datatype is QASYMM8, QASYMM8_SIGNED or QSYMM16.
-     * @param[in]  rounding_policy Rounding policy.
-     */
-    void configure(const ITensor *input1, const ITensor *input2, ITensor *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy);
-    /** Static function to check if given info will lead to a valid configuration of @ref NEPixelWiseMultiplicationKernel
-     *
-     * @note For @p scale equal to 1/255 only round to nearest even (implemented as round half up) is supported.
-     *       For all other scale values only round to zero (implemented as round towards minus infinity) is supported.
-     *
-     * @param[in] input1          An input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/F16/F32
-     * @param[in] input2          An input tensor info. Data types supported: U8, QASYMM8 (only if @p input1 is QASYMM8), QASYMM8_SIGNED (only if @p input1 is QASYMM8_SIGNED), S16, QSYMM16 (only if @p input1 is QSYMM16), F16 (only if @p input1 is F16), F32 (only if @p input1 is F32).
-     * @param[in] output          Output tensor info. Data types supported:
-     *                            - U8, only if both inputs are U8.
-     *                            - QASYMM8, only if both inputs are QASYMM8.
-     *                            - QASYMM8_SIGNED, only if @p input1 is QASYMM8_SIGNED.
-     *                            - S16.
-     *                            - QSYMM16, only if both inputs are QSYMM16.
-     *                            - S32, only if both inputs are QSYMM16.
-     *                            - F16, only if @p input1 is F16.
-     *                            - F32, only if both inputs are F32.
-     * @param[in] scale           Scale to apply after multiplication.
-     *                            Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15.
-     * @param[in] overflow_policy Overflow policy. ConvertPolicy cannot be WRAP if datatype is QASYMM8, QASYMM8_SIGNED or QSYMM16.
-     * @param[in] rounding_policy Rounding policy.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-    BorderSize border_size() const override;
-
-private:
-    /** Common signature for all the specialised multiplication functions with integer scaling factor
-     *
-     * @param[in]  input1_ptr Pointer to the first input tensor.
-     * @param[in]  input2_ptr Pointer to the second input tensor.
-     * @param[out] output_ptr Pointer to the output tensor.
-     * @param[in]  scale      Integer scale factor.
-     */
-    using MulFunctionInt = void(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int scale);
-    /** Common signature for all the specialised multiplication functions with float scaling factor
-     *
-     * @param[in]  input1_ptr Pointer to the first input tensor.
-     * @param[in]  input2_ptr Pointer to the second input tensor.
-     * @param[out] output_ptr Pointer to the output tensor.
-     * @param[in]  scale      Float scale factor.
-     */
-    using MulFunctionFloat = void(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, float scale);
-    /** Common signature for all the specialised QASYMM8 multiplication functions with float scaling factor
-     *
-     * @param[in]  input1_ptr      Pointer to the first input tensor.
-     * @param[in]  input2_ptr      Pointer to the second input tensor.
-     * @param[out] output_ptr      Pointer to the output tensor.
-     * @param[in]  scale           Float scale factor.
-     * @param[in]  input1_qua_info Quantization Info of tensor input1.
-     * @param[in]  input2_qua_info Quantization Info of tensor input2.
-     * @param[in]  output_qua_info Quantization Info of tensor output.
-     *
-     */
-    using MulFunctionQuantized = void(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, float scale,
-                                      const UniformQuantizationInfo &input1_qua_info, const UniformQuantizationInfo &input2_qua_info, const UniformQuantizationInfo &output_qua_info);
-
-    MulFunctionFloat     *_func_float;
-    MulFunctionInt       *_func_int;
-    MulFunctionQuantized *_func_quantized;
-
-private:
-    const ITensor *_input1;
-    const ITensor *_input2;
-    ITensor       *_output;
-    float          _scale;
-    int            _scale_exponent;
-    bool           _run_optimized_qasymm8;
-};
-
-/** Interface for the complex pixelwise multiplication kernel. */
-class NEComplexPixelWiseMultiplicationKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEComplexPixelWiseMultiplicationKernel";
-    }
-    /** Default constructor.*/
-    NEComplexPixelWiseMultiplicationKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEComplexPixelWiseMultiplicationKernel(const NEComplexPixelWiseMultiplicationKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEComplexPixelWiseMultiplicationKernel &operator=(const NEComplexPixelWiseMultiplicationKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NEComplexPixelWiseMultiplicationKernel(NEComplexPixelWiseMultiplicationKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NEComplexPixelWiseMultiplicationKernel &operator=(NEComplexPixelWiseMultiplicationKernel &&) = default;
-    /** Initialise the kernel's input, output and border mode.
-     *
-     * @param[in]  input1 An input tensor. Data types supported: F32. Number of channels supported: 2 (complex tensor).
-     * @param[in]  input2 An input tensor. Data types supported: same as @p input1. Number of channels supported: same as @p input1.
-     * @param[out] output The output tensor, Data types supported: same as @p input1.  Number of channels supported: same as @p input1.
-     */
-    void configure(const ITensor *input1, const ITensor *input2, ITensor *output);
-    /** Static function to check if given info will lead to a valid configuration of @ref NEComplexPixelWiseMultiplicationKernel
-     *
-     * @param[in] input1 An input tensor info. Data types supported: F32. Number of channels supported: 2 (complex tensor).
-     * @param[in] input2 An input tensor info. Data types supported: same as @p input1. Number of channels supported: same as @p input1.
-     * @param[in] output The output tensor info. Data types supported: same as @p input1. Number of channels supported: same as @p input1.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-    BorderSize border_size() const override;
-
-private:
-    const ITensor *_input1;
-    const ITensor *_input2;
-    ITensor       *_output;
-};
-
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEPIXELWISEMULTIPLICATIONKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h b/arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h
deleted file mode 100644
index b0574b7cf6..0000000000
--- a/arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h
+++ /dev/null
@@ -1,222 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEPOOLINGLAYERKERNEL_H
-#define ARM_COMPUTE_NEPOOLINGLAYERKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for the pooling layer kernel */
-class NEPoolingLayerKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEPoolingLayerKernel";
-    }
-    /** Default constructor */
-    NEPoolingLayerKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEPoolingLayerKernel(const NEPoolingLayerKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEPoolingLayerKernel &operator=(const NEPoolingLayerKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NEPoolingLayerKernel(NEPoolingLayerKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NEPoolingLayerKernel &operator=(NEPoolingLayerKernel &&) = default;
-    /** Default destructor */
-    ~NEPoolingLayerKernel() = default;
-    /** Set the input and output tensors.
-     *
-     * @note F16 are supported for pool sizes 2 and 3 only
-     *
-     * @param[in]  input     Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
-     * @param[out] output    Destination tensor. Data types supported: Same as @p input.
-     * @param[in]  pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
-     * @param[out] indices   (optional) The indices of the maximal values. Data type supported: U32.
-     */
-    void configure(const ITensor *input, ITensor *output, const PoolingLayerInfo &pool_info, ITensor *indices = nullptr);
-    /** Static function to check if given info will lead to a valid configuration of @ref NEPoolingLayerKernel
-     *
-     * @note F16 are supported for pool sizes 2 and 3 only
-     *
-     * @param[in] input     Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
-     * @param[in] output    Destination tensor. Data types supported: Same as @p input.
-     * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
-     * @param[in] indices   (optional) The indices of the maximal values. Data type supported: U32.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, const ITensorInfo *indices = nullptr);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-    BorderSize border_size() const override;
-
-private:
-    /** Function to perform 2x2 pooling.
-     *
-     * @param[in] window_input    Input region on which to execute the kernel.
-     * @param[in] window          Output region on which to execute the kernel.
-     * @param[in] pooling_type    Pooling operation to be computed.
-     * @param[in] exclude_padding Flag to specify exclusion of padding from the operation.
-     */
-    void pooling2_f32_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding = false);
-    /** Function to perform 2x2 pooling and compute the pooling indices. The indices can be used for max unpool.
-     *
-     * @param[in] window_input Input region on which to execute the kernel.
-     * @param[in] window       Output region on which to execute the kernel.
-     */
-    void pooling2_f32_nchw_maxpool_indices(const Window &window_input, const Window &window);
-    /** Function to perform 2x2 pooling and compute the pooling indices. The indices can be used for max unpool.
-     *
-     * @param[in] window_input Input region on which to execute the kernel.
-     * @param[in] window       Output region on which to execute the kernel.
-     */
-    void pooling2_f32_nhwc_maxpool_indices(const Window &window_input, const Window &window);
-    /** Function to perform MxN pooling for 32-bit floating point values.
-     *
-     * @param[in] window_input    Input region on which to execute the kernel.
-     * @param[in] window          Output region on which to execute the kernel.
-     * @param[in] pooling_type    Pooling operation to be computed.
-     * @param[in] exclude_padding Flag to specify exclusion of padding from the operation.
-     */
-    void poolingMxN_f32_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding = false);
-    /** Function to perform MxN pooling for 32-bit floating point values (NHWC).
-     *
-     * @param[in] window_input    Input region on which to execute the kernel.
-     * @param[in] window          Output region on which to execute the kernel.
-     * @param[in] pooling_type    Pooling operation to be computed.
-     * @param[in] exclude_padding Flag to specify exclusion of padding from the operation.
-     */
-    void poolingMxN_f32_nhwc(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding = false);
-    /** Function to perform 7x7 pooling.
-     *
-     * @param[in] window_input    Input region on which to execute the kernel.
-     * @param[in] window          Output region on which to execute the kernel.
-     * @param[in] pooling_type    Pooling operation to be computed.
-     * @param[in] exclude_padding Flag to specify exclusion of padding from the operation.
-     */
-    void pooling7_f32_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding = false);
-    /** Function to perform 3x3 pooling.
-     *
-     * @param[in] window_input    Input region on which to execute the kernel.
-     * @param[in] window          Output region on which to execute the kernel.
-     * @param[in] pooling_type    Pooling operation to be computed.
-     * @param[in] exclude_padding Flag to specify exclusion of padding from the operation.
-     */
-    void pooling3_f32_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding = false);
-    /** Function to perform 2x2 pooling for float16_t.
-     *
-     * @param[in] window_input    Input region on which to execute the kernel.
-     * @param[in] window          Output region on which to execute the kernel.
-     * @param[in] pooling_type    Pooling operation to be computed.
-     * @param[in] exclude_padding Flag to specify exclusion of padding from the operation.
-     */
-    void pooling2_f16_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding = false);
-    /** Function to perform 3x3 pooling.
-     *
-     * @param[in] window_input    Input region on which to execute the kernel.
-     * @param[in] window          Output region on which to execute the kernel.
-     * @param[in] pooling_type    Pooling operation to be computed.
-     * @param[in] exclude_padding Flag to specify exclusion of padding from the operation.
-     */
-    void pooling3_f16_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding = false);
-    /** Function to perform MxN pooling for 16-bit floating point values.
-     *
-     * @param[in] window_input    Input region on which to execute the kernel.
-     * @param[in] window          Output region on which to execute the kernel.
-     * @param[in] pooling_type    Pooling operation to be computed.
-     * @param[in] exclude_padding Flag to specify exclusion of padding from the operation.
-     */
-    void poolingMxN_f16_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding = false);
-    /** Function to perform MxN pooling for 16-bit floating point values. (NHWC)
-     *
-     * @param[in] window_input    Input region on which to execute the kernel.
-     * @param[in] window          Output region on which to execute the kernel.
-     * @param[in] pooling_type    Pooling operation to be computed.
-     * @param[in] exclude_padding Flag to specify exclusion of padding from the operation.
-     */
-    void poolingMxN_f16_nhwc(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding = false);
-    /** Template function to perform 2x2 pooling for 8bit quantized fixed point. (NCHW)
-     *
-     * @param[in] window_input    Input region on which to execute the kernel.
-     * @param[in] window          Output region on which to execute the kernel.
-     * @param[in] pooling_type    Pooling operation to be computed.
-     * @param[in] exclude_padding Flag to specify exclusion of padding from the operation.
-     */
-    template <typename T>
-    void pooling2_q8_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding = false);
-    /** Template function to perform 3x3 pooling for 8bit quantized fixed point. (NCHW)
-     *
-     * @param[in] window_input    Input region on which to execute the kernel.
-     * @param[in] window          Output region on which to execute the kernel.
-     * @param[in] pooling_type    Pooling operation to be computed.
-     * @param[in] exclude_padding Flag to specify exclusion of padding from the operation.
-     */
-    template <typename T>
-    void pooling3_q8_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding = false);
-    /** Template function to perform MxN pooling for 8-bit quantized. (NCHW)
-     *
-     * @param[in] window_input    Input region on which to execute the kernel.
-     * @param[in] window          Output region on which to execute the kernel.
-     * @param[in] pooling_type    Pooling operation to be computed.
-     * @param[in] exclude_padding Flag to specify exclusion of padding from the operation.
-     */
-    template <typename T>
-    void poolingMxN_q8_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding = false);
-    /** Template function to perform MxN pooling for 8-bit quantized. (NHWC)
-     *
-     * @param[in] window_input    Input region on which to execute the kernel.
-     * @param[in] window          Output region on which to execute the kernel.
-     * @param[in] pooling_type    Pooling operation to be computed.
-     * @param[in] exclude_padding Flag to specify exclusion of padding from the operation.
-     */
-    template <typename T>
-    void poolingMxN_q8_nhwc(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding = false);
-    /** Common signature for all the specialised Pooling functions
-     *
-     * @param[in] window_input    Input region on which to execute the kernel.
-     * @param[in] window          Output region on which to execute the kernel.
-     * @param[in] pooling_type    Pooling operation to be computed.
-     * @param[in] exclude_padding Flag to specify exclusion of padding from the operation.
-     */
-    using PoolingFunction = void (NEPoolingLayerKernel::*)(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding);
-
-private:
-    PoolingFunction  _func;
-    const ITensor   *_input;
-    ITensor         *_output;
-    ITensor         *_indices;
-    PoolingLayerInfo _pool_info;
-    DataLayout       _data_layout;
-    unsigned int     _num_elems_processed_per_iteration;
-    BorderSize       _border_size;
-    bool             _is_square;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEPOOLINGLAYERKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NEPriorBoxLayerKernel.h b/arm_compute/core/NEON/kernels/NEPriorBoxLayerKernel.h
deleted file mode 100644
index 6bf6574568..0000000000
--- a/arm_compute/core/NEON/kernels/NEPriorBoxLayerKernel.h
+++ /dev/null
@@ -1,98 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEPRIORBOXLAYERKERNEL_H
-#define ARM_COMPUTE_NEPRIORBOXLAYERKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for the kernel to calculate prior boxes */
-class NEPriorBoxLayerKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEPriorBoxLayerKernel";
-    }
-    /** Default constructor */
-    NEPriorBoxLayerKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEPriorBoxLayerKernel(const NEPriorBoxLayerKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEPriorBoxLayerKernel &operator=(const NEPriorBoxLayerKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NEPriorBoxLayerKernel(NEPriorBoxLayerKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NEPriorBoxLayerKernel &operator=(NEPriorBoxLayerKernel &&) = default;
-    /** Set the input and output tensors.
-     *
-     * @param[in]  input1 First source tensor. Data types supported: F32. Data layouts supported: NCHW/NHWC.
-     * @param[in]  input2 Second source tensor. Data types and layouts supported: same as @p input1
-     * @param[out] output Destination tensor. Output dimensions are [W * H * num_priors * 4, 2]. Data type supported: same as @p input
-     * @param[in]  info   Prior box layer info.
-     */
-    void configure(const ITensor *input1, const ITensor *input2, ITensor *output, const PriorBoxLayerInfo &info);
-    /** Static function to check if given info will lead to a valid configuration of @ref NEPriorBoxLayerKernel
-     *
-     * @param[in] input1 First source tensor info. Data types supported: F32. Data layouts supported: NCHW/NHWC.
-     * @param[in] input2 Second source tensor info. Data types and layouts supported: same as @p input1
-     * @param[in] output Destination tensor info. Output dimensions are [W * H * num_priors * 4, 2]. Data type supported: same as @p input
-     * @param[in] info   Prior box layer info.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const PriorBoxLayerInfo &info);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-
-private:
-    /** Stores the coordinates of the calculated prior boxes.
-     *
-     * @param[out] out        Output pointer.
-     * @param[in]  offset     Output offset to write to.
-     * @param[in]  center_x   Center pixel value on x-axis.
-     * @param[in]  center_y   Center pixel value on y-axis.
-     * @param[in]  box_width  Prior box width.
-     * @param[in]  box_height Prior box height.
-     * @param[in]  width      Input width.
-     * @param[in]  height     Input height.
-     */
-    void store_coordinates(float *out, const int offset, const float center_x, const float center_y, const float box_width, const float box_height, const int width, const int height);
-    /** Function to calculate prior boxes.
-     *
-     * @param[in] window Input region on which to execute the kernel.
-     */
-    void calculate_prior_boxes(const Window &window);
-
-    const ITensor    *_input1;
-    const ITensor    *_input2;
-    ITensor          *_output;
-    PriorBoxLayerInfo _info;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NEPRIORBOXLAYERKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h b/arm_compute/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h
deleted file mode 100644
index f5e8da7feb..0000000000
--- a/arm_compute/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h
+++ /dev/null
@@ -1,137 +0,0 @@
-/*
- * Copyright (c) 2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEQLSTMLAYERNORMALIZATIONKERNEL_H
-#define ARM_COMPUTE_NEQLSTMLAYERNORMALIZATIONKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-#include <functional>
-
-namespace arm_compute
-{
-class ITensor;
-
-/** NEON kernel to perform layer normalization */
-class NEQLSTMLayerNormalizationKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEQLSTMLayerNormalizationKernel";
-    }
-    /** Default constructor */
-    NEQLSTMLayerNormalizationKernel() = default;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEQLSTMLayerNormalizationKernel(const NEQLSTMLayerNormalizationKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEQLSTMLayerNormalizationKernel &operator=(const NEQLSTMLayerNormalizationKernel &) = delete;
-    /** Default Move Constructor. */
-    NEQLSTMLayerNormalizationKernel(NEQLSTMLayerNormalizationKernel &&) = default;
-    /** Default move assignment operator */
-    NEQLSTMLayerNormalizationKernel &operator=(NEQLSTMLayerNormalizationKernel &&) = default;
-    /** Default destructor */
-    ~NEQLSTMLayerNormalizationKernel() = default;
-
-    /** Set the input and output tensors.
-     *
-     * @param[in]  input  Source tensor. Data types supported: QSYMM16.
-     * @param[out] output Destination tensor. Data types supported: Same as @p input.
-     * @param[in]  weight Weight tensor. Data types supported: Same as @p input.
-     * @param[in]  bias   Bias tensor. Data types supported: S32
-     */
-    void configure(const ITensor *input, ITensor *output, const ITensor *weight, const ITensor *bias);
-    /** Static function to check if given info will lead to a valid configuration of @ref NEQLSTMLayerNormalizationKernel
-     *
-     * @param[in] input  Source tensor info. Data types supported: QSYMM16.
-     * @param[in] output Destination tensor info. Data types supported: Same as @p input.
-     * @param[in] weight Weight tensor info. Data types supported: Same as @p input.
-     * @param[in] bias   Bias tensor info. Data types supported: S32
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *weight, const ITensorInfo *bias);
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-
-private:
-    // constants
-    static constexpr uint32_t max_input_dimension{ 2 };  /**< The maximum input dimension supported */
-    static constexpr uint32_t max_weight_dimension{ 1 }; /**< The maximum weight dimension supported */
-    static constexpr uint32_t max_bias_dimension{ 1 };   /**< The maximum bias dimension supported */
-    static constexpr uint32_t vector_size_byte{ 16 };    /**< Computation vector size in byte */
-
-    using ComputeFuncType = std::function<void(NEQLSTMLayerNormalizationKernel &)>;
-
-    ComputeFuncType _fn{}; /**< Function pointer to computation function */
-
-    const ITensor *_input{ nullptr };  /**< Input tensor */
-    const ITensor *_weight{ nullptr }; /**< Weight tensor */
-    const ITensor *_bias{ nullptr };   /**< Bias tensor */
-    ITensor       *_output{ nullptr }; /**< Output tensor */
-
-    int32_t _output_multiplier{}; /**< Multiplier for output values */
-    int32_t _output_shift{};      /**< Shift value for output values */
-
-    int32_t _window_start_x{}; /**< The beginning of x-axis iteration */
-    int32_t _window_end_x{};   /**< The end of x-axis iteration */
-    int32_t _window_step_x{};  /**< The size of x-axis iteration's step */
-
-    Window _inout_window{};  /**< Window for input and output tensor */
-    Window _weight_window{}; /**< Window for weight and bias tensor */
-
-    /** Function to configure initial windows for destination of computation
-     *
-     * @param[in] Target destination tensor to use for output window
-     *
-     * @return configured window
-     */
-    Window configure_window(ITensor *target);
-    // Function to compute for data type QSYMM16
-    void compute_qsymm16();
-    /** Function to compute summation and summation of squared input of the given input pointer
-     *
-     * @param[in] Input_ptr pointer to input array
-     *
-     */
-    std::pair<int64_t, int64_t> sum_qsymm16(const int16_t *input_ptr);
-    /** Function to normalize values using computed mean and standard deviation
-     *
-     * @param[in] input_ptr     Pointer to input array
-     * @param[in] output_ptr    Pointer to output array
-     * @param[in] weight_ptr    Pointer to weight array
-     * @param[in] bias_ptr      Pointer to bias array
-     * @param[in] mean          Mean value
-     * @param[in] inv_std_mul   Quantized multiplier for standard deviation
-     * @param[in] inv_std_shift Shift for standard deviation
-     *
-     */
-    void normalize_qasymm16(const int16_t *input_ptr,
-                            int16_t       *output_ptr,
-                            const int16_t *weight_ptr,
-                            const int32_t *bias_ptr,
-                            int32_t mean, int32_t inv_std_mul, int32_t inv_std_shift);
-    /** Function to compute output quantization information */
-    QuantizationInfo compute_output_qinfo();
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NEQLSTMLAYERNORMALIZATIONKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NEQuantizationLayerKernel.h b/arm_compute/core/NEON/kernels/NEQuantizationLayerKernel.h
deleted file mode 100644
index 087e767b73..0000000000
--- a/arm_compute/core/NEON/kernels/NEQuantizationLayerKernel.h
+++ /dev/null
@@ -1,102 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEQUANTIZATIONLAYERKERNEL_H
-#define ARM_COMPUTE_NEQUANTIZATIONLAYERKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for the quantization layer kernel.
- *
- * @note The implementation supports only 3D input tensors
- *
- */
-class NEQuantizationLayerKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEQuantizationLayerKernel";
-    }
-    /** Default constructor */
-    NEQuantizationLayerKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEQuantizationLayerKernel(const NEQuantizationLayerKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEQuantizationLayerKernel &operator=(const NEQuantizationLayerKernel &) = delete;
-    /** Default Move Constructor. */
-    NEQuantizationLayerKernel(NEQuantizationLayerKernel &&) = default;
-    /** Default move assignment operator */
-    NEQuantizationLayerKernel &operator=(NEQuantizationLayerKernel &&) = default;
-    /** Default destructor */
-    ~NEQuantizationLayerKernel() = default;
-    /** Set the input, output.
-     *
-     * @param[in]  input  Source tensor. The dimensions over the third will be interpreted as batches. Data types supported: QASYMM8/QASYMM8_SIGNED/F32/F16.
-     * @param[out] output Destination tensor with the same dimensions of input. Data types supported: QASYMM8/QASYMM8_SIGNED/QASYMM16.
-     *
-     * @note Output auto initialization is not supported by this kernel
-     */
-    void configure(const ITensor *input, ITensor *output);
-    /** Static function to check if given info will lead to a valid configuration of @ref NEQuantizationLayerKernel
-     *
-     * @param[in] input  Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F32/F16.
-     * @param[in] output Output tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/QASYMM16.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-
-private:
-    /** Common signature for all the specialised @ref NEQuantizationLayerKernel functions
-     *
-     * @param[in] window Region on which to execute the kernel.
-     */
-    using QuantizationFunctionExecutorPtr = void (NEQuantizationLayerKernel::*)(const Window &window);
-    /** Function to apply QASYMM8 or QASYMM8_SIGNED quantization on a tensor.
-     *
-     * @param[in] window Region on which to execute the kernel.
-     */
-    template <typename TIn, typename TOut>
-    void run_quantize_qasymm8(const Window &window);
-    /** Function to apply QASYMM16 quantization on a tensor.
-     *
-     * @param[in] window Region on which to execute the kernel.
-     */
-    template <typename T>
-    void run_quantize_qasymm16(const Window &window);
-
-    const ITensor *_input;
-    ITensor       *_output;
-
-    QuantizationFunctionExecutorPtr _func;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEQUANTIZATIONLAYERKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NEROIAlignLayerKernel.h b/arm_compute/core/NEON/kernels/NEROIAlignLayerKernel.h
deleted file mode 100644
index bebcab5359..0000000000
--- a/arm_compute/core/NEON/kernels/NEROIAlignLayerKernel.h
+++ /dev/null
@@ -1,101 +0,0 @@
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEROIALIGNLAYERKERNEL_H
-#define ARM_COMPUTE_NEROIALIGNLAYERKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for the RoIAlign kernel.
- */
-class NEROIAlignLayerKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEROIAlignLayerKernel";
-    }
-
-    /** Constructor */
-    NEROIAlignLayerKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEROIAlignLayerKernel(const NEROIAlignLayerKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEROIAlignLayerKernel &operator=(const NEROIAlignLayerKernel &) = delete;
-    /** Default Move Constructor. */
-    NEROIAlignLayerKernel(NEROIAlignLayerKernel &&) = default;
-    /** Default move assignment operator. */
-    NEROIAlignLayerKernel &operator=(NEROIAlignLayerKernel &&) = default;
-    /** Default destructor */
-    ~NEROIAlignLayerKernel() = default;
-
-    /** Set the input and output tensors.
-     *
-     * @param[in]  input     Source tensor. Data types supported: QASYMM8/F16/F32.
-     * @param[in]  rois      ROIs tensor, it is a 2D tensor of size [5, N] (where N is the number of ROIs) containing top left and bottom right corner
-     *                       as coordinate of an image and batch_id of ROI [ batch_id, x1, y1, x2, y2 ].
-     *                       Data types supported: QASYMM16 with scale of 0.125 and 0 offset if @p input is QASYMM8, otherwise same as @p input
-     * @param[out] output    Destination tensor. Data types supported: Same as @p input.
-     * @param[in]  pool_info Contains pooling operation information described in @ref ROIPoolingLayerInfo.
-     *
-     * @note The x and y dimensions of @p output tensor must be the same as @p pool_info 's pooled
-     * width and pooled height.
-     * @note The z dimensions of @p output tensor and @p input tensor must be the same.
-     * @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois array.
-     */
-    void configure(const ITensor *input, const ITensor *rois, ITensor *output, const ROIPoolingLayerInfo &pool_info);
-    /** Static function to check if given info will lead to a valid configuration of @ref NEROIAlignLayerKernel
-     *
-     * @param[in] input     Source tensor info. Data types supported: QASYMM8/F16/F32.
-     * @param[in] rois      ROIs tensor info. Data types supported: QASYMM16 with scale of 0.125 and 0 offset if @p input is QASYMM8,
-     *                      otherwise same as @p input
-     * @param[in] output    Destination tensor info. Data types supported: Same as @p input.
-     * @param[in] pool_info Contains pooling operation information described in @ref ROIPoolingLayerInfo.
-     *
-     * @note The x and y dimensions of @p output tensor must be the same as @p pool_info 's pooled
-     * width and pooled height.
-     * @note The z dimensions of @p output tensor and @p input tensor must be the same.
-     * @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois array.
-     *
-     * @return a Status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-
-private:
-    template <DataLayout data_layout, typename input_data_type, typename roi_data_type = input_data_type>
-    void internal_run(const Window &window, const ThreadInfo &info);
-
-    const ITensor      *_input;
-    ITensor            *_output;
-    const ITensor      *_rois;
-    ROIPoolingLayerInfo _pool_info;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEROIALIGNLAYERKERNEL_H*/
diff --git a/arm_compute/core/NEON/kernels/NEROIPoolingLayerKernel.h b/arm_compute/core/NEON/kernels/NEROIPoolingLayerKernel.h
deleted file mode 100644
index 59a5017711..0000000000
--- a/arm_compute/core/NEON/kernels/NEROIPoolingLayerKernel.h
+++ /dev/null
@@ -1,81 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEROIPOOLINGLAYERKERNEL_H
-#define ARM_COMPUTE_NEROIPOOLINGLAYERKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-#include "arm_compute/core/IArray.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for the ROI pooling layer kernel */
-class NEROIPoolingLayerKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEROIPoolingLayerKernel";
-    }
-    /** Default constructor */
-    NEROIPoolingLayerKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEROIPoolingLayerKernel(const NEROIPoolingLayerKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEROIPoolingLayerKernel &operator=(const NEROIPoolingLayerKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NEROIPoolingLayerKernel(NEROIPoolingLayerKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NEROIPoolingLayerKernel &operator=(NEROIPoolingLayerKernel &&) = default;
-    /** Default destructor */
-    ~NEROIPoolingLayerKernel() = default;
-
-    /** Set the input and output tensors.
-     *
-     * @param[in]  input     Source tensor. Data types supported: F32.
-     * @param[in]  rois      ROIs tensor, it is a 2D tensor of size [5, N] (where N is the number of ROIs) containing top left and bottom right corner
-     *                       as coordinate of an image and batch_id of ROI [ batch_id, x1, y1, x2, y2 ]. Data types supported: U16
-     * @param[out] output    Destination tensor. Data types supported: Same as @p input.
-     * @param[in]  pool_info Contains pooling operation information described in @ref ROIPoolingLayerInfo.
-     *
-     * @note The x and y dimensions of @p output tensor must be the same as that specified by @p pool_info 's pooled
-     * width and pooled height.
-     * @note The z dimensions of @p output tensor and @p input tensor must be the same.
-     * @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois tensor.
-     */
-    void configure(const ITensor *input, const ITensor *rois, ITensor *output, const ROIPoolingLayerInfo &pool_info);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-
-private:
-    const ITensor      *_input;
-    const ITensor      *_rois;
-    ITensor            *_output;
-    ROIPoolingLayerInfo _pool_info;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEROIPOOLINGLAYERKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NERangeKernel.h b/arm_compute/core/NEON/kernels/NERangeKernel.h
deleted file mode 100644
index e67a5dc945..0000000000
--- a/arm_compute/core/NEON/kernels/NERangeKernel.h
+++ /dev/null
@@ -1,90 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NERANGEKERNEL_H
-#define ARM_COMPUTE_NERANGEKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Kernel class for Range
- *
- * range generates a 1-D tensor containing a sequence of numbers that begins at 'start' and extends by increments
- * of 'step' up to but not including 'end'.
- */
-class NERangeKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NERangeKernel";
-    }
-    /** Default constructor */
-    NERangeKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NERangeKernel(const NERangeKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NERangeKernel &operator=(const NERangeKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NERangeKernel(NERangeKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NERangeKernel &operator=(NERangeKernel &&) = default;
-    /** Default destructor */
-    ~NERangeKernel() = default;
-    /** Initialize the kernel's output tensor, start, end and step of the sequence.
-     *
-     * @param[out] output Output tensor. Data types supported: U8/S8/U16/S16/U32/S32/F16/F32.
-     * @param[in]  start  The starting value of the sequence.
-     * @param[in]  end    The ending (not including) value of the sequence.
-     * @param[in]  step   The gap between each pair of values in the sequence.
-     */
-    void configure(ITensor *output, float start, float end, float step);
-    /** Static function to check if given info will lead to a valid configuration of @ref NERangeKernel
-     *
-     * @param[in] output Output tensor info. Data types supported: U8/S8/U16/S16/U32/S32/F16/F32.
-     * @param[in] start  The starting value of the sequence.
-     * @param[in] end    The ending (not including) value of the sequence.
-     * @param[in] step   The gap between each pair of values in the sequence.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *output, float start, float end, float step);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-
-private:
-    using RangeFunction = void(ITensor *output, float start, float step, const Window &window);
-
-    RangeFunction *_func;   /**< Range function to be called */
-    float          _start;  /**< Start of sequence */
-    float          _end;    /**< End of sequence */
-    float          _step;   /**< Increment/step value */
-    ITensor       *_output; /**< Destination tensor */
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NERANGEKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NEReductionOperationKernel.h b/arm_compute/core/NEON/kernels/NEReductionOperationKernel.h
deleted file mode 100644
index 28cca4987b..0000000000
--- a/arm_compute/core/NEON/kernels/NEReductionOperationKernel.h
+++ /dev/null
@@ -1,94 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEREDUCTIONOPERATIONKERNEL_H
-#define ARM_COMPUTE_NEREDUCTIONOPERATIONKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** NEON kernel to perform a reduction operation
- *
- * @note For ARG_MIN/ARG_MAX reduction, the default data type for an uninitialized
- *       output tensor is signed 32-bit integer (S32). It is the user's responsibility
- *       to check that the results do not overflow because the indices are computed
- *       in unsigned 32-bit (U32).
- */
-class NEReductionOperationKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEReductionOperationKernel";
-    }
-    /** Default constructor */
-    NEReductionOperationKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEReductionOperationKernel(const NEReductionOperationKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEReductionOperationKernel &operator=(const NEReductionOperationKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NEReductionOperationKernel(NEReductionOperationKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NEReductionOperationKernel &operator=(NEReductionOperationKernel &&) = default;
-    /** Default destructor */
-    ~NEReductionOperationKernel() = default;
-
-    /** Set the source, destination of the kernel
-     *
-     * @param[in]  input  Source tensor. Data type supported: QASYMM8_SIGNED/QASYMM8/F16/F32. Data layouts supported: NCHW.
-     * @param[out] output Destination tensor.Data types and data layouts supported: same as @p input, S32 for ARG_MIX/ARG_MAX.
-     *                    Output will have the same number of dimensions as input.
-     * @param[in]  axis   Axis along which to reduce. Supported reduction axis : 0
-     * @param[in]  op     Reduction operation to perform.
-     */
-    void configure(const ITensor *input, ITensor *output, unsigned int axis, ReductionOperation op);
-
-    /** Static function to check if given info will lead to a valid configuration of @ref NEReductionOperationKernel.
-     *
-     * @param[in] input  Source tensor info. Data type supported: QASYMM8_SIGNED/QASYMM8/F16/F32. Data layouts supported: NCHW.
-     * @param[in] output Destination tensor info.Data types and data layouts supported: same as @p input, S32 for ARG_MIX/ARG_MAX.
-     *                   Output will have the same number of dimensions as input.
-     * @param[in] axis   Axis along which to reduce. Supported reduction axis : 0
-     * @param[in] op     Reduction operation to perform.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-    BorderSize border_size() const override;
-
-private:
-    const ITensor     *_input;
-    ITensor           *_output;
-    unsigned int       _reduction_axis;
-    ReductionOperation _op;
-    BorderSize         _border_size;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEREDUCTIONOPERATIONKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NERemapKernel.h b/arm_compute/core/NEON/kernels/NERemapKernel.h
deleted file mode 100644
index e929b1c5d4..0000000000
--- a/arm_compute/core/NEON/kernels/NERemapKernel.h
+++ /dev/null
@@ -1,83 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEREMAPKERNEL_H
-#define ARM_COMPUTE_NEREMAPKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** NEON kernel to perform a remap on a tensor */
-class NERemapKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NERemapKernel";
-    }
-    /** Default constructor */
-    NERemapKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NERemapKernel(const NERemapKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NERemapKernel &operator=(const NERemapKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NERemapKernel(NERemapKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NERemapKernel &operator=(NERemapKernel &&) = default;
-    /** Default destructor */
-    ~NERemapKernel() = default;
-
-    /** Initialize the kernel's input, output and border mode.
-     *
-     * @param[in]  input  Source tensor. Data type supported: U8.
-     * @param[in]  map_x  Map for X coordinates. Data type supported: F32.
-     * @param[in]  map_y  Map for Y coordinates. Data type supported: F32.
-     * @param[out] output Destination tensor. Data types supported: U8. All but the lowest two dimensions must be the same size as in the input tensor, i.e. remapping is only performed within the XY-plane.
-     * @param[in]  policy The interpolation type.
-     */
-    void configure(const ITensor *input, const ITensor *map_x, const ITensor *map_y, ITensor *output, InterpolationPolicy policy);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-    BorderSize border_size() const override;
-
-private:
-    /** function to perform nearest interpolation on the given window */
-    void remap_nearest(const Window &window);
-    /** function to perform bilinear interpolation on the given window */
-    void remap_bilinear(const Window &window);
-    /** Remap function to use for the particular interpolation type passed to configure() */
-    void (NERemapKernel::*_func)(const Window &window);
-
-    const ITensor *_input;  /**< Input image */
-    ITensor       *_output; /**< Output image */
-    const ITensor *_map_x;  /**< Input remap x coordinates */
-    const ITensor *_map_y;  /**< Input remap y coordinates */
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEREMAPKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NEReorgLayerKernel.h b/arm_compute/core/NEON/kernels/NEReorgLayerKernel.h
deleted file mode 100644
index 9277ddbe47..0000000000
--- a/arm_compute/core/NEON/kernels/NEReorgLayerKernel.h
+++ /dev/null
@@ -1,83 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEREORGLAYERKERNEL_H
-#define ARM_COMPUTE_NEREORGLAYERKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-// Forward declarations
-class ITensor;
-
-/** Interface for the kernel to perform tensor re-organization */
-class NEReorgLayerKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEReorgLayerKernel";
-    }
-    /** Default constructor */
-    NEReorgLayerKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEReorgLayerKernel(const NEReorgLayerKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEReorgLayerKernel &operator=(const NEReorgLayerKernel &) = delete;
-    /** Default Move Constructor. */
-    NEReorgLayerKernel(NEReorgLayerKernel &&) = default;
-    /** Default move assignment operator */
-    NEReorgLayerKernel &operator=(NEReorgLayerKernel &&) = default;
-    /** Default destructor */
-    ~NEReorgLayerKernel() = default;
-    /** Set the input and output of the kernel
-     *
-     * @param[in]  input  Source tensor. Data type supported: All
-     * @param[out] output Destination tensor. Data type supported: Same as @p input
-     * @param[in]  stride Stride to be used during data re-organization.
-     *                    It defines the spatial distance between 2 consecutive pixels in the x and y direction
-     */
-    void configure(const ITensor *input, ITensor *output, int32_t stride);
-
-    /** Static function to check if given info will lead to a valid configuration of @ref NEReshapeLayerKernel
-     *
-     * @param[in] input  Source tensor info. Data type supported: All
-     * @param[in] output Destination tensor info. Data type supported: Same as @p input
-     * @param[in] stride Stride to be used during data re-organization
-     *                   It defines the spatial distance between 2 consecutive pixels in the x and y direction
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, int32_t stride);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-
-private:
-    const ITensor *_input;
-    ITensor       *_output;
-    int32_t        _stride;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEREORGLAYERKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NEReshapeLayerKernel.h b/arm_compute/core/NEON/kernels/NEReshapeLayerKernel.h
deleted file mode 100644
index fccf2685a8..0000000000
--- a/arm_compute/core/NEON/kernels/NEReshapeLayerKernel.h
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NERESHAPELAYERKERNEL_H
-#define ARM_COMPUTE_NERESHAPELAYERKERNEL_H
-
-#include "arm_compute/core/NEON/INESimpleKernel.h"
-
-namespace arm_compute
-{
-// Forward declarations
-class ITensor;
-
-/** Interface for the kernel to perform tensor reshaping */
-class NEReshapeLayerKernel : public INESimpleKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEReshapeLayerKernel";
-    }
-    /** Set the input and output of the kernel
-     *
-     * @param[in]  input  Source tensor. Data type supported: All
-     * @param[out] output Destination tensor. Data type supported: Same as @p input
-     */
-    void configure(const ITensor *input, ITensor *output);
-
-    /** Static function to check if given info will lead to a valid configuration of @ref NEReshapeLayerKernel
-     *
-     * @param[in] input  Source tensor info. Data type supported: All
-     * @param[in] output Destination tensor info. Data type supported: Same as @p input
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NERESHAPELAYERKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NEReverseKernel.h b/arm_compute/core/NEON/kernels/NEReverseKernel.h
deleted file mode 100644
index 516653b70d..0000000000
--- a/arm_compute/core/NEON/kernels/NEReverseKernel.h
+++ /dev/null
@@ -1,80 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEREVERSEKERNEL_H
-#define ARM_COMPUTE_NEREVERSEKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for the reverse layer kernel. */
-class NEReverseKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEReverseKernel";
-    }
-    /** Default constructor */
-    NEReverseKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEReverseKernel(const NEReverseKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEReverseKernel &operator=(const NEReverseKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NEReverseKernel(NEReverseKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NEReverseKernel &operator=(NEReverseKernel &&) = default;
-    /** Default destructor */
-    ~NEReverseKernel() = default;
-    /** Initialise the kernel's inputs and output
-     *
-     * @param[in]  input  Input tensor. Data types supported: All
-     * @param[out] output Output tensor. Data type supported: Same as @p input
-     * @param[in]  axis   Axis tensor. Contains the indices of the dimensions to reverse. Data type supported: U32
-     */
-    void configure(const ITensor *input, ITensor *output, const ITensor *axis);
-
-    /** Static function to check if given info will lead to a valid configuration of @ref NEReverseKernel
-     *
-     * @param[in] input  Input tensor info. Data types supported: All
-     * @param[in] output Output tensor info. Data type supported: Same as @p input
-     * @param[in] axis   Axis tensor info. Contains the indices of the dimensions to reverse. Data type supported: U32
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-
-private:
-    const ITensor *_input;
-    ITensor       *_output;
-    const ITensor *_axis;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEREVERSEKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NEScaleKernel.h b/arm_compute/core/NEON/kernels/NEScaleKernel.h
deleted file mode 100644
index 9bc04129e0..0000000000
--- a/arm_compute/core/NEON/kernels/NEScaleKernel.h
+++ /dev/null
@@ -1,117 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NESCALEKERNEL_H
-#define ARM_COMPUTE_NESCALEKERNEL_H
-
-#include "arm_compute/core/KernelDescriptors.h"
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** NEON kernel to perform scaling on a tensor */
-class NEScaleKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEScaleKernel";
-    }
-    /** Default constructor */
-    NEScaleKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEScaleKernel(const NEScaleKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEScaleKernel &operator=(const NEScaleKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NEScaleKernel(NEScaleKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NEScaleKernel &operator=(NEScaleKernel &&) = default;
-    /** Default destructor */
-    ~NEScaleKernel() = default;
-
-    /** Initialise the kernel's inputs, output and interpolation policy
-     *
-     * @note dx, dy and offsets have the same dimensions (width and height) of the output tensor
-     * @note Using @p policy Area only supports data layout NCHW and input data type U8.
-     *
-     * @param[in]  input   Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/U8/S16/F16/F32.
-     * @param[in]  dx      Pixel's distance between the X real coordinate and the smallest X following integer. Data type supported: F32
-     * @param[in]  dy      Pixel's distance between the Y real coordinate and the smallest Y following integer. Data type supported: F32
-     * @param[in]  offsets Offset to access the pixel with NEAREST interpolation or the top-left pixel with BILINEAR interpolation in the input tensor. Data type supported: S32.
-     * @param[out] output  Destination tensor. Data types supported: Same as @p input. All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
-     * @param[in]  info    @ref ScaleKernelInfo to use for configuration
-     */
-    void configure(const ITensor *input, const ITensor *dx, const ITensor *dy, const ITensor *offsets, ITensor *output,
-                   const ScaleKernelInfo &info);
-    /** Static function to check if given info will lead to a valid configuration of @ref NEScaleKernel
-     *
-     * @note dx, dy and offsets have the same dimensions (width and height) of the output tensor
-     * @note Using @p policy Area only supports data layout NCHW and input data type U8.
-     *
-     * @param[in] input   Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/U8/S16/F16/F32.
-     * @param[in] dx      Pixel's distance between the X real coordinate and the smallest X following integer. Data type supported: F32
-     * @param[in] dy      Pixel's distance between the Y real coordinate and the smallest Y following integer. Data type supported: F32
-     * @param[in] offsets Offset to access the pixel with NEAREST interpolation or the top-left pixel with BILINEAR interpolation in the input tensor. Data type supported: S32.
-     * @param[in] output  Destination tensor. Data types supported: Same as @p input. All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
-     * @param[in] info    @ref ScaleKernelInfo to use for validation
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *dx, const ITensorInfo *dy, const ITensorInfo *offsets, ITensorInfo *output,
-                           const ScaleKernelInfo &info);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-    BorderSize border_size() const override;
-
-private:
-    /** function to perform scale using nearest interpolation on the given window */
-    void scale_nearest_nchw(const Window &window);
-    /** function to perform scale using bilinear interpolation on the given window */
-    void scale_bilinear_nchw(const Window &window);
-    /** function to perform scale using area interpolation on the given window
-     *
-     *  @note Used only in case down-sampling.
-     */
-    void scale_area_nchw(const Window &window);
-    /** function to perform scale on the given window */
-    void scale_nhwc(const Window &window);
-    /** Scale function to use for the particular interpolation type passed to configure() */
-    void (NEScaleKernel::*_func)(const Window &window);
-
-    const ITensor      *_offsets;
-    const ITensor      *_dx;
-    const ITensor      *_dy;
-    const ITensor      *_input;
-    ITensor            *_output;
-    InterpolationPolicy _policy;
-    BorderSize          _border_size;
-    BorderMode          _border_mode;
-    PixelValue          _constant_border_value;
-    float               _sampling_offset;
-    bool                _use_padding;
-    bool                _align_corners;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NESCALEKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NEScharr3x3Kernel.h b/arm_compute/core/NEON/kernels/NEScharr3x3Kernel.h
deleted file mode 100644
index 320b44d307..0000000000
--- a/arm_compute/core/NEON/kernels/NEScharr3x3Kernel.h
+++ /dev/null
@@ -1,86 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NESCHARR3x3KERNEL_H
-#define ARM_COMPUTE_NESCHARR3x3KERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for the kernel to run a 3x3 Scharr filter on a tensor.
- *
-* @f[
-*      \mathbf{G}_x=\begin{vmatrix}
-*      -3 & 0 & +3\\
-*      -10& 0 & +10\\
-*      -3 & 0 & +3
-*      \end{vmatrix}
-* @f]
-*/
-class NEScharr3x3Kernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEScharr3x3Kernel";
-    }
-    /** Default constructor */
-    NEScharr3x3Kernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEScharr3x3Kernel(const NEScharr3x3Kernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEScharr3x3Kernel &operator=(const NEScharr3x3Kernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NEScharr3x3Kernel(NEScharr3x3Kernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NEScharr3x3Kernel &operator=(NEScharr3x3Kernel &&) = default;
-    /** Default destructor */
-    ~NEScharr3x3Kernel() = default;
-
-    /** Initialise the kernel's source, destination and border.
-     *
-     * @note At least one of output_x or output_y must be set.
-     *
-     * @param[in]  input            Source tensor. Data type supported: U8.
-     * @param[out] output_x         (Optional) Destination tensor for the X gradient. Data type supported: S16.
-     * @param[out] output_y         (Optional) Destination tensor for the Y gradient. Data type supported: S16.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const ITensor *input, ITensor *output_x, ITensor *output_y, bool border_undefined);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-    BorderSize border_size() const override;
-
-private:
-    bool           _run_scharr_x; /**< Do we need to run Scharr X ? */
-    bool           _run_scharr_y; /**< Do we need to run Scharr Y ? */
-    const ITensor *_input;        /**< Input tensor */
-    ITensor       *_output_x;     /**< Output tensor for scharr X */
-    ITensor       *_output_y;     /**< Output tensor for scharr Y */
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NESCHARR3x3KERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NESelectKernel.h b/arm_compute/core/NEON/kernels/NESelectKernel.h
deleted file mode 100644
index 51c8543ddc..0000000000
--- a/arm_compute/core/NEON/kernels/NESelectKernel.h
+++ /dev/null
@@ -1,103 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NESELECTKERNEL_H
-#define ARM_COMPUTE_NESELECTKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-// Forward declarations
-class ITensor;
-
-/** Interface for the select kernel
- *
- * Select is computed by:
- * @f[ output(i) = condition(i) ? x(i) : y(i) @f]
- *
- */
-class NESelectKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NESelectKernel";
-    }
-    /** Default constructor */
-    NESelectKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NESelectKernel(const NESelectKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NESelectKernel &operator=(const NESelectKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NESelectKernel(NESelectKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NESelectKernel &operator=(NESelectKernel &&) = default;
-    /** Default destructor */
-    ~NESelectKernel() = default;
-
-    /** Common signature for all the specialised elementwise functions
-     *
-     * @param[in]  c      Condition input tensor. Data types supported: U8.
-     * @param[in]  x      First input tensor. Data types supported: All.
-     * @param[out] y      Second input tensor. Data types supported: Same as @p x
-     * @param[in]  output Output tensor. Data types supported: Same as @p x
-     */
-    void configure(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output);
-
-    /** Validate the argument passed to the kernel
-     *
-     * @param[in] c      Condition input tensor. Data types supported: U8.
-     * @param[in] x      First input tensor. Data types supported: All.
-     * @param[in] y      Second input tensor. Data types supported: Same as @p x
-     * @param[in] output Output tensor. Data types supported: Same as @p x.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *c, const ITensorInfo *x, const ITensorInfo *y, const ITensorInfo *output);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-
-private:
-    /** Common signature for all the specialised select functions
-     *
-     * @param[in] c      Condition input tensor. Data types supported: U8.
-     * @param[in] x      First input tensor. Data types supported: All.
-     * @param[in] y      Second input tensor. Data types supported: Same as @p x
-     * @param[in] output Output tensor. Data types supported: Same as @p x.
-     */
-    using SelectFunction = void(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window);
-
-    /** Select function to use for the particular tensor types passed to configure() */
-    SelectFunction *_function;
-    const ITensor *_c;              /**< Condition tensor */
-    const ITensor *_x;              /**< Source tensor 1 */
-    const ITensor *_y;              /**< Source tensor 2 */
-    ITensor        *_output;        /**< Destination tensor */
-    bool            _has_same_rank; /**< Flag that indicates if condition tensor and other inputs have the same rank */
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NESELECTKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NESobel3x3Kernel.h b/arm_compute/core/NEON/kernels/NESobel3x3Kernel.h
deleted file mode 100644
index ef0db2a428..0000000000
--- a/arm_compute/core/NEON/kernels/NESobel3x3Kernel.h
+++ /dev/null
@@ -1,86 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NESOBEL3x3KERNEL_H
-#define ARM_COMPUTE_NESOBEL3x3KERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for the kernel to run a 3x3 Sobel X filter on a tensor.
- *
- * @f[
- *      \mathbf{G}_x=\begin{vmatrix}
- *      -1 & 0 & +1\\
- *      -2 & 0 & +2\\
- *      -1 & 0 & +1
- *      \end{vmatrix}
- * @f]
-*/
-class NESobel3x3Kernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NESobel3x3Kernel";
-    }
-    /** Default constructor */
-    NESobel3x3Kernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NESobel3x3Kernel(const NESobel3x3Kernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NESobel3x3Kernel &operator=(const NESobel3x3Kernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NESobel3x3Kernel(NESobel3x3Kernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NESobel3x3Kernel &operator=(NESobel3x3Kernel &&) = default;
-    /** Default destructor */
-    ~NESobel3x3Kernel() = default;
-
-    /** Initialise the kernel's source, destination and border mode.
-     *
-     * @note At least one of output_x or output_y must be set.
-     *
-     * @param[in]  input            Source tensor. Data type supported: U8.
-     * @param[out] output_x         (Optional) Destination tensor for the X gradient. Data type supported: S16.
-     * @param[out] output_y         (Optional) Destination tensor for the Y gradient. Data type supported: S16.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const ITensor *input, ITensor *output_x, ITensor *output_y, bool border_undefined);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-    BorderSize border_size() const override;
-
-private:
-    bool           _run_sobel_x; /**< Do we need to run Sobel X ? */
-    bool           _run_sobel_y; /**< Do we need to run Sobel Y ? */
-    const ITensor *_input;       /**< Input tensor */
-    ITensor       *_output_x;    /**< Output tensor for sobel X */
-    ITensor       *_output_y;    /**< Output tensor for sobel Y */
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NESOBEL3x3KERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NESobel5x5Kernel.h b/arm_compute/core/NEON/kernels/NESobel5x5Kernel.h
deleted file mode 100644
index bc0cfb016e..0000000000
--- a/arm_compute/core/NEON/kernels/NESobel5x5Kernel.h
+++ /dev/null
@@ -1,126 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NESOBEL5x5KERNEL_H
-#define ARM_COMPUTE_NESOBEL5x5KERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for the kernel to run the horizontal pass of 5x5 Sobel filter on a tensor.
- *
- */
-class NESobel5x5HorKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NESobel5x5HorKernel";
-    }
-    /** Default constructor */
-    NESobel5x5HorKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NESobel5x5HorKernel(const NESobel5x5HorKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NESobel5x5HorKernel &operator=(const NESobel5x5HorKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NESobel5x5HorKernel(NESobel5x5HorKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NESobel5x5HorKernel &operator=(NESobel5x5HorKernel &&) = default;
-    /** Default destructor */
-    ~NESobel5x5HorKernel() = default;
-
-    /** Initialise the kernel's source, destination and border mode.
-     *
-     * @note At least one of output_x or output_y must be set
-     *
-     * @param[in]  input            Source tensor. Data type supported: U8.
-     * @param[out] output_x         (Optional) Destination tensor for the X gradient. Data type supported: S16.
-     * @param[out] output_y         (Optional) Destination tensor for the Y gradient. Data type supported: S16.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const ITensor *input, ITensor *output_x, ITensor *output_y, bool border_undefined);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-    BorderSize border_size() const override;
-
-private:
-    const ITensor *_input;       /**< Input tensor */
-    ITensor       *_output_x;    /**< X output of horizontal pass */
-    ITensor       *_output_y;    /**< Y output of horizontal pass */
-    bool           _run_sobel_x; /**< Do we need to run Sobel X? */
-    bool           _run_sobel_y; /**< Do we need to run Sobel Y? */
-    BorderSize     _border_size; /**< Border size */
-};
-
-/** Interface for the kernel to run the vertical pass of 5x5 Sobel Y filter on a tensor.
- *
-*/
-class NESobel5x5VertKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NESobel5x5VertKernel";
-    }
-    /** Default constructor */
-    NESobel5x5VertKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NESobel5x5VertKernel(const NESobel5x5VertKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NESobel5x5VertKernel &operator=(const NESobel5x5VertKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NESobel5x5VertKernel(NESobel5x5VertKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NESobel5x5VertKernel &operator=(NESobel5x5VertKernel &&) = default;
-    /** Default destructor */
-    ~NESobel5x5VertKernel() = default;
-
-    /** Initialise the kernel's source, destination and border mode.
-     *
-     * @param[in]  input_x          Input for X (X output of hor pass). Data type supported: S16.
-     * @param[in]  input_y          Input for Y (Y output of hor pass). Data type supported: S16.
-     * @param[out] output_x         Destination tensor for the X gradient. Data type supported: S16.
-     * @param[out] output_y         Destination tensor for the Y gradient. Data type supported: S16.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(ITensor *input_x, ITensor *input_y, ITensor *output_x, ITensor *output_y, bool border_undefined);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-    BorderSize border_size() const override;
-
-private:
-    ITensor *_input_x;     /**< X input (X output of the hor pass) */
-    ITensor *_input_y;     /**< Y input (Y output of the hor pass) */
-    ITensor *_output_x;    /**< X output of sobel */
-    ITensor *_output_y;    /**< Y output of sobel */
-    bool     _run_sobel_x; /**< Do we need to run sobel X? */
-    bool     _run_sobel_y; /**< Do we need to run sobel Y? */
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NESOBEL5x5KERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NESobel7x7Kernel.h b/arm_compute/core/NEON/kernels/NESobel7x7Kernel.h
deleted file mode 100644
index 468a94d0d1..0000000000
--- a/arm_compute/core/NEON/kernels/NESobel7x7Kernel.h
+++ /dev/null
@@ -1,130 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NESOBEL7x7KERNEL_H
-#define ARM_COMPUTE_NESOBEL7x7KERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for the kernel to run the horizontal pass of 7x7 Sobel filter on a tensor.
- *
- */
-class NESobel7x7HorKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NESobel7x7HorKernel";
-    }
-    /** Default constructor */
-    NESobel7x7HorKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NESobel7x7HorKernel(const NESobel7x7HorKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NESobel7x7HorKernel &operator=(const NESobel7x7HorKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NESobel7x7HorKernel(NESobel7x7HorKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NESobel7x7HorKernel &operator=(NESobel7x7HorKernel &&) = default;
-    /** Default destructor */
-    ~NESobel7x7HorKernel() = default;
-
-    /** Initialise the kernel's source, destination and border mode.
-     *
-     * @note At least one of output_x or output_y must be set.
-     *
-     * @param[in]  input            Source tensor. Data type supported: U8.
-     * @param[out] output_x         (Optional) Destination tensor for the X gradient. Data type supported: S32.
-     * @param[out] output_y         (Optional) Destination tensor for the Y gradient. Data type supported: S32.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const ITensor *input, ITensor *output_x, ITensor *output_y, bool border_undefined);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-    BorderSize border_size() const override;
-
-private:
-    const ITensor *_input;       /**< Input tensor */
-    ITensor       *_output_x;    /**< X output of horizontal pass */
-    ITensor       *_output_y;    /**< Y output of horizontal pass */
-    bool           _run_sobel_x; /**< Do we need to run Sobel X? */
-    bool           _run_sobel_y; /**< Do we need to run Sobel Y? */
-    BorderSize     _border_size; /**< Border size */
-};
-
-/** Interface for the kernel to run the vertical pass of 7x7 Sobel Y filter on a tensor.
- *
-*/
-class NESobel7x7VertKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NESobel7x7VertKernel";
-    }
-    /** Default constructor */
-    NESobel7x7VertKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NESobel7x7VertKernel(const NESobel7x7VertKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NESobel7x7VertKernel &operator=(const NESobel7x7VertKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NESobel7x7VertKernel(NESobel7x7VertKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NESobel7x7VertKernel &operator=(NESobel7x7VertKernel &&) = default;
-    /** Default destructor */
-    ~NESobel7x7VertKernel() = default;
-
-    /** Initialise the kernel's source, destination and border mode.
-     *
-     * @note At least one of output_x or output_y must be set
-     * @note If output_x is set then input_x must be set too
-     * @note If output_y is set then input_y must be set too
-     *
-     * @param[in]  input_x          (Optional) Input for X (X output of hor pass). Data type supported: S32.
-     * @param[in]  input_y          (Optional) Input for Y (Y output of hor pass). Data type supported: S32.
-     * @param[out] output_x         (Optional) Destination tensor for the X gradient. Data type supported: S32.
-     * @param[out] output_y         (Optional) Destination tensor for the Y gradient. Data type supported: S32.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const ITensor *input_x, const ITensor *input_y, ITensor *output_x, ITensor *output_y, bool border_undefined);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-    BorderSize border_size() const override;
-
-private:
-    const ITensor *_input_x;     /**< X input (X output of the hor pass) */
-    const ITensor *_input_y;     /**< Y input (Y output of the hor pass) */
-    ITensor       *_output_x;    /**< X output of sobel */
-    ITensor       *_output_y;    /**< Y output of sobel */
-    bool           _run_sobel_x; /**< Do we need to run sobel X? */
-    bool           _run_sobel_y; /**< Do we need to run sobel Y? */
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NESOBEL7x7KERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h b/arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h
deleted file mode 100644
index 0e0be7936b..0000000000
--- a/arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h
+++ /dev/null
@@ -1,139 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NESOFTMAXLAYERKERNEL_H
-#define ARM_COMPUTE_NESOFTMAXLAYERKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-#include "arm_compute/core/NEON/INESimpleKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for the identifying the max value of 1D Logits */
-class NELogits1DMaxKernel : public INESimpleKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NELogits1DMaxKernel";
-    }
-    /** Default constructor */
-    NELogits1DMaxKernel();
-    /** Set the input and output tensors.
-     *
-     * @param[in]  input  Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
-     * @param[out] output Destination tensor. Data types supported: same as @p input
-     */
-    void configure(const ITensor *input, ITensor *output);
-    /** Static function to check if given info will lead to a valid configuration of @ref NELogits1DMaxKernel
-     *
-     * @param[in] input  Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
-     * @param[in] output Destination tensor. Data types supported: same as @p input
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-    BorderSize border_size() const override;
-
-private:
-    using Logits1DMaxFunction = void(const ITensor &in, ITensor &out, const Window &window);
-
-private:
-    Logits1DMaxFunction *_func;
-    BorderSize           _border_size;
-};
-
-/** Interface for softmax computation for QASYMM8 with pre-computed max. */
-template <bool IS_LOG = false>
-class NELogits1DSoftmaxKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        if(IS_LOG)
-        {
-            return "NELogits1DSoftmaxKernel";
-        }
-        else
-        {
-            return "NELogits1DLogSoftmaxKernel";
-        }
-    }
-    /** Default constructor */
-    NELogits1DSoftmaxKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NELogits1DSoftmaxKernel(const NELogits1DSoftmaxKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NELogits1DSoftmaxKernel &operator=(const NELogits1DSoftmaxKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NELogits1DSoftmaxKernel(NELogits1DSoftmaxKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NELogits1DSoftmaxKernel &operator=(NELogits1DSoftmaxKernel &&) = default;
-    /** Default destructor */
-    ~NELogits1DSoftmaxKernel() = default;
-    /** Set the input and output tensors.
-     *
-     * @param[in]  input  Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
-     * @param[in]  max    Max values tensor. Same shape as input with dimension 0 set to 1.
-     *                    Data types supported: same as @p input.
-     * @param[out] output Destination tensor. Data types supported: same as @p input.
-     * @param[in]  beta   A scaling factor for the exponent.
-     *
-     * @param      tmp    Auxiliary tensor. Must be type F32 and same shape as the input.
-     */
-    void configure(const ITensor *input, const ITensor *max, ITensor *output, const float beta, ITensor *tmp);
-    /** Static function to check if given info will lead to a valid configuration of @ref NELogits1DSoftmaxKernel
-     *
-     * @param[in] input  Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
-     * @param[in] max    Max values tensor info. Same shape as input with dimension 0 set to 1.
-     *                   Data types supported: same as @p input.
-     * @param[in] output Destination tensor info. Data types supported: same as @p input.
-     * @param[in] beta   A scaling factor for the exponent.
-     * @param[in] tmp    Tensor info of auxiliary. Must be type F32 and same shape as the input.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *max,
-                           const ITensorInfo *output, const float beta, const ITensorInfo *tmp);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-
-private:
-    using LogitsSoftmaxFunction = void(const ITensor &in, const ITensor &max, void *const tmp, ITensor &out, const float beta,
-                                       const Window &window);
-
-    LogitsSoftmaxFunction *_func;
-    const ITensor         *_input;
-    const ITensor         *_max;
-    ITensor               *_output;
-    float                  _beta;
-    ITensor               *_tmp; //Temporary. Used internally
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NESOFTMAXLAYERKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NESpaceToBatchLayerKernel.h b/arm_compute/core/NEON/kernels/NESpaceToBatchLayerKernel.h
deleted file mode 100644
index 532fbb2852..0000000000
--- a/arm_compute/core/NEON/kernels/NESpaceToBatchLayerKernel.h
+++ /dev/null
@@ -1,111 +0,0 @@
-/*
- * Copyright (c) 2019-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NESPACETOBATCHLAYERKERNEL_H
-#define ARM_COMPUTE_NESPACETOBATCHLAYERKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-// Forward declaration
-class ITensor;
-
-/** Interface for the space to batch kernel */
-class NESpaceToBatchLayerKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NESpaceToBatchLayerKernel";
-    }
-    /** Default constructor */
-    NESpaceToBatchLayerKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NESpaceToBatchLayerKernel(const NESpaceToBatchLayerKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NESpaceToBatchLayerKernel &operator=(const NESpaceToBatchLayerKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NESpaceToBatchLayerKernel(NESpaceToBatchLayerKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NESpaceToBatchLayerKernel &operator=(NESpaceToBatchLayerKernel &&) = default;
-    /** Default destructor */
-    ~NESpaceToBatchLayerKernel() = default;
-    /** Initialise the kernel's inputs and output.
-     *
-     * @param[in]  input       Tensor input. Supported tensor rank: 4. Data types supported: All.
-     * @param[in]  block_shape 1-D tensor with shape [M]. Data types supported: S32
-     * @param[in]  paddings    2-D tensor with shape [2, M]. Data types supported: S32
-     * @param[out] output      Tensor output. Data types supported: same as @p input
-     */
-    void configure(const ITensor *input, const ITensor *block_shape, const ITensor *paddings, ITensor *output);
-    /** Initialise the kernel's input and output. (Static block shape and paddings)
-     *
-     * @param[in]  input         Tensor input. Supported tensor rank: 4. Data types supported: All.
-     * @param[in]  block_shape_x Block shape x value.
-     * @param[in]  block_shape_y Block shape y value.
-     * @param[in]  padding_left  The left padding of the output tensor.
-     * @param[in]  padding_right The right padding of the output tensor.
-     * @param[out] output        Tensor output. Data types supported: same as @p input
-     */
-    void configure(const ITensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, ITensor *output);
-    /** Static function to check if given info will lead to a valid configuration of @ref NESpaceToBatchLayerKernel
-     *
-     * @param[in] input       Tensor input. Supported tensor rank: 4. Data types supported: All.
-     * @param[in] block_shape 1-D tensor with shape [M]. Data types supported: S32
-     * @param[in] paddings    2-D tensor with shape [2, M]. Data types supported: S32
-     * @param[in] output      Tensor output. Data types supported: same as @p input
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *paddings, const ITensorInfo *output);
-    /** Static function to check if given info will lead to a valid configuration of @ref NESpaceToBatchLayerKernel (Static block shape and paddings)
-     *
-     * @param[in] input         Tensor input. Supported tensor rank: 4. Data types supported: All.
-     * @param[in] block_shape_x Block shape x value.
-     * @param[in] block_shape_y Block shape y value.
-     * @param[in] padding_left  The left padding of the output tensor.
-     * @param[in] padding_right The right padding of the output tensor.
-     * @param[in] output        Tensor output. Data types supported: same as @p input
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, const ITensorInfo *output);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-
-private:
-    const ITensor *_input;       /**< Source tensor */
-    const ITensor *_block_shape; /**< Block shape tensor */
-    const ITensor *_paddings;    /**< Paddings tensor */
-    ITensor       *_output;      /**< Destination tensor */
-    DataLayout     _data_layout; /**< Data layout to be used at run-time */
-
-    Size2D _padding_left;
-    int    _block_shape_x;
-    int    _block_shape_y;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NESPACETOBATCHLAYERKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NESpaceToDepthLayerKernel.h b/arm_compute/core/NEON/kernels/NESpaceToDepthLayerKernel.h
deleted file mode 100644
index e0c22e65fb..0000000000
--- a/arm_compute/core/NEON/kernels/NESpaceToDepthLayerKernel.h
+++ /dev/null
@@ -1,81 +0,0 @@
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NESPACETODEPTHLAYERKERNEL_H
-#define ARM_COMPUTE_NESPACETODEPTHLAYERKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for the space to depth kernel */
-class NESpaceToDepthLayerKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NESpaceToDepthLayerKernel";
-    }
-    /** Default constructor */
-    NESpaceToDepthLayerKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NESpaceToDepthLayerKernel(const NESpaceToDepthLayerKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NESpaceToDepthLayerKernel &operator=(const NESpaceToDepthLayerKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NESpaceToDepthLayerKernel(NESpaceToDepthLayerKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NESpaceToDepthLayerKernel &operator=(NESpaceToDepthLayerKernel &&) = default;
-    /** Default destructor */
-    ~NESpaceToDepthLayerKernel() = default;
-    /** Initialise the kernel's inputs and output.
-     *
-     * @param[in]  input       Tensor input. Supported tensor rank: 4. Data types supported: All.
-     * @param[out] output      Tensor output. Data types supported: same as @p input
-     * @param[in]  block_shape Block shape value
-     */
-    void configure(const ITensor *input, ITensor *output, int32_t block_shape);
-    /** Static function to check if given info will lead to a valid configuration of @ref NESpaceToDepthLayerKernel
-     *
-     * @param[in] input       Tensor input info. Supported tensor rank: 4. Data types supported: All.
-     * @param[in] output      Tensor output info. Data types supported: same as @p input
-     * @param[in] block_shape Block shape value
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-
-private:
-    const ITensor *_input;       /**< Source tensor */
-    ITensor       *_output;      /**< Destination tensor */
-    int32_t        _block_shape; /**< Block shape */
-    DataLayout     _data_layout; /**< Data layout  of the operation */
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NESPACETODEPTHLAYERKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NEStackLayerKernel.h b/arm_compute/core/NEON/kernels/NEStackLayerKernel.h
deleted file mode 100644
index c4dc53eac6..0000000000
--- a/arm_compute/core/NEON/kernels/NEStackLayerKernel.h
+++ /dev/null
@@ -1,93 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef ARM_COMPUTE_NESTACKLAYERKERNEL_H
-#define ARM_COMPUTE_NESTACKLAYERKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** NEON kernel to stacks a rank-R tensor into one with rank-(R+1) along the axis dimension.*/
-class NEStackLayerKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEStackLayerKernel";
-    }
-    /** Default constructor */
-    NEStackLayerKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEStackLayerKernel(const NEStackLayerKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEStackLayerKernel &operator=(const NEStackLayerKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NEStackLayerKernel(NEStackLayerKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NEStackLayerKernel &operator=(NEStackLayerKernel &&) = default;
-    /** Default destructor */
-    ~NEStackLayerKernel() = default;
-    /** Initialise the kernel's inputs and output
-     *
-     * @note Supported input tensor rank: up to 4
-     *
-     * @param[in]  input       Input tensor. Data types supported: All
-     * @param[in]  axis        The dimension to stack the tensors along. It must be smaller than the number of input dimensions.
-     * @param[in]  idx_input   Index of the input tensor in the list of tensors to stack.
-     *                         All tensors in the list must have the same shape
-     * @param[in]  num_tensors Number of tensors to stack
-     * @param[out] output      Output tensor. Data types supported: Same as @p input.
-     *
-     */
-    void configure(const ITensor *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, ITensor *output);
-    /**  Static function to check if given info will lead to a valid configuration of @ref NEStackLayerKernel
-     *
-     * @note Supported input tensor rank: up to 4
-     *
-     * @param[in] input       Input tensor info. Data types supported: All
-     * @param[in] axis        The dimension to stack the tensors along. It must be smaller than the number of input dimensions.
-     * @param[in] idx_input   Index of the input tensor in the list of tensors to stack
-     *                        All tensors in the list must have the same shape
-     * @param[in] num_tensors Number of tensors to stack
-     * @param[in] output      Output tensor info. Data types supported: Same as @p input.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, const ITensorInfo *output);
-
-    // Inherited methods overridden
-    void run(const Window &window, const ThreadInfo &info) override;
-
-private:
-    const ITensor *_input;
-    ITensor       *_output;
-    unsigned int   _axis;
-    unsigned int   _idx_input;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NESTACKLAYERKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NEStridedSliceKernel.h b/arm_compute/core/NEON/kernels/NEStridedSliceKernel.h
deleted file mode 100644
index 6709619a62..0000000000
--- a/arm_compute/core/NEON/kernels/NEStridedSliceKernel.h
+++ /dev/null
@@ -1,104 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NE_STRIDED_SLICE_KERNEL_H
-#define ARM_COMPUTE_NE_STRIDED_SLICE_KERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-#include "arm_compute/core/Types.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-// Forward declarations
-class ITensor;
-
-/** Interface for the kernel to perform tensor strided slicing */
-class NEStridedSliceKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEStridedSliceKernel";
-    }
-    /** Default constructor */
-    NEStridedSliceKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEStridedSliceKernel(const NEStridedSliceKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEStridedSliceKernel &operator=(const NEStridedSliceKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NEStridedSliceKernel(NEStridedSliceKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NEStridedSliceKernel &operator=(NEStridedSliceKernel &&) = default;
-    /** Default destructor */
-    ~NEStridedSliceKernel() = default;
-    /** Configure kernel
-     *
-     * @note Supported tensor rank: up to 4
-     *
-     * @param[in]  input            Source tensor. Data type supported: All
-     * @param[out] output           Destination tensor. Data type supported: Same as @p input
-     * @param[in]  starts           The starts of the dimensions of the input tensor to be sliced. The length must be of rank(input).
-     * @param[in]  ends             The ends of the dimensions of the input tensor to be sliced. The length must be of rank(input).
-     * @param[in]  strides          The strides of the dimensions of the input tensor to be sliced. The length must be of rank(input).
-     * @param[in]  begin_mask       If the ith bit of begin_mask is set, starts[i] is ignored and the fullest possible range in that dimension is used instead.
-     * @param[in]  end_mask         If the ith bit of end_mask is set, ends[i] is ignored and the fullest possible range in that dimension is used instead.
-     * @param[in]  shrink_axis_mask If the ith bit of shrink_axis_mask is set, it implies that the ith specification shrinks the dimensionality by 1.
-     *                              A slice of size 1 starting from starts[i] in the dimension must be preserved.
-     */
-    void configure(const ITensor *input, ITensor *output,
-                   const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
-                   int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask);
-
-    /** Static function to check if given info will lead to a valid configuration of @ref NEStridedSliceKernel
-     *
-     * @note Supported tensor rank: up to 4
-     *
-     * @param[in] input            Source tensor info. Data type supported: All
-     * @param[in] output           Destination tensor info. Data type supported: Same as @p input
-     * @param[in] starts           The starts of the dimensions of the input tensor to be sliced. The length must be of rank(input).
-     * @param[in] ends             The ends of the dimensions of the input tensor to be sliced. The length must be of rank(input).
-     * @param[in] strides          The strides of the dimensions of the input tensor to be sliced. The length must be of rank(input).
-     * @param[in] begin_mask       If the ith bit of begin_mask is set, starts[i] is ignored and the fullest possible range in that dimension is used instead.
-     * @param[in] end_mask         If the ith bit of end_mask is set, ends[i] is ignored and the fullest possible range in that dimension is used instead.
-     * @param[in] shrink_axis_mask If the ith bit of shrink_axis_mask is set, it implies that the ith specification shrinks the dimensionality by 1.
-     *                             A slice of size 1 starting from starts[i] in the dimension must be preserved.
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output,
-                           const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
-                           int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-
-private:
-    const ITensor *_input;         /**< Source tensor */
-    ITensor       *_output;        /**< Destination tensor */
-    Coordinates    _starts_abs;    /**< Absolute start coordinates */
-    Coordinates    _final_strides; /**< Final strides */
-    int32_t        _shrink_mask;   /**< Shrink axis mask */
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NE_STRIDED_SLICE_KERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NETableLookupKernel.h b/arm_compute/core/NEON/kernels/NETableLookupKernel.h
deleted file mode 100644
index 13a76cb40e..0000000000
--- a/arm_compute/core/NEON/kernels/NETableLookupKernel.h
+++ /dev/null
@@ -1,80 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NETABLELOOKUPKERNEL_H
-#define ARM_COMPUTE_NETABLELOOKUPKERNEL_H
-
-#include "arm_compute/core/NEON/INESimpleKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-class ILut;
-
-/** Interface for the kernel to perform table lookup calculations. */
-class NETableLookupKernel : public INESimpleKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NETableLookupKernel";
-    }
-    /** Default constructor */
-    NETableLookupKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NETableLookupKernel(const NETableLookupKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NETableLookupKernel &operator=(const NETableLookupKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NETableLookupKernel(NETableLookupKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NETableLookupKernel &operator=(NETableLookupKernel &&) = default;
-    /** Initialise the kernel's input, lut and output.
-     *
-     * @param[in]  input  An input tensor. Data types supported: U8/S16.
-     * @param[in]  lut    The input LUT.
-     * @param[out] output The output tensor. Data types supported: same as @p input
-     */
-    void configure(const ITensor *input, const ILut *lut, ITensor *output);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-
-private:
-    /** Perform table lookup on a given window.
-     *
-     * @param window window Region on which to execute the kernel.
-     */
-    template <class T>
-    void tableLookup(const Window &window);
-    /** Common signature for all the specialised lut functions
-     *
-     * @param[in] window Region on which to execute the kernel.
-     */
-    using TableLookupFunction = void (NETableLookupKernel::*)(const Window &window);
-    /** Sub function to use for the particular tensor types passed to configure() */
-    TableLookupFunction _func;
-    const ILut         *_lut;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NETABLELOOKUPKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NEThresholdKernel.h b/arm_compute/core/NEON/kernels/NEThresholdKernel.h
deleted file mode 100644
index a6d1e9071c..0000000000
--- a/arm_compute/core/NEON/kernels/NEThresholdKernel.h
+++ /dev/null
@@ -1,85 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NETHRESHOLDKERNEL_H
-#define ARM_COMPUTE_NETHRESHOLDKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-#include "arm_compute/core/Types.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for the thresholding kernel
- *
- */
-class NEThresholdKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEThresholdKernel";
-    }
-    /** Constructor
-     * Initialize all the pointers to nullptr and parameters to zero.
-     */
-    NEThresholdKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEThresholdKernel(const NEThresholdKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEThresholdKernel &operator=(const NEThresholdKernel &) = delete;
-    /** Initialise the kernel's input, output and threshold parameters.
-     *
-     * @param[in]  input       An input tensor. Data type supported: U8
-     * @param[out] output      The output tensor. Data type supported: U8.
-     * @param[in]  threshold   Threshold. When the threhold type is RANGE, this is used as the lower threshold.
-     * @param[in]  false_value value to set when the condition is not respected.
-     * @param[in]  true_value  value to set when the condition is respected.
-     * @param[in]  type        Thresholding type. Either RANGE or BINARY.
-     * @param[in]  upper       Upper threshold. Only used when the thresholding type is RANGE.
-     */
-    void configure(const ITensor *input, ITensor *output, uint8_t threshold, uint8_t false_value, uint8_t true_value, ThresholdType type, uint8_t upper);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-
-private:
-    /** run binary thresholding on the given window */
-    void run_binary(const Window &window);
-    /** run range thresholding on the given window */
-    void run_range(const Window &window);
-
-    void (NEThresholdKernel::*_func)(const Window &window);
-
-    const ITensor *_input;  /**< Input */
-    ITensor       *_output; /**< Output */
-    uint8_t        _threshold;
-    uint8_t        _false_value;
-    uint8_t        _true_value;
-    uint8_t        _upper;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NETHRESHOLDKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NETileKernel.h b/arm_compute/core/NEON/kernels/NETileKernel.h
deleted file mode 100644
index a64470ffd0..0000000000
--- a/arm_compute/core/NEON/kernels/NETileKernel.h
+++ /dev/null
@@ -1,76 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NETILEKERNEL_H
-#define ARM_COMPUTE_NETILEKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** NEON kernel to perform a tile operation */
-class NETileKernel : public INEKernel
-{
-public:
-    /** Default constructor */
-    NETileKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers). */
-    NETileKernel(const NETileKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers). */
-    NETileKernel &operator=(const NETileKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NETileKernel(NETileKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NETileKernel &operator=(NETileKernel &&) = default;
-    const char   *name() const override
-    {
-        return "NETileKernel";
-    }
-    /** Set the source, destination of the kernel
-     *
-     * @param[in]  input     Source tensor. Data type supported: All.
-     * @param[out] output    Destination tensor. Same as @p input
-     * @param[in]  multiples Contains the number of times the input tensor should be replicated on the given dimension.
-     */
-    void configure(const ITensor *input, ITensor *output, const Multiples &multiples);
-    /** Static function to check if given info will lead to a valid configuration of @ref NETileKernel
-     *
-     * @param[in] input     Source tensor info. Data type supported: All.
-     * @param[in] output    Destination tensor info. Same as @p input
-     * @param[in] multiples Contains the number of times the input tensor should be replicated on the given dimension.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const Multiples &multiples);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-
-private:
-    const ITensor *_input;
-    ITensor       *_output;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NETILEKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NETransposeKernel.h b/arm_compute/core/NEON/kernels/NETransposeKernel.h
deleted file mode 100644
index a14dece0d6..0000000000
--- a/arm_compute/core/NEON/kernels/NETransposeKernel.h
+++ /dev/null
@@ -1,90 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NETRANSPOSEKERNEL_H
-#define ARM_COMPUTE_NETRANSPOSEKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** NEON kernel which transposes the elements of a matrix.
- *
- * [width, height, batch] -> [height, width, batch]
- *
- */
-class NETransposeKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NETransposeKernel";
-    }
-    /** Default constructor */
-    NETransposeKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NETransposeKernel(const NETransposeKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NETransposeKernel &operator=(const NETransposeKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NETransposeKernel(NETransposeKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NETransposeKernel &operator=(NETransposeKernel &&) = default;
-    /** Default destructor */
-    ~NETransposeKernel() = default;
-
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  input  Input tensor. Data types supported: All
-     * @param[out] output Output tensor. Data type supported: Same as @p input
-     */
-    void configure(const ITensor *input, ITensor *output);
-    /** Static function to check if given info will lead to a valid configuration of @ref NETransposeKernel
-     *
-     * @param[in] input  Input tensor. Data types supported: All
-     * @param[in] output Output tensor. Data type supported: Same as @p input
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-
-private:
-    /** Common signature for all the transpose functions
-     *
-     * @param[in]  input  An input tensor. Data types supported: All
-     * @param[out] output The output tensor. Data type supported: same as @p input
-     * @param[in]  window Region on which to execute the kernel.
-     */
-    using TransposeFunction = void(const ITensor *input, ITensor *output, const Window &window);
-    /** Transpose function to use for the particular tensor types passed to configure() */
-    TransposeFunction *_func;
-    const ITensor     *_input;
-    ITensor           *_output;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NETRANSPOSEKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NEUpsampleLayerKernel.h b/arm_compute/core/NEON/kernels/NEUpsampleLayerKernel.h
deleted file mode 100644
index 1ea3f974e7..0000000000
--- a/arm_compute/core/NEON/kernels/NEUpsampleLayerKernel.h
+++ /dev/null
@@ -1,100 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEUPSAMPLELAYERKERNEL_H
-#define ARM_COMPUTE_NEUPSAMPLELAYERKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for the Upsample layer kernel.*/
-class NEUpsampleLayerKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEUpsampleLayerKernel";
-    }
-    /** Default constructor */
-    NEUpsampleLayerKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEUpsampleLayerKernel(const NEUpsampleLayerKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEUpsampleLayerKernel &operator=(const NEUpsampleLayerKernel &) = delete;
-    /** Default Move Constructor. */
-    NEUpsampleLayerKernel(NEUpsampleLayerKernel &&) = default;
-    /** Default move assignment operator */
-    NEUpsampleLayerKernel &operator=(NEUpsampleLayerKernel &&) = default;
-    /** Default destructor */
-    ~NEUpsampleLayerKernel() = default;
-    /** Set the input output tensors.
-     *
-     * @param[in]  input  Source tensor. Data types supported: QASYMM8/F16/F32.
-     * @param[out] output Destination tensor. Data types supported: same as @p input.
-     * @param[in]  info   Contains stride information described in @ref Size2D.
-     * @param[in]  policy Defines the policy to fill the intermediate pixels.
-     *
-     */
-    void configure(const ITensor *input, ITensor *output, const Size2D &info, const InterpolationPolicy policy);
-    /** Static function to check if given info will lead to a valid configuration of @ref NEUpsampleLayerKernel
-     *
-     * @param[in] input  Source tensor info. Data types supported: QASYMM8/F16/F32.
-     * @param[in] output Destination tensor info. Data types supported: same as @p input.
-     * @param[in] info   Contains stride information described in @ref Size2D.
-     * @param[in] policy Defines the policy to fill the intermediate pixels.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &info, const InterpolationPolicy policy);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-
-private:
-    /** Function to run upsample layer (NCHW)
-     *
-     * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
-     */
-    template <typename T, int S>
-    void upsample_nchw(const Window &window);
-    /** Function to run upsample layer (NHWC)
-     *
-     * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
-     */
-    template <typename T, int S>
-    void upsample_nhwc(const Window &window);
-
-    using UpsampleFunctionPtr = void (NEUpsampleLayerKernel::*)(const Window &window);
-
-private:
-    UpsampleFunctionPtr _func;
-    const ITensor      *_input;
-    ITensor            *_output;
-    Size2D              _info;
-    unsigned int        _num_elems_processed_per_iteration_x;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEUPSAMPLELAYERKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NEWarpKernel.h b/arm_compute/core/NEON/kernels/NEWarpKernel.h
deleted file mode 100644
index 61ca21eb48..0000000000
--- a/arm_compute/core/NEON/kernels/NEWarpKernel.h
+++ /dev/null
@@ -1,129 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEWARPKERNEL_H
-#define ARM_COMPUTE_NEWARPKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-#include "arm_compute/core/Types.h"
-
-#include <array>
-#include <cstdint>
-namespace arm_compute
-{
-class ITensor;
-
-/** Common interface for warp affine and warp perspective */
-class INEWarpKernel : public INEKernel
-{
-public:
-    /** Default constructor */
-    INEWarpKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    INEWarpKernel(const INEWarpKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    INEWarpKernel &operator=(const INEWarpKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    INEWarpKernel(INEWarpKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    INEWarpKernel &operator=(INEWarpKernel &&) = default;
-    /** Initialise the kernel's input, output and border mode.
-     *
-     * @param[in]  input                 Source tensor. Data type supported: U8.
-     * @param[out] output                Destination tensor. Data type supported: U8.
-     * @param[in]  matrix                The perspective or affine matrix to use. Must be 2x3 for affine and 3x3 for perspective of type float.
-     *                                   The matrix argument requires 9 values, for the affine case the last 3 values are ignored.
-     * @param[in]  border_mode           Strategy to use for borders
-     * @param[in]  constant_border_value Constant value used for filling the border.
-     */
-    virtual void configure(const ITensor *input, ITensor *output, const std::array<float, 9> &matrix, BorderMode border_mode, uint8_t constant_border_value);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-
-    // Inherited methods overridden:
-    BorderSize border_size() const override;
-
-protected:
-    /** function to perform warp affine or warp perspective on the given window when border mode == UNDEFINED
-     *
-     * @param[in] window Region on which to execute the kernel
-     */
-    virtual void warp_undefined(const Window &window) = 0;
-    /** function to perform warp affine or warp perspective on the given window when border mode == CONSTANT
-     *
-     * @param[in] window Region on which to execute the kernel
-     */
-    virtual void warp_constant(const Window &window) = 0;
-    /** function to perform warp affine or warp perspective on the given window when border mode == REPLICATE
-     *
-     * @param[in] window Region on which to execute the kernel
-     */
-    virtual void warp_replicate(const Window &window) = 0;
-    /** Common signature for all the specialised warp functions
-     *
-     * @param[in] window Region on which to execute the kernel.
-     */
-    void (INEWarpKernel::*_func)(const Window &window);
-
-    const ITensor *_input;                 /**< Input Tensor */
-    ITensor       *_output;                /**< Output Tensor */
-    uint8_t        _constant_border_value; /**< Constant value used for filling the border. This value is used for those pixels out of the ROI when the border mode is CONSTANT */
-    std::array<float, 9> _matrix;          /**< The affine or perspective matrix. Must be 2x3 for warp affine or 3x3 for warp perspective of type float. */
-};
-
-/** Template interface for the kernel to compute warp affine
- *
- */
-template <InterpolationPolicy interpolation>
-class NEWarpAffineKernel : public INEWarpKernel
-{
-private:
-    const char *name() const override
-    {
-        return "NEWarpAffineKernel";
-    }
-    // Inherited methods overridden:
-    void warp_undefined(const Window &window) override;
-    void warp_constant(const Window &window) override;
-    void warp_replicate(const Window &window) override;
-};
-
-/** Template interface for the kernel to compute warp perspective
- *
- */
-template <InterpolationPolicy interpolation>
-class NEWarpPerspectiveKernel : public INEWarpKernel
-{
-private:
-    const char *name() const override
-    {
-        return "NEWarpPerspectiveKernel";
-    }
-    // Inherited methods overridden:
-    void warp_undefined(const Window &window) override;
-    void warp_constant(const Window &window) override;
-    void warp_replicate(const Window &window) override;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEWARPKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h b/arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h
deleted file mode 100644
index b68cb50c7b..0000000000
--- a/arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h
+++ /dev/null
@@ -1,109 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEWEIGHTSRESHAPEKERNEL_H
-#define ARM_COMPUTE_NEWEIGHTSRESHAPEKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** NEON kernel to perform reshaping on the weights used by convolution and locally connected layer
- *
- * Rearranges each 3-dimensional kernel to a single row leading to a matrix with linearized kernels.
- * In combination with the @ref NEIm2ColKernel can transform a convolution to a matrix multiplication.
- *
- * For example assuming a 3D weight kernel of 3x3 dimensions and depth of 2 we have:
- * @f[
- * \left( \begin{array}{ccc}
- * a000 & a001 & a002 \\
- * a010 & a011 & a012 \\
- * a020 & a021 & a022 \\
- * \end{array} \right)
- * \left( \begin{array}{ccc}
- * a100 & a101 & a102 \\
- * a110 & a111 & a112 \\
- * a120 & a121 & a122 \\
- * \end{array} \right)
- * \rightarrow
- * \left( \begin{array}{ccccccccc}
- * a000 & a001 & a002 & a010 & a011 & a012 & a020 & a021 & a022 & a100 & a101 & a102 & a110 & a111 & a112 & a120 & a121 & a122 \\
- * \end{array} \right)
- * @f]
- */
-class NEWeightsReshapeKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEWeightsReshapeKernel";
-    }
-    /** Constructor.*/
-    NEWeightsReshapeKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEWeightsReshapeKernel(const NEWeightsReshapeKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEWeightsReshapeKernel &operator=(const NEWeightsReshapeKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NEWeightsReshapeKernel(NEWeightsReshapeKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NEWeightsReshapeKernel &operator=(NEWeightsReshapeKernel &&) = default;
-    /** Default destructor */
-    ~NEWeightsReshapeKernel() = default;
-    /** Set the input and output of the kernel.
-     *
-     * @param[in]  input  The input tensor to convert. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] if shared,
-     *                    and 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM, num_patches] if unshared.
-     *                    Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/BFLOAT16/FP16/F32
-     * @param[in]  bias   The shared biases tensor to append.  Bias is 1D tensor with dimensions [OFM] if shared and 2D tensor with
-     *                    dimensions [OFM, num_patches] if unshared. Data types supported: Same as @p input
-     *                    @warning Appending biases to weights reshaped matrix is not supported for quantized asymmetric types.
-     * @param[out] output The output tensor. Data types supported: Same as @p input
-     */
-    void configure(const ITensor *input, const ITensor *bias, ITensor *output);
-    /** Static function to check if given info will lead to a valid configuration of @ref NEWeightsReshapeKernel
-     *
-     * @param[in] input  The input tensor to convert. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] if shared,
-     *                   and 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM,  num_patches] if unshared.
-     *                   Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/BFLOAT16/F16/F32
-     * @param[in] biases The shared biases tensor to append.  Bias is 1D tensor with dimensions [OFM] if shared and 2D tensor with
-     *                   dimensions [OFM, num_patches] if unshared. Data types supported: Same as @p input
-     *                   @warning Appending biases to weights reshaped matrix is not supported for quantized asymmetric types.
-     * @param[in] output The output tensor. Should be a 2D Tensor. Data types supported: Same as @p input
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *biases, const ITensorInfo *output);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-
-private:
-    const ITensor *_input;
-    const ITensor *_bias;
-    ITensor       *_output;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEWEIGHTSRESHAPEKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NEWidthConcatenateLayerKernel.h b/arm_compute/core/NEON/kernels/NEWidthConcatenateLayerKernel.h
deleted file mode 100644
index f22f18f09f..0000000000
--- a/arm_compute/core/NEON/kernels/NEWidthConcatenateLayerKernel.h
+++ /dev/null
@@ -1,85 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef ARM_COMPUTE_NEWIDTHCONCATENATELAYERKERNEL_H
-#define ARM_COMPUTE_NEWIDTHCONCATENATELAYERKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-// Forward declarations
-class ITensor;
-
-/** Interface for the width concatenate kernel.
- *  The input tensor will be concatenated into the output tensor.
- */
-class NEWidthConcatenateLayerKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEWidthConcatenateLayerKernel";
-    }
-    /** Default constructor */
-    NEWidthConcatenateLayerKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEWidthConcatenateLayerKernel(const NEWidthConcatenateLayerKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEWidthConcatenateLayerKernel &operator=(const NEWidthConcatenateLayerKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NEWidthConcatenateLayerKernel(NEWidthConcatenateLayerKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NEWidthConcatenateLayerKernel &operator=(NEWidthConcatenateLayerKernel &&) = default;
-    /** Default destructor */
-    ~NEWidthConcatenateLayerKernel() = default;
-    /** Initialise the kernel's inputs and output
-     *
-     * @param[in]     input        Input tensor. Data types supported: All
-     * @param[in]     width_offset The offset on the X axis.
-     * @param[in,out] output       Output tensor. Data types supported: Same as @p input.
-     *
-     */
-    void configure(const ITensor *input, unsigned int width_offset, ITensor *output);
-    /**  Static function to check if given info will lead to a valid configuration of @ref NEWidthConcatenateLayerKernel
-     *
-     * @param[in] input        Input tensor info. Data types supported: All
-     * @param[in] width_offset The offset on the X axis.
-     * @param[in] output       Output tensor info. Data types supported: Same as @p input.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, unsigned int width_offset, const ITensorInfo *output);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-
-private:
-    const ITensor *_input;
-    ITensor       *_output;
-    unsigned int   _width_offset;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NEWIDTHCONCATENATELAYERKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NEWinogradConvolutionLayerKernel.h b/arm_compute/core/NEON/kernels/NEWinogradConvolutionLayerKernel.h
deleted file mode 100644
index 1740df0312..0000000000
--- a/arm_compute/core/NEON/kernels/NEWinogradConvolutionLayerKernel.h
+++ /dev/null
@@ -1,596 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEGEMMWINOGRADCONVOLUTIONLAYERKERNEL_H
-#define ARM_COMPUTE_NEGEMMWINOGRADCONVOLUTIONLAYERKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-#include "arm_compute/core/NEON/kernels/convolution/common/convolution.hpp"
-#include "arm_compute/core/NEON/kernels/convolution/common/tensor.hpp"
-#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_layer.hpp"
-
-namespace arm_compute
-{
-// Forward declarations
-class ITensor;
-
-/** Interface for the NEON kernel to perform Winograd input transform. */
-class INEWinogradLayerTransformInputKernel : public INEKernel
-{
-public:
-    /** Get the working space required to perform the transformation.
-     *
-     * Note, the working space is only required when performing the
-     * transformation - hence it can be reused whenever the transformation is
-     * not running.
-     *
-     * @param num_threads The greatest number of threads that will be used to execute the transform.
-     * @return Size of working space required in bytes.
-     */
-    virtual unsigned int get_working_space_size(unsigned int num_threads) const = 0;
-
-    /** Determine how much memory (in units of TIn) to allocate for the
-     * transformed input.
-     *
-     * @param[in] num_batches  Number of batches in the input tensor.
-     * @param[in] num_channels Number of feature maps in the input tensor.
-     * @param[in] num_rows     Number of rows in each feature map.
-     * @param[in] num_cols     Number of columns in each feature map.
-     * @param[in] same_padding Use "SAME" padding, otherwise use "VALID".
-     *
-     * @return Storage size (in units of TIn) required.
-     */
-    virtual unsigned int get_input_storage_size(int num_batches, int num_channels, int num_rows, int num_cols, bool same_padding) const = 0;
-
-    /** Gets the stride between matrices in the input worspace
-     *
-     * @param[in] num_batches  Number of batches in the input tensor.
-     * @param[in] num_channels Number of feature maps in the input tensor.
-     * @param[in] num_rows     Number of rows in each feature map.
-     * @param[in] num_cols     Number of columns in each feature map.
-     * @param[in] same_padding Use "SAME" padding, otherwise use "VALID".
-     *
-     * @return Stride expressed in bytes.
-     */
-    virtual int get_matrix_stride(int num_batches, int num_channels, int num_rows, int num_cols, bool same_padding) const = 0;
-
-    /** Configure the output transform kernel.
-     *
-     * @param[in]  input_nhwc    Input tensor in NHWC data layout format.
-     * @param[in]  num_batches   Number of batches in input tensor.
-     * @param[in]  num_rows      Number of rows in input tensor.
-     * @param[in]  num_cols      Number of columns in input tensor.
-     * @param[in]  num_channels  Number of channels in input tensor.
-     * @param[in]  padding       Padding type.
-     * @param[out] output        Base of output matrices.
-     * @param[in]  matrix_stride Stride between output matrices.
-     * @param[in]  workspace     Tensor to be used as the working space during the computation.
-     */
-    virtual void configure(const ITensor *input_nhwc, const int num_batches, const int num_rows, const int num_cols, const int num_channels,
-                           const PaddingType padding, ITensor *output, const int matrix_stride, ITensor *workspace) = 0;
-
-    /** Destructor */
-    virtual ~INEWinogradLayerTransformInputKernel()
-    {
-    }
-};
-
-/** NEON kernel to perform Winograd input transform. */
-template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
-class NEWinogradLayerTransformInputKernel : public INEWinogradLayerTransformInputKernel
-{
-public:
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEWinogradLayerTransformInputKernel(const NEWinogradLayerTransformInputKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEWinogradLayerTransformInputKernel &operator=(const NEWinogradLayerTransformInputKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NEWinogradLayerTransformInputKernel(NEWinogradLayerTransformInputKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NEWinogradLayerTransformInputKernel &operator=(NEWinogradLayerTransformInputKernel &&) = default;
-    /** Default destructor */
-    ~NEWinogradLayerTransformInputKernel() = default;
-
-    /** Determine how much memory (in units of TIn) to allocate for the
-     * transformed input.
-     *
-     * @param[in] num_batches  Number of batches in the input tensor.
-     * @param[in] num_channels Number of feature maps in the input tensor.
-     * @param[in] num_rows     Number of rows in each feature map.
-     * @param[in] num_cols     Number of columns in each feature map.
-     * @param[in] same_padding Use "SAME" padding, otherwise use "VALID".
-     *
-     * @return Storage size (in units of TIn) required.
-     */
-    unsigned int get_input_storage_size(
-        int  num_batches,
-        int  num_channels,
-        int  num_rows,
-        int  num_cols,
-        bool same_padding) const override;
-
-    /** Get the working space required to perform the transformation.
-     *
-     * Note, the working space is only required when performing the
-     * transformation - hence it can be reused whenever the transformation is
-     * not running.
-     *
-     * @param[in] num_threads The greatest number of threads that will be used to execute the transform.
-     *
-     * @return Size of working space required in bytes.
-     */
-    unsigned int get_working_space_size(unsigned int num_threads) const override;
-
-    /** Gets the stride between matrices in the input worspace
-     *
-     * @param[in] num_batches  Number of batches in the input tensor.
-     * @param[in] num_channels Number of feature maps in the input tensor.
-     * @param[in] num_rows     Number of rows in each feature map.
-     * @param[in] num_cols     Number of columns in each feature map.
-     * @param[in] same_padding Use "SAME" padding, otherwise use "VALID".
-     *
-     * @return Stride expressed in bytes.
-     */
-    int get_matrix_stride(
-        int  num_batches,
-        int  num_channels,
-        int  num_rows,
-        int  num_cols,
-        bool same_padding) const override;
-
-    /** Default constructor */
-    NEWinogradLayerTransformInputKernel();
-
-    const char *name() const override
-    {
-        return "NEWinogradLayerTransformInputKernel";
-    }
-
-    /** Configure the output transform kernel.
-     *
-     * @param[in]  input_nhwc    Input tensor.  Data types supported: F16/F32. Layout supported NHWC.
-     * @param[in]  num_batches   Number of batches in input tensor.
-     * @param[in]  num_rows      Number of rows in input tensor.
-     * @param[in]  num_cols      Number of columns in input tensor.
-     * @param[in]  num_channels  Number of channels in input tensor.
-     * @param[in]  padding       Padding type.
-     * @param[out] output        Base of output matrices.
-     * @param[in]  matrix_stride Stride between output matrices.
-     * @param[in]  workspace     Tensor to be used as the working space during the computation.
-     */
-    void configure(
-        const ITensor    *input_nhwc,
-        const int         num_batches,
-        const int         num_rows,
-        const int         num_cols,
-        const int         num_channels,
-        const PaddingType padding,
-        ITensor          *output,
-        const int         matrix_stride,
-        ITensor          *workspace) override;
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-
-    /** Winograd base kernel */
-    using WinogradBase = winograd::WinogradGEMM<OutputTileRows, OutputTileCols, KernelRows, KernelCols, winograd::WinogradRoots::Integers>;
-    /** Winograd convolution kernel */
-    using WinogradConv = typename WinogradBase::template Convolution<T, T>;
-
-    /** Static function to check if given info will lead to a valid configuration of @ref NEWinogradLayerTransformInputKernel
-     *
-     * @param[in] input         First tensor input info. Data types supported: F16/F32.
-     * @param[in] output        Output tensor info. Data types supported: same as @p input.
-     * @param[in] winograd_info Contains Winograd's information described in @ref WinogradInfo
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const WinogradInfo &winograd_info);
-
-private:
-    using InputTransform = typename WinogradBase::template InputTransform<T, T>;
-
-    std::unique_ptr<InputTransform> _transform{ nullptr };
-    const ITensor                  *_input_nhwc;
-    int                             _num_batches;    /**< Number of batches in input tensor. */
-    int                             _num_rows;       /**< Number of rows in input tensor. */
-    int                             _num_cols;       /**< Number of columns in input tensor. */
-    int                             _num_channels;   /**< Number of channels in input tensor. */
-    PaddingType                     _padding;        /**< Padding type. */
-    ITensor                        *_output;         /**< Base of output matrices. */
-    int                             _matrix_stride;  /**< Stride between output matrices. */
-    int                             _padding_top;    /**< Padding to apply to the top of the image. */
-    int                             _padding_left;   /**< Padding to apply to the left of the image. */
-    int                             _padding_right;  /**< Padding to apply to the right of the image. */
-    int                             _padding_bottom; /**< Padding to apply to the bottom of the image. */
-    ITensor                        *_workspace;
-};
-
-/** Interface for the NEON kernel to perform Winograd output transform. */
-class INEWinogradLayerTransformOutputKernel : public INEKernel
-{
-public:
-    /** Get the working space required to perform the transformation.
-     *
-     * Note, the working space is only required when performing the
-     * transformation - hence it can be reused whenever the transformation is
-     * not running.
-     *
-     * @param[in] num_threads The greatest number of threads that will be used to execute the transform.
-     *
-     * @return Size of working space required in bytes.
-     */
-    virtual unsigned int get_working_space_size(unsigned int num_threads) const = 0;
-
-    /** Determine how much memory (in units of TOut) to allocate for the
-     * (Winograd domain) output.
-     *
-     * @param[in] num_batches         Number of batches in the output tensor.
-     * @param[in] num_rows            Number of rows in each feature map of the input tensor.
-     * @param[in] num_cols            Number of columns in each feature map of the input tensor.
-     * @param[in] num_output_channels Number of feature maps in the output tensor.
-     *
-     * @return Storage size (in units of TOut) required.
-     */
-    virtual unsigned int get_output_storage_size(int num_batches, int num_rows, int num_cols, int num_output_channels) const = 0;
-
-    /** Gets the stride between matrices in the output worspace
-     *
-     * @param[in] num_batches         Number of batches in the output tensor.
-     * @param[in] num_rows            Number of rows in each feature map of the input tensor.
-     * @param[in] num_cols            Number of columns in each feature map of the input tensor.
-     * @param[in] num_output_channels Number of feature maps in the output tensor.
-     *
-     * @return Stride expressed in bytes.
-     */
-    virtual int get_matrix_stride(int num_batches, int num_rows, int num_cols, int num_output_channels) const = 0;
-
-    /** Get the output shape of a convolution.
-     *
-     * @param[in] num_rows     Number of rows in each feature map of the input tensor.
-     * @param[in] num_cols     Number of columns in each feature map of the input tensor.
-     * @param[in] padding_same True if padding is SAME, false otherwise
-     *
-     * @return Shape of the output tensor
-     */
-    virtual std::pair<unsigned int, unsigned int> get_output_shape(
-        int  num_rows,    /* Number of rows in each feature map of the input tensor. */
-        int  num_cols,    /* Number of columns in each feature map of the input tensor. */
-        bool padding_same /* True if padding is SAME, false otherwise */
-    ) const = 0;
-
-    /** Configure the output transform kernel.
-     *
-     * @param[in]  biases             Pointer to the biases tensor.
-     * @param[in]  transformed_output Pointer to working space for the output tensor in the Winograd domain.
-     * @param[in]  matrix_stride      Output matrix stride, can be computed with winograd::WinogradGEMM<2, 2, 3, 3>::Convolution<float, float>::get_output_matrix_stride()
-     * @param[out] output_nhwc        Pointer to a tensor in NHWC data layout ordered output tensor, in the spatial domain.
-     * @param[in]  num_batches        Number of batches in the input tensor.
-     * @param[in]  num_rows           Number of rows in output tensor.
-     * @param[in]  num_cols           Number of columns in output tensor.
-     * @param[in]  num_channels       Number of feature maps in the output tensor.
-     * @param[in]  workspace          Tensor to be used as the working space during the computation.
-     * @param[in]  activation         Activation to be used
-     */
-    virtual void configure(
-        const ITensor              *biases,
-        const ITensor              *transformed_output,
-        const int                   matrix_stride,
-        ITensor                    *output_nhwc,
-        const int                   num_batches,
-        const int                   num_rows,
-        const int                   num_cols,
-        const int                   num_channels,
-        ITensor                    *workspace,
-        const arm_gemm::Activation &activation) = 0;
-
-    virtual ~INEWinogradLayerTransformOutputKernel()
-    {
-    }
-};
-
-/** NEON kernel to perform Winograd output transform. */
-template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
-class NEWinogradLayerTransformOutputKernel : public INEWinogradLayerTransformOutputKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEWinogradLayerTransformOutputKernel";
-    }
-    /** Constructor */
-    NEWinogradLayerTransformOutputKernel();
-
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEWinogradLayerTransformOutputKernel(const NEWinogradLayerTransformOutputKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEWinogradLayerTransformOutputKernel &operator=(const NEWinogradLayerTransformOutputKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NEWinogradLayerTransformOutputKernel(NEWinogradLayerTransformOutputKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NEWinogradLayerTransformOutputKernel &operator=(NEWinogradLayerTransformOutputKernel &&) = default;
-    /** Default destructor */
-    ~NEWinogradLayerTransformOutputKernel() = default;
-
-    // Inherited methods overridden:
-    /** Determine how much memory (in units of TOut) to allocate for the
-     * (Winograd domain) output.
-     *
-     * @param[in] num_batches         Number of batches in the output tensor.
-     * @param[in] num_rows            Number of rows in each feature map of the input tensor.
-     * @param[in] num_cols            Number of columns in each feature map of the input tensor.
-     * @param[in] num_output_channels Number of feature maps in the output tensor.
-     *
-     * @return Storage size (in units of TOut) required.
-     */
-    unsigned int get_output_storage_size(int num_batches, int num_rows, int num_cols, int num_output_channels) const override;
-
-    /** Gets the stride between matrices in the output worspace
-     *
-     * @param[in] num_batches         Number of batches in the output tensor.
-     * @param[in] num_rows            Number of rows in each feature map of the input tensor.
-     * @param[in] num_cols            Number of columns in each feature map of the input tensor.
-     * @param[in] num_output_channels Number of feature maps in the output tensor.
-     *
-     * @return Stride expressed in bytes.
-     */
-    int get_matrix_stride(int num_batches, int num_rows, int num_cols, int num_output_channels) const override;
-    /** Get the output shape of a convolution.
-     *
-     * @param[in] num_rows     Number of rows in each feature map of the input tensor.
-     * @param[in] num_cols     Number of columns in each feature map of the input tensor.
-     * @param[in] padding_same True if padding is SAME, false otherwise
-     *
-     * @return Shape of the output tensor
-     */
-    std::pair<unsigned int, unsigned int> get_output_shape(
-        int  num_rows, /* Number of rows in each feature map of the input tensor. */
-        int  num_cols, /* Number of columns in each feature map of the input tensor. */
-        bool padding_same) const override;
-
-    /** Get the working space required to perform the transformation.
-     *
-     * Note, the working space is only required when performing the
-     * transformation - hence it can be reused whenever the transformation is
-     * not running.
-     *
-     * @param[in] num_threads The greatest number of threads that will be used to execute the transform.
-     *
-     * @return Size of working space required in bytes.
-     */
-    unsigned int get_working_space_size(unsigned int num_threads) const override;
-
-    /** Configure the output transform kernel.
-     *
-     * @param[in]  biases             Pointer to the biases tensor.
-     * @param[in]  transformed_output Pointer to working space for the output tensor in the Winograd domain.
-     * @param[in]  matrix_stride      Output matrix stride, can be computed with winograd::WinogradGEMM<2, 2, 3, 3>::Convolution<float, float>::get_output_matrix_stride()
-     * @param[out] output_nhwc        Pointer to a tensor with NHWC data layout, in the spatial domain.
-     * @param[in]  num_batches        Number of batches in the input tensor.
-     * @param[in]  num_rows           Number of rows in output tensor.
-     * @param[in]  num_cols           Number of columns in output tensor.
-     * @param[in]  num_channels       Number of feature maps in the output tensor.
-     * @param[in]  workspace          Tensor to be used as the working space during the computation.
-     * @param[in]  activation         Activation to be used
-     */
-    void configure(
-        const ITensor              *biases,
-        const ITensor              *transformed_output,
-        const int                   matrix_stride,
-        ITensor                    *output_nhwc,
-        const int                   num_batches,
-        const int                   num_rows,
-        const int                   num_cols,
-        const int                   num_channels,
-        ITensor                    *workspace,
-        const arm_gemm::Activation &activation) override;
-
-    void run(const Window &window, const ThreadInfo &info) override;
-
-    /** Static function to check if given info will lead to a valid configuration of @ref NEWinogradLayerTransformOutputKernel
-     *
-     * @param[in] input         Source tensor info with shape [C, N, 16, batches] or [C, N, 36, batches]. Data types supported: F16/F32.
-     * @param[in] bias          Biases tensor info. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. It can be a nullptr. Data type supported: as @p input
-     * @param[in] output        Destination tensor info with shape [output_convolved_dims.width, output_convolved_dims.height, C, batches]. Data type supported: same as @p input
-     * @param[in] winograd_info Contains Winograd's information described in @ref WinogradInfo
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const WinogradInfo &winograd_info);
-
-private:
-    using WinogradBase    = winograd::WinogradGEMM<OutputTileRows, OutputTileCols, KernelRows, KernelCols, winograd::WinogradRoots::Integers>;
-    using WinogradConv    = typename WinogradBase::template Convolution<T, T>;
-    using OutputTransform = typename WinogradBase::template OutputTransform<T, T>;
-
-    std::unique_ptr<OutputTransform> _transform{ nullptr };
-    const ITensor                   *_biases;
-    const ITensor                   *_transformed_output;
-    ITensor                         *_workspace;
-    int                              _matrix_stride;
-    int                              _matrix_row_stride;
-    ITensor                         *_output_nhwc;
-    int                              _num_batches;
-    int                              _num_rows;
-    int                              _num_cols;
-    int                              _num_channels;
-};
-
-/** Interface for the NEON kernel to perform Winograd weights transform. */
-class INEWinogradLayerTransformWeightsKernel : public INEKernel
-{
-public:
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    INEWinogradLayerTransformWeightsKernel(const INEWinogradLayerTransformWeightsKernel &) = default;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    INEWinogradLayerTransformWeightsKernel &operator=(const INEWinogradLayerTransformWeightsKernel &) = default;
-    /** Allow instances of this class to be moved */
-    INEWinogradLayerTransformWeightsKernel(INEWinogradLayerTransformWeightsKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    INEWinogradLayerTransformWeightsKernel &operator=(INEWinogradLayerTransformWeightsKernel &&) = default;
-
-    INEWinogradLayerTransformWeightsKernel()
-    {
-    }
-    virtual ~INEWinogradLayerTransformWeightsKernel()
-    {
-    }
-    /** Determine how much memory (in units of T) to allocate for the
-     * transformed weights.
-     *
-     * @param[in] num_output_channels Number of output feature maps.
-     * @param[in] num_input_channels  Number of input feature maps.
-     *
-     * @return Storage size (in units of T) required.
-     */
-    virtual unsigned int get_weight_storage_size(int num_output_channels, int num_input_channels) const = 0;
-    /** Gets the stride between matrices in the kernel worspace
-     *
-     * @param[in] num_output_channels Number of output feature maps.
-     * @param[in] num_input_channels  Number of input feature maps.
-     *
-     * @return Stride expressed in bytes.
-     */
-    virtual int get_matrix_stride(int num_output_channels, int num_input_channels) const = 0;
-
-    /** Configure the weights transform kernel.
-     *
-     * @param[in]  weights_hwio        Pointer to the weights tensor
-     * @param[out] output              Pointer to working space for the output tensor in the Winograd domain.
-     * @param[in]  matrix_stride       Stride across matrices in the output workspace.
-     * @param[in]  num_output_channels Number of filters.
-     * @param[in]  num_input_channels  Number of channels in each filter.
-     */
-
-    virtual void configure(const ITensor *weights_hwio, ITensor *output, const int matrix_stride, const int num_output_channels, const int num_input_channels) = 0;
-
-    /** Static function to check if given info will lead to a valid configuration of @ref NEWinogradLayerTransformWeightsKernel
-     *
-     * @param[in] input   First tensor input info. Data types supported: F16/F32.
-     * @param[in] weights Weights tensor info. Data types supported: same as @p input.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *weights);
-};
-
-/** NEON kernel to perform Winograd weights transform. */
-template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
-class NEWinogradLayerTransformWeightsKernel final : public INEWinogradLayerTransformWeightsKernel
-{
-public:
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEWinogradLayerTransformWeightsKernel(const NEWinogradLayerTransformWeightsKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEWinogradLayerTransformWeightsKernel &operator=(const NEWinogradLayerTransformWeightsKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NEWinogradLayerTransformWeightsKernel(NEWinogradLayerTransformWeightsKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NEWinogradLayerTransformWeightsKernel &operator=(NEWinogradLayerTransformWeightsKernel &&) = default;
-    /** Default destructor */
-    ~NEWinogradLayerTransformWeightsKernel() = default;
-
-    /** Default constructor. */
-    NEWinogradLayerTransformWeightsKernel();
-    const char *name() const override
-    {
-        return "NEWinogradLayerTransformWeightsKernel";
-    }
-
-    /** Static function to check if given info will lead to a valid configuration of @ref NEWinogradLayerTransformWeightsKernel
-     *
-     * @param[in] input         Source tensor info. The input is a 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] (NCHW data layout).
-     *                          kernel_x must be 3 and equal to kernel_y. Data types supported: F16/F32.
-     * @param[in] output        Destination tensor info. The output is a 3D tensor with dimensions [OFM, IFM, 16] or [OFM, IFM, 36]. Data type supported: same as @p input
-     * @param[in] winograd_info Contains Winograd's information described in @ref WinogradInfo
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const WinogradInfo &winograd_info);
-
-    // Inherited methods overridden:
-
-#ifndef DOXYGEN_SKIP_THIS
-    /** Configure the weights transform kernel.
-     *
-     * @param[in]  weights_hwio        Pointer to the weights tensor
-     * @param[out] output              Pointer to working space for the output tensor in the Winograd domain.
-     * @param[in]  matrix_stride       Stride across matrices in the output workspace.
-     * @param[in]  num_output_channels Number of filters.
-     * @param[in]  num_input_channels  Number of channels in each filter.
-     */
-    void configure(const ITensor *weights_hwio, ITensor *output, const int matrix_stride, const int num_output_channels, const int num_input_channels) override;
-#endif /* DOXYGEN_SKIP_THIS */
-
-    /** Determine how much memory (in units of T) to allocate for the
-     * transformed weights.
-     *
-     * @param[in] num_output_channels Number of output feature maps.
-     * @param[in] num_input_channels  Number of input feature maps.
-     *
-     * @return Storage size (in units of T) required.
-     */
-    unsigned int get_weight_storage_size(int num_output_channels, int num_input_channels) const override;
-
-    /** Gets the stride between matrices in the input worspace
-     *
-     * @param[in] num_output_channels Number of output feature maps.
-     * @param[in] num_input_channels  Number of input feature maps.
-     *
-     * @return Stride expressed in bytes.
-     */
-    int get_matrix_stride(int num_output_channels, int num_input_channels) const override;
-    void run(const Window &window, const ThreadInfo &info) override;
-    bool is_parallelisable() const override;
-
-private:
-    using WinogradBase     = winograd::WinogradGEMM<OutputTileRows, OutputTileCols, KernelRows, KernelCols, winograd::WinogradRoots::Integers>;
-    using WinogradConv     = typename WinogradBase::template Convolution<T, T>;
-    using WeightsTransform = typename WinogradBase::template WeightsTransform<T, T>;
-
-    std::unique_ptr<WeightsTransform> _transform{ nullptr };
-    const ITensor                    *_weights_hwio;
-    ITensor                          *_output;
-    int                               _matrix_stride;
-    int                               _num_output_channels;
-    int                               _num_input_channels;
-};
-
-/** NEON kernel to perform Winograd. */
-template <typename TIn, typename TOut, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
-class NEWinogradLayerConfiguration
-{
-public:
-    /** Winograd base kernel */
-    using WinogradBase = winograd::WinogradGEMM<OutputTileRows, OutputTileCols, KernelRows, KernelCols, winograd::WinogradRoots::Integers>;
-    /** Winograd convolution kernel */
-
-    using WinogradConv = typename WinogradBase::template Convolution<TIn, TOut>;
-
-    using TransformInputKernel   = NEWinogradLayerTransformInputKernel<TIn, OutputTileRows, OutputTileCols, KernelRows, KernelCols>;
-    using TransformWeightsKernel = NEWinogradLayerTransformWeightsKernel<TIn, OutputTileRows, OutputTileCols, KernelRows, KernelCols>;
-    using TransformOutputKernel  = NEWinogradLayerTransformOutputKernel<TOut, OutputTileRows, OutputTileCols, KernelRows, KernelCols>;
-};
-
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEGEMMWINOGRADCONVOLUTIONLAYERKERNEL_H*/
diff --git a/arm_compute/core/NEON/kernels/NEYOLOLayerKernel.h b/arm_compute/core/NEON/kernels/NEYOLOLayerKernel.h
deleted file mode 100644
index 0fd3f8ce67..0000000000
--- a/arm_compute/core/NEON/kernels/NEYOLOLayerKernel.h
+++ /dev/null
@@ -1,106 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEYOLOLAYERKERNEL_H
-#define ARM_COMPUTE_NEYOLOLAYERKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for the YOLO layer kernel. */
-class NEYOLOLayerKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEYOLOLayerKernel";
-    }
-    /** Constructor */
-    NEYOLOLayerKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEYOLOLayerKernel(const NEYOLOLayerKernel &) = delete;
-    /** Default move constructor */
-    NEYOLOLayerKernel(NEYOLOLayerKernel &&) = default;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEYOLOLayerKernel &operator=(const NEYOLOLayerKernel &) = delete;
-    /** Default move assignment operator */
-    NEYOLOLayerKernel &operator=(NEYOLOLayerKernel &&) = default;
-    /** Default destructor */
-    ~NEYOLOLayerKernel() = default;
-    /** Set the input and output tensor.
-     *
-     * @note If the output tensor is a nullptr or is equal to the input, the activation function will be performed in-place
-     *
-     * @param[in, out] input       Source tensor. In case of @p output tensor = nullptr, this tensor will store the result
-     *                             of the activation function. Data types supported: F16/F32.
-     * @param[out]     output      Destination tensor. Data type supported: same as @p input
-     * @param[in]      act_info    Activation layer parameters.
-     * @param[in]      num_classes Number of classes to activate (must be submultiple of @p input channels)
-     */
-    void configure(ITensor *input, ITensor *output, const ActivationLayerInfo &act_info, int32_t num_classes);
-    /** Static function to check if given info will lead to a valid configuration of @ref NEYOLOLayerKernel
-     *
-     * @param[in] input       Source tensor info. In case of @p output tensor info = nullptr, this tensor will store the result
-     *                        of the activation function. Data types supported: F16/F32.
-     * @param[in] output      Destination tensor info. Data type supported: same as @p input
-     * @param[in] act_info    Activation layer information.
-     * @param[in] num_classes Number of classes to activate (must be submultiple of @p input channels)
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info, int32_t num_classes);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-
-private:
-    /** Function to run YOLO layer
-     *
-     * @param[in] window Region on which to execute the kernel.
-     */
-    template <typename T, int S>
-    void yolo_layer_nchw(const Window &window);
-    /** Function to run YOLO layer on tensors with NHWC format
-     *
-     * @param[in] window Region on which to execute the kernel.
-     */
-    template <typename T>
-    void yolo_layer_nhwc(const Window &window);
-    /** Common signature for all the yolo layer functions
-     *
-     * @param[in] window Region on which to execute the kernel.
-     */
-    using YOLOFunctionPtr = void (NEYOLOLayerKernel::*)(const Window &window);
-
-private:
-    YOLOFunctionPtr     _func;
-    ITensor            *_input;
-    ITensor            *_output;
-    ActivationLayerInfo _act_info;
-    int32_t             _num_classes;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEYOLOLAYERKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/arm_gemm/ndrange.hpp b/arm_compute/core/NEON/kernels/arm_gemm/ndrange.hpp
deleted file mode 100644
index 4ff83fbc51..0000000000
--- a/arm_compute/core/NEON/kernels/arm_gemm/ndrange.hpp
+++ /dev/null
@@ -1,185 +0,0 @@
-/*
- * Copyright (c) 2019-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#include <array>
-#include <algorithm>
-#include <initializer_list>
-
-#include <cassert>
-
-namespace arm_gemm {
-
-template<unsigned int D>
-class NDRange {
-private:
-    std::array<unsigned int, D> m_sizes {};
-    std::array<unsigned int, D> m_totalsizes {};
-
-    class NDRangeIterator {
-    private:
-        const NDRange &m_parent;
-        unsigned int m_pos = 0;
-        unsigned int m_end = 0;
-
-    public:
-        NDRangeIterator(const NDRange &p, unsigned int s, unsigned int e) : m_parent(p), m_pos(s), m_end(e) { }
-
-        bool done() const {
-            return (m_pos >= m_end);
-        }
-
-        unsigned int dim(unsigned int d) const {
-            unsigned int r = m_pos;
-
-            if (d < (D - 1)) {
-                r %= m_parent.m_totalsizes[d];
-            }
-
-            if (d > 0) {
-                r /= m_parent.m_totalsizes[d-1];
-            }
-
-            return r;
-        }
-
-        bool next_dim0() {
-            m_pos++;
-
-            return !done();
-        }
-
-        bool next_dim1() {
-            m_pos += m_parent.m_sizes[0] - dim(0);
-
-            return !done();
-        }
-
-        unsigned int dim0_max() const {
-            unsigned int offset = std::min(m_end - m_pos, m_parent.m_sizes[0] - dim(0));
-
-            return dim(0) + offset;
-        }
-    };
-
-public:
-    NDRange& operator=(const NDRange& rhs)=default;
-    NDRange(const NDRange& rhs)           =default;
-
-    template <typename... T>
-    NDRange(T... ts)
-    : m_sizes{ts...}
-    {
-        unsigned int t=1;
-
-        for (unsigned int i=0; i<D; i++) {
-            t *= m_sizes[i];
-
-            m_totalsizes[i] = t;
-        }
-    }
-
-    NDRange(const std::array<unsigned int, D>& n)
-    : m_sizes(n)
-    {
-        unsigned int t=1;
-
-        for (unsigned int i=0; i<D; i++) {
-            t *= m_sizes[i];
-
-            m_totalsizes[i] = t;
-        }
-    }
-
-    NDRangeIterator iterator(unsigned int start, unsigned int end) const {
-        return NDRangeIterator(*this, start, end);
-    }
-
-    unsigned int total_size() const {
-        return m_totalsizes[D - 1];
-    }
-
-    unsigned int get_size(unsigned int v) const {
-        return m_sizes[v];
-    }
-};
-
-/** NDCoordinate builds upon a range, but specifies a starting position
- * in addition to a size which it inherits from NDRange
- */
-template<unsigned int N>
-class NDCoordinate : public NDRange<N> {
-    using int_t     =unsigned int;
-    using ndrange_t = NDRange<N>;
-
-    std::array<int_t, N> m_positions {};
-public:
-    NDCoordinate& operator=(const NDCoordinate& rhs)=default;
-    NDCoordinate(const NDCoordinate& rhs)           =default;
-    NDCoordinate(const std::initializer_list<std::pair<int_t, int_t>>& list)
-    {
-        std::array<int_t, N> sizes{};
-
-        std::size_t i = 0;
-        for(auto& p : list) {
-            m_positions[i]= p.first;
-            sizes[i++]    = p.second;
-        }
-
-        //update the parents sizes
-        static_cast<ndrange_t&>(*this) = ndrange_t(sizes);
-    }
-
-    int_t get_position(int_t d) const {
-        assert(d < m_positions.size());
-        return m_positions[d];
-    }
-
-    void set_position(int_t d, int_t v) {
-        assert(d < size(m_positions));
-        assert(v < ndrange_t::get_size(d));
-
-        m_positions[d] = v;
-    }
-
-    int_t get_position_end(int_t d) const {
-        return get_position(d) + NDRange<N>::get_size(d);
-    }
-}; //class NDCoordinate
-
-/** @returns the number of dimensions in the NDRange which have none-1 values
- * IE there is actual work in these dimensions that can be broken up
- */
-template<unsigned int N>
-std::size_t ndrange_popcount(const NDRange<N>& ndr) {
-    std::size_t count = 0;
-
-    for(unsigned int d = 0; d != N; ++d) {
-        if(ndr.get_size(d) != 1)
-            ++count;
-    }
-    return count;
-}
-
-} // namespace arm_gemm
diff --git a/arm_compute/core/NEON/kernels/assembly/Helpers.h b/arm_compute/core/NEON/kernels/assembly/Helpers.h
deleted file mode 100644
index 9372e05295..0000000000
--- a/arm_compute/core/NEON/kernels/assembly/Helpers.h
+++ /dev/null
@@ -1,122 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_ASSEMBLY_HELPERS_H
-#define ARM_COMPUTE_ASSEMBLY_HELPERS_H
-
-#include "arm_compute/core/CPP/CPPTypes.h"
-#include "arm_compute/core/Utils.h"
-
-#include "arm_compute/core/NEON/kernels/assembly/INEGEMMWrapperKernel.h"
-#include "arm_compute/core/NEON/kernels/assembly/arm_gemm.hpp"
-
-namespace arm_compute
-{
-/** Block sizes to use to break the M, N, K dimension */
-struct BlockSizes
-{
-    unsigned int k_block{ 0 };             /**< Block size alon the K dimension */
-    unsigned int x_block{ 0 };             /**< Block size along the N (x) dimension */
-    unsigned int m_round{ 0 };             /**< Block size along the M dimension (Must be a multiple of strategy_out_height) */
-    unsigned int strategy_out_height{ 0 }; /**< Number of rows (M) processed by the selected strategy */
-};
-
-/** Extracts the kernel description of the selected kernel by the GEMM backend heuristics
- *
- * @param[in] input_type        Data type of the input tensor.
- * @param[in] ci                CPU information.
- * @param[in] num_threads       Maximum number of threads that might be used for the calculations.
- * @param[in] p                 M, N, K sizes.
- * @param[in] activation        Activation struct
- * @param[in] pretranspose_hint Is B also pretransposed ?
- *
- * @return Kernel description that the assembly heuristics picked for the given configuration
- */
-arm_gemm::KernelDescription get_gemm_info(DataType                            input_type,
-                                          const CPUInfo                      &ci,
-                                          const unsigned int                  num_threads,
-                                          const INEGEMMWrapperKernel::Params &p,
-                                          arm_gemm::Activation                activation,
-                                          bool                                pretranspose_hint);
-
-/** Calculate the recommended block sizes to use based on the CPU cache sizes and the strategy which will be used
- *
- * @param[in] ci CPU information.
- * @param[in] M  M dimension.
- * @param[in] N  N dimension.
- * @param[in] K  K dimension.
- *
- * @return Recommeded block sizes to use for the given M, N, K dimensions.
- */
-template <typename strategy>
-BlockSizes calculate_block_sizes(const CPUInfo &ci, unsigned int M, unsigned int N, unsigned int K)
-{
-    BlockSizes bs;
-
-    using Toi = typename strategy::operand_type;
-
-    const unsigned int L1_size = ci.get_L1_cache_size();
-    const unsigned int L2_size = ci.get_L2_cache_size();
-
-    // Work out blocking parameters
-
-    // k_block: Find out how much of the larger array can be loaded into half the cache.
-    // This should account for associative caches.
-    bs.k_block = (L1_size / 2) / (sizeof(Toi) * (std::max(strategy::out_width(), strategy::out_height())));
-
-    // Needs to be (at least a single) multiple of the K unroll level.
-    bs.k_block /= strategy::k_unroll();
-    bs.k_block = std::max(bs.k_block, 1U) * strategy::k_unroll();
-
-    // Now tune to presented problem size; this is how many blocks we need.
-    int num_k_blocks = DIV_CEIL(K, bs.k_block);
-
-    // So divide the space equally into that many blocks.
-    bs.k_block = DIV_CEIL(K, num_k_blocks);
-
-    // And round UP to the K unroll level required.
-    bs.k_block = ceil_to_multiple(bs.k_block, strategy::k_unroll());
-
-    // x_block: Work out how many rows (of length k_block) will fit in the L2
-    // Don't allocate more than 90% of the L2 to allow for overheads, and subtract off the L1 contents.
-    bs.x_block = (((L2_size * 9) / 10) - (bs.k_block * sizeof(Toi) * (strategy::out_width() + strategy::out_height()))) / (sizeof(Toi) * bs.k_block);
-
-    // Needs to be (at least a single) multiple of the kernel output width.
-    bs.x_block /= strategy::out_width();
-    bs.x_block = std::max(bs.x_block, 1U) * strategy::out_width();
-
-    // And tune to the presented problem size.
-    int num_x_blocks = DIV_CEIL(N, bs.x_block);
-    bs.x_block       = DIV_CEIL(N, num_x_blocks);
-
-    bs.x_block = ceil_to_multiple(bs.x_block, strategy::out_width());
-
-    // Work out the rounded size of M - needed for some buffers.
-    bs.m_round             = ceil_to_multiple(M, strategy::out_height());
-    bs.strategy_out_height = strategy::out_height();
-
-    return bs;
-}
-
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_ASSEMBLY_HELPERS_H */
diff --git a/arm_compute/core/NEON/kernels/assembly/INEGEMMWrapperKernel.h b/arm_compute/core/NEON/kernels/assembly/INEGEMMWrapperKernel.h
deleted file mode 100644
index f152ab5f61..0000000000
--- a/arm_compute/core/NEON/kernels/assembly/INEGEMMWrapperKernel.h
+++ /dev/null
@@ -1,108 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_INEGEMMWRAPPERKERNEL_H
-#define ARM_COMPUTE_INEGEMMWRAPPERKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Common interface for all the arm_gemm Gemms
- */
-class INEGEMMWrapperKernel : public INEKernel
-{
-public:
-    /** Parameters defining the dimensions of the matrices being multiplied */
-    struct Params
-    {
-        unsigned int M{ 0 };       /**< Rows in output matrix C (and input matrix A). */
-        unsigned int N{ 0 };       /**< Columns in output matrix C (and input matrix B). */
-        unsigned int K{ 0 };       /**< Columns of input matrix A (= rows of input matrix B). */
-        unsigned int batches{ 0 }; /**< Number of "batched" GEMMs (unique A and C, shared B). */
-        unsigned int multis{ 0 };  /**< Number of "multi" GEMMs (unique A, B and C). */
-    };
-
-    static Params extract_parameters(const ITensor *a, const ITensor *b, const ITensor *c, const GEMMInfo &gemm_info);
-
-    /** Constructor */
-    INEGEMMWrapperKernel();
-    /** Prevent instances of this class from being copied */
-    INEGEMMWrapperKernel(const INEGEMMWrapperKernel &) = delete;
-    /** Prevent instances of this class from being copied */
-    INEGEMMWrapperKernel &operator=(const INEGEMMWrapperKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    INEGEMMWrapperKernel(INEGEMMWrapperKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    INEGEMMWrapperKernel &operator=(INEGEMMWrapperKernel &&) = default;
-    /** Initialise the kernel's input and output.
-     *
-     * @note The input and output tensor must have the same dimensions
-     *
-     * @param[in]  a         Input tensor (Matrix A)
-     * @param[in]  b         Input tensor (Matrix B)
-     * @param[out] c         Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0.
-     * @param[in]  alpha     Scalar multiplier to apply to AB matrix product.
-     * @param[in]  beta      Scalar multiplier to apply to input C matrix before adding product.
-     * @param[in]  gemm_info GEMM meta-data
-     */
-    void configure(const ITensor *a, const ITensor *b, ITensor *c, float alpha, float beta, const GEMMInfo &gemm_info);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-
-protected:
-    /** Called as part of configure() after _a, _b, _c and _params have been set.
-     *
-     * @param[in] alpha Scalar multiplier to apply to AB matrix product.
-     * @param[in] beta  Scalar multiplier to apply to input C matrix before adding product.
-     *
-     * @return A 3D execution window.
-     */
-    virtual Window configure_internal(float alpha, float beta) = 0;
-
-    /** Run the kernel from the start to the end offset in window.
-     *
-     * @param[in] window       Window to use for the iteration
-     * @param[in] start_offset Where to start iterating from (In Window coordinates)
-     * @param[in] end_offset   Where to stop iterating (In Window coordinates).
-     * @param[in] info         Info about executing thread and CPU.
-     */
-    virtual void run_internal(const Window &window, const Coordinates &start_offset, const Coordinates &end_offset, const ThreadInfo &info) = 0;
-
-    const ITensor *_a;
-    const ITensor *_b;
-    ITensor       *_c;
-    Params         _params;
-    GEMMInfo       _gemm_info;
-
-private:
-    Window      _window3d;
-    TensorShape _window_shape;
-};
-
-} // namespace arm_compute
-
-#endif /* ARM_COMPUTE_INEGEMMRAPPERKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/assembly/NEDepthwiseConvolutionAssemblyKernelWrapper.h b/arm_compute/core/NEON/kernels/assembly/NEDepthwiseConvolutionAssemblyKernelWrapper.h
deleted file mode 100644
index 8a9fb82b4a..0000000000
--- a/arm_compute/core/NEON/kernels/assembly/NEDepthwiseConvolutionAssemblyKernelWrapper.h
+++ /dev/null
@@ -1,88 +0,0 @@
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_ASSEMBLY_DEPTHWISE_CONVOLUTION_ASSEMBLY_WRAPPER_KERNEL_H
-#define ARM_COMPUTE_ASSEMBLY_DEPTHWISE_CONVOLUTION_ASSEMBLY_WRAPPER_KERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-
-#include "arm_compute/core/NEON/kernels/convolution/depthwise/depthwise.hpp"
-
-namespace arm_compute
-{
-// Forward declarations
-class ITensor;
-
-/** This class is a wrapper for the depthwise convolution assembly kernels.  */
-class NEDepthwiseConvolutionAssemblyKernelWrapper final : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEDepthwiseConvolutionAssemblyKernelWrapper";
-    }
-
-    /** Default constructor */
-    NEDepthwiseConvolutionAssemblyKernelWrapper()
-        : _kernel(nullptr)
-    {
-    }
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEDepthwiseConvolutionAssemblyKernelWrapper(const NEDepthwiseConvolutionAssemblyKernelWrapper &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEDepthwiseConvolutionAssemblyKernelWrapper &operator=(const NEDepthwiseConvolutionAssemblyKernelWrapper &) = delete;
-    /** Default Move Constructor. */
-    NEDepthwiseConvolutionAssemblyKernelWrapper(NEDepthwiseConvolutionAssemblyKernelWrapper &&) = default;
-    /** Default move assignment operator */
-    NEDepthwiseConvolutionAssemblyKernelWrapper &operator=(NEDepthwiseConvolutionAssemblyKernelWrapper &&) = default;
-
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in] kernel Pointer to an assembly kernel implementation.
-     */
-    void configure(depthwise::IDepthwiseConvolution *kernel)
-    {
-        ARM_COMPUTE_ERROR_ON_NULLPTR((reinterpret_cast<void *>(kernel)));
-        _kernel = kernel;
-        Window win;
-        win.set(Window::DimX, Window::Dimension(0, _kernel->get_window(), 1));
-        INEKernel::configure(win);
-    }
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override
-    {
-        ARM_COMPUTE_ERROR_ON_NULLPTR((reinterpret_cast<void *>(_kernel)));
-        ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-        auto first = window.x().start();
-        auto last  = window.x().end();
-        _kernel->run(first, last, info.thread_id);
-    }
-
-private:
-    depthwise::IDepthwiseConvolution *_kernel;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_ASSEMBLY_DEPTHWISE_CONVOLUTION_ASSEMBLY_WRAPPER_KERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/assembly/NEGEMMAssemblyWrapperKernel.h b/arm_compute/core/NEON/kernels/assembly/NEGEMMAssemblyWrapperKernel.h
deleted file mode 100644
index 0e3dd74577..0000000000
--- a/arm_compute/core/NEON/kernels/assembly/NEGEMMAssemblyWrapperKernel.h
+++ /dev/null
@@ -1,121 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_ASSEMBLY_GEMM_KERNEL_WRAPPER_KERNEL_H
-#define ARM_COMPUTE_ASSEMBLY_GEMM_KERNEL_WRAPPER_KERNEL_H
-
-#include "arm_compute/core/NEON/kernels/assembly/arm_gemm_compute_iface.hpp"
-#include "arm_compute/core/NEON/INEKernel.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-
-#include "gemm_common.hpp"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** This class is a wrapper for the assembly kernels.
-  *
-  * Some kernels were written in assembly and highly optimised for specific CPUs like A53 or A55.
-  * This class works as a wrapper for these assembly kernels. The arm compute library creates an instance
-  * of NEGEMMAssemblyWrapperKernel and other auxiliary data structures to execute a single assembly kernel
-  * in the context of an NEFunctions.
-  *
-  * The type T is the type of the actual kernel implemented in assembly which is of type
-  *         template<typename To, typename Tr> class GemmCommon
-  *
-  *
-  */
-template <typename TypeInput, typename TypeOutput>
-class NEGEMMAssemblyWrapperKernel final : public INEKernel
-{
-public:
-    /** Constructor
-     */
-    NEGEMMAssemblyWrapperKernel()
-        : _kernel(nullptr), _name("NEGEMMAssemblyWrapperKernel")
-    {
-    }
-
-    NEGEMMAssemblyWrapperKernel(NEGEMMAssemblyWrapperKernel &)  = delete;
-    NEGEMMAssemblyWrapperKernel(NEGEMMAssemblyWrapperKernel &&) = default;
-    NEGEMMAssemblyWrapperKernel &operator=(NEGEMMAssemblyWrapperKernel &) = delete;
-
-    const char *name() const override
-    {
-        return _name.c_str();
-    }
-
-
-    void run(const Window &window, const ThreadInfo &info) override
-    {
-        ARM_COMPUTE_ERROR_ON_NULLPTR((reinterpret_cast<void *>(_kernel)));
-        ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-
-        auto win=arm_gemm::to_ndcoord(window);
-
-        arm_gemm::ndcoord_t thread_locator { };
-
-        _kernel->execute(win, thread_locator, info.thread_id);
-    }
-
-    // Inherited methods overridden:
-    void run_nd(const Window &window, const ThreadInfo &info, const Window &thread_locator) override
-    {
-        ARM_COMPUTE_ERROR_ON_NULLPTR((reinterpret_cast<void *>(_kernel)));
-        ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-
-        //convert between arm_compute and arm_gemm types
-        auto ndc_win = arm_gemm::to_ndcoord(window);
-        auto ndc_tlc = arm_gemm::to_ndcoord(thread_locator);
-
-        _kernel->execute(ndc_win, ndc_tlc, info.thread_id);
-    }
-
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in] kernel      Pointer to an assembly kernel implementation.
-     * @param[in] num_threads Number of concurrent threads which will execute the kernel.
-     */
-    void configure(arm_gemm::GemmCommon<TypeInput, TypeOutput> *kernel, std::string kernel_name_tag)
-    {
-        ARM_COMPUTE_ERROR_ON_NULLPTR((reinterpret_cast<void *>(kernel)));
-        _kernel         = kernel;
-
-        Window win = to_window(kernel->get_window_size());
-
-        INEKernel::configure(win);
-
-        if(!kernel_name_tag.empty())
-        {
-            _name += "/" + kernel_name_tag;
-        }
-    }
-
-private:
-    arm_gemm::GemmCommon<TypeInput, TypeOutput> *_kernel;
-    std::string _name;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_ASSEMBLY_GEMM_KERNEL_WRAPPER_KERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/assembly/arm_gemm.hpp b/arm_compute/core/NEON/kernels/assembly/arm_gemm.hpp
deleted file mode 100644
index 7723224ec8..0000000000
--- a/arm_compute/core/NEON/kernels/assembly/arm_gemm.hpp
+++ /dev/null
@@ -1,176 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#include <memory>
-#include <cstring>
-
-#include "arm_gemm_local.hpp"
-#include "gemm_common.hpp"
-
-namespace arm_gemm {
-
-enum class GemmMethod
-{
-    DEFAULT,
-    GEMV_BATCHED,
-    GEMV_PRETRANSPOSED,
-    GEMV_NATIVE_TRANSPOSED,
-    GEMM_NATIVE,
-    GEMM_HYBRID,
-    GEMM_INTERLEAVED,
-    GEMM_INTERLEAVED_2D,
-    QUANTIZE_WRAPPER,
-    GEMM_HYBRID_QUANTIZED
-};
-
-struct KernelDescription
-{
-    GemmMethod   method      = GemmMethod::DEFAULT;
-    std::string  name        = "";
-    bool         is_default  = false;
-
-    KernelDescription(GemmMethod m, std::string n, bool d=false) : method(m), name(n), is_default(d) { }
-    KernelDescription() noexcept  { }
-};
-
-struct GemmConfig
-{
-    GemmMethod   method           = GemmMethod::DEFAULT;
-    std::string  filter           = "";
-    unsigned int inner_block_size = 0;
-    unsigned int outer_block_size = 0;
-
-    GemmConfig(GemmMethod method) : method(method) { }
-    GemmConfig() { }
-};
-
-struct Activation
-{
-    enum class Type {
-        None,
-        ReLU,
-        BoundedReLU
-    };
-
-    Type    type;
-    float   param1;
-    float   param2;
-
-    Activation(Type type=Type::None, float p1=0.0f, float p2=0.0f) : type(type), param1(p1), param2(p2) { }
-};
-
-struct GemmArgs
-{
-public:
-    const CPUInfo    *_ci;
-    unsigned int      _Msize;
-    unsigned int      _Nsize;
-    unsigned int      _Ksize;
-    unsigned int      _nbatches;
-    unsigned int      _nmulti;
-    bool              _trA;
-    bool              _trB;
-    Activation        _act;
-    int               _maxthreads;
-    bool              _pretransposed_hint;
-    const GemmConfig *_cfg;
-
-    GemmArgs(const CPUInfo *ci, const unsigned int M, const unsigned int N,
-             const unsigned int K, const unsigned int nbatches,
-             const unsigned int nmulti, const bool trA, const bool trB,
-             Activation act, const int maxthreads,
-             const bool pretransposed_hint, const GemmConfig *cfg=nullptr ) :
-             _ci(ci), _Msize(M), _Nsize(N), _Ksize(K), _nbatches(nbatches), _nmulti(nmulti),
-             _trA(trA), _trB(trB), _act(act), _maxthreads(maxthreads),
-             _pretransposed_hint(pretransposed_hint), _cfg(cfg)
-    {
-    }
-};
-
-struct Requantize32
-{
-public:
-    const int32_t  *bias = nullptr;
-    size_t          bias_multi_stride = 0;
-    int32_t         a_offset = 0;
-    int32_t         b_offset = 0;
-    int32_t         c_offset = 0;
-    bool            per_channel_requant = false;
-    int32_t         per_layer_shift = 0;
-    int32_t         per_layer_mul = 0;
-    const int32_t  *per_channel_shifts = nullptr;
-    const int32_t  *per_channel_muls = nullptr;
-    int32_t         minval = 0;
-    int32_t         maxval = 0;
-
-    Requantize32() = default;
-
-    // Constructor for per-tensor quantization
-    Requantize32(const int32_t *bias, size_t bias_multi_stride,
-                 int32_t a_offset, int32_t b_offset, int32_t c_offset,
-                 int32_t requant_shift, int32_t requant_mul,
-                 int32_t minv, int32_t maxv) :
-        bias(bias), bias_multi_stride(bias_multi_stride),
-        a_offset(a_offset), b_offset(b_offset), c_offset(c_offset),
-        per_channel_requant(false), per_layer_shift(requant_shift), per_layer_mul(requant_mul),
-        minval(minv), maxval(maxv)
-    {
-    }
-
-    // Constructor for per-channel quantization
-    Requantize32(const int32_t *bias, size_t bias_multi_stride,
-                 int32_t a_offset, int32_t b_offset, int32_t c_offset,
-                 const int32_t *requant_shifts, const int32_t *requant_muls,
-                 int32_t minv, int32_t maxv) :
-        bias(bias), bias_multi_stride(bias_multi_stride),
-        a_offset(a_offset), b_offset(b_offset), c_offset(c_offset),
-        per_channel_requant(true), per_channel_shifts(requant_shifts), per_channel_muls(requant_muls),
-        minval(minv), maxval(maxv)
-    {
-    }
-};
-
-struct Nothing
-{
-};
-
-template<typename Top, typename Tret>
-using UniqueGemmCommon = std::unique_ptr<GemmCommon<Top, Tret> >;
-
-/* Low level API calls.
- * These are implemented as 'GemmArgs' versions, or with the arguments explicitly listed. */
-
-/* get_gemm_method(): Given the templated types and provided parameters,
- * which is the preferred method to implement this GEMM?  */
-template<typename Top, typename Tret, class OutputStage = Nothing>
-KernelDescription get_gemm_method(const GemmArgs &args, const OutputStage & ={});
-
-template<typename Top, typename Tret, class OutputStage = Nothing>
-UniqueGemmCommon<Top, Tret> gemm(const GemmArgs &args, const OutputStage & ={});
-
-template<typename Top, typename Tret, class OutputStage = Nothing>
-std::vector<KernelDescription> get_compatible_kernels(const GemmArgs &args, const OutputStage & ={});
-
-} // namespace arm_gemm
diff --git a/arm_compute/core/NEON/kernels/assembly/arm_gemm_compute_iface.hpp b/arm_compute/core/NEON/kernels/assembly/arm_gemm_compute_iface.hpp
deleted file mode 100644
index 6f345c1721..0000000000
--- a/arm_compute/core/NEON/kernels/assembly/arm_gemm_compute_iface.hpp
+++ /dev/null
@@ -1,121 +0,0 @@
-/*
- * Copyright (c) 2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#include "arm_compute/core/Window.h"
-#include "arm_compute/core/Dimensions.h"
-#include "arm_compute/core/NEON/kernels/arm_gemm/ndrange.hpp"
-
-#include <cassert>
-
-/* This file contains mapping between integral types used in arm_compute and arm_gemm
- * These two codebases both require a degree of separation for the sake of modularity
- * so maintain their own types which represent similar information.
- */
-
-namespace arm_gemm {
-
-//we want to unify the maximum number of dimensions used beween arm_gemm and arm compute library
-constexpr std::size_t ndrange_max =
-    arm_compute::Dimensions<unsigned int>::num_max_dimensions;
-
-using ndrange_t=NDRange<ndrange_max>;
-using ndcoord_t=NDCoordinate<ndrange_max>;
-
-/* Converts an `arm_gemm::ndrange_t` to a `arm_compute::Window`
- *
- * As `NDRange<T>` does not not encode start positions, we specify
- * the start to be zero in the produced `arm_compute::Window`
- *
- * @param [ndr] the `arm_gemm::ndrange_t` we wish to convert into a `arm_compute::Window`
- * @returns an `arm_compute::Window` representing the same dimensional ranges as `ndr`
- */
-inline arm_compute::Window to_window(const ndrange_t& ndr) {
-    arm_compute::Window win;
-
-    for(unsigned int i = 0; i!=ndrange_max; ++i) {
-        //populate the window with the dimensions of the NDRange
-        win.set(i, arm_compute::Window::Dimension(0, ndr.get_size(i)));
-    }
-
-    return win;
-}
-
-/*
- * Converts an `arm_gemm::ndcoord_t` to a `arm_compute::Window`
- *
- * @param [ndc] the `arm_gemm::ndcoord_t` we wish to convert into a `arm_compute::Window`
- * @returns an `arm_compute::Window` representing the same dimensional ranges as `ndc`
- */
-inline arm_compute::Window to_window(const ndcoord_t& ndc) {
-    arm_compute::Window win;
-
-    for(unsigned int i = 0; i!=ndrange_max; ++i) {
-        const auto start = ndc.get_position(i);
-        const auto size  = ndc.get_size(i);
-        const auto stop  = start + size;
-
-        //populate the window with the dimensions of the NDRange
-        win.set(i, arm_compute::Window::Dimension(start, stop));
-    }
-
-    return win;
-}
-
-/** Convert an `arm_compute::Window` to an `arm_gemm::NDRange` of the same max dimensions
- *
- * It should be noted that `arm_compute::Window` specifies a `start()` and an `end()`
- * where as `arm_gemm::ndrange_t` only has a size, as a result we store the delta between the range
- *
- * @param [win] the `arm_compute::Window` we want to convert to `arm_gemm::ndrange_t`
- * @return the resultant ndrange_t
- */
-inline ndrange_t to_ndrange(const arm_compute::Window& win) {
-    return {
-        static_cast<unsigned int>(win[0].end() - win[0].start()),
-        static_cast<unsigned int>(win[1].end() - win[1].start()),
-        static_cast<unsigned int>(win[2].end() - win[2].start()),
-        static_cast<unsigned int>(win[3].end() - win[3].start()),
-        static_cast<unsigned int>(win[4].end() - win[4].start()),
-        static_cast<unsigned int>(win[5].end() - win[5].start())
-    };
-}
-
-/** Convert an `arm_compute::Window` to an `arm_gemm::NDCoord` of the same max dimensions
- *
- * @param [win] the `arm_compute::Window` we want to convert to `arm_gemm::ndcoord_t`
- * @return the resultant ndcoord_t
- */
-inline ndcoord_t to_ndcoord(const arm_compute::Window& win) {
-    return {
-        { static_cast<unsigned int>(win[0].start()), static_cast<unsigned int>(win[0].end() - win[0].start()) },
-        { static_cast<unsigned int>(win[1].start()), static_cast<unsigned int>(win[1].end() - win[1].start()) },
-        { static_cast<unsigned int>(win[2].start()), static_cast<unsigned int>(win[2].end() - win[2].start()) },
-        { static_cast<unsigned int>(win[3].start()), static_cast<unsigned int>(win[3].end() - win[3].start()) },
-        { static_cast<unsigned int>(win[4].start()), static_cast<unsigned int>(win[4].end() - win[4].start()) },
-        { static_cast<unsigned int>(win[5].start()), static_cast<unsigned int>(win[5].end() - win[5].start()) }
-    };
-}
-
-} //namespace arm_gemm
diff --git a/arm_compute/core/NEON/kernels/assembly/arm_gemm_local.hpp b/arm_compute/core/NEON/kernels/assembly/arm_gemm_local.hpp
deleted file mode 100644
index 8d3db4adf2..0000000000
--- a/arm_compute/core/NEON/kernels/assembly/arm_gemm_local.hpp
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Copyright (c) 2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-/* This file is used to configure integration-specific aspects of arm_gemm into ACL */
-
-#include "arm_compute/core/CPP/CPPTypes.h"
-
-namespace arm_gemm
-{
-using CPUModel = arm_compute::CPUModel;
-using CPUInfo  = arm_compute::CPUInfo;
-} // namespace arm_compute
-
-
-
diff --git a/arm_compute/core/NEON/kernels/assembly/gemm_common.hpp b/arm_compute/core/NEON/kernels/assembly/gemm_common.hpp
deleted file mode 100644
index ea9b524e15..0000000000
--- a/arm_compute/core/NEON/kernels/assembly/gemm_common.hpp
+++ /dev/null
@@ -1,201 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#include "arm_compute/core/NEON/kernels/assembly/arm_gemm_compute_iface.hpp"
-
-#include <cstddef>
-#include <cassert>
-
-#define UNUSED(x)   (void)(x)
-
-namespace arm_gemm {
-
-// Abstract class for the GEMM/GEMV functions.
-//
-// GEMM implementations may be "native" (never require any input
-// permutation), "pretransposed" (require permutation up-front) or require
-// working space (permute as they go along).  This interface should support
-// all of them.
-
-// The real GemmCommon class is templated based on the operand and return
-// type.  This is an interface class which is independent of those types.
-class IGemmCommon {
-public:
-    /* Pass in the pointers to the arrays to be operated on and their
-     * strides.  This "generic" version uses void *s, the preferred version
-     * is the one provided by templated GemmCommon (below) which takes
-     * appropriately typed pointers.  If B is pretransposed (see below) then
-     * the settings for B here are ignored.
-     */
-    virtual void set_arrays_generic(const void *A, const int lda, const int A_batch_stride, const int A_multi_stride,
-                                    const void *B, const int ldb, /* batches share B */     const int B_multi_stride,
-                                          void *C, const int ldc, const int C_batch_stride, const int C_multi_stride,
-                                    const void *bias, /* no row or batch stride needed */   const int bias_multi_stride) = 0;
-
-    /** @returns an ndrange containing ranges of the compute space which can be
-     * broken up and parallelised over
-     */
-    virtual ndrange_t get_window_size() const = 0;
-
-    /* The maximum thread count is specified when the GEMM is created.  Some
-     * implementations need to know how many threads will actually run in
-     * order to work properly.
-     *
-     * In some cases, after creating the GEMM the number of threads needs to
-     * be reduced (e.g. not enough work to split across threads).  This
-     * method allows the number of actual threads to be run to be set (must
-     * be equal or lower).
-     *
-     * This has an empty default implementation, as GEMMs which don't care
-     * about thread count can safely ignore this.
-     */
-    virtual void set_nthreads(int) { };
-
-    /* Whether this GEMM can be dynamically scheduled or not. */
-    virtual bool supports_dynamic_scheduling() const { return false; }
-
-    /** Main execute member fucntion
-     * @param [in] work_range     specifies the range of work we want to be computed, total range defined by get_window_size()
-     * @param [in] thread_locator where are we inside of the thread space
-     * @naram [in] threadid       a unique threadid
-     */
-    virtual void execute(const ndcoord_t& work_range, const ndcoord_t& thread_locator, int threadid) = 0;
-
-    /*** Working space interface (optional) ***/
-    /* Total number of bytes of temporary working space needed.  If zero, it's not necessary to call set_working_space(). */
-    virtual size_t get_working_size() const { return 0; }
-    /* Provide working space buffer - the void * passed in must remain allocated for the duration of any execute calls. */
-    virtual void set_working_space(void *) { };
-
-    /*** "Pretransposed" interface (optional) ***/
-    /* Is this object set up for pretranspose?  If so, pretranspose_array() needs to be called before execute(); */
-    virtual bool B_is_pretransposed() const { return false; }
-    /* Does pretranspose still need to be done? */
-    virtual bool B_pretranspose_required() const { return false; }
-    /* Total number of bytes of space needed for pretransposed arrays. */
-    virtual size_t get_B_pretransposed_array_size() const { return 0; }
-    /* Perform pretranspose - arguments are output, input, input row stride and input multi stride. */
-    /* The "real" version of this depends on the templated operand type (see below).  */
-    virtual void pretranspose_B_array_generic(void *, const void *, const int, const int) = 0;
-    /* Set pretransposed data - the void * passed in must previously have been passed to pretranspose_B_array() for the same or a similar GEMM. */
-    virtual void set_pretransposed_B_data(void *) { }
-
-    /*** "Quantized bias" interface (optional) ***/
-    /* Set the bias vector for quantized GEMMs */
-    virtual void set_quantized_bias(const int32_t *bias, size_t bias_multi_stride)
-    {
-        UNUSED(bias);
-        UNUSED(bias_multi_stride);
-    }
-
-    // Destructor
-    virtual ~IGemmCommon() { }
-};
-
-/* "Real" GemmCommon class which is templated on the operand and return types.
- *
- * In addition to correctly typed versions of the functions that operate on
- * operand and return data, this class provides a default implementation of
- * 'set_arrays' to capture the provided arguments in protected class
- * members, as essentially any implementation will need these.
- */
-template<typename To, typename Tr>
-class GemmCommon : public IGemmCommon {
-protected:
-    const To *_Aptr=nullptr;
-    int _lda=0;
-    int _A_batch_stride=0;
-    int _A_multi_stride=0;
-    const To *_Bptr=nullptr;
-    int _ldb=0;
-    int _B_multi_stride=0;
-    Tr *_Cptr=nullptr;
-    int _ldc=0;
-    int _C_batch_stride=0;
-    int _C_multi_stride=0;
-    const Tr *_bias=nullptr;
-    int _bias_multi_stride=0;
-
-public:
-    /* Pass in the pointers to the arrays to be operated on and their
-     * strides (templated version with appropriate types). */
-    virtual void set_arrays(const To *A, const int lda, const int A_batch_stride, const int A_multi_stride,
-                            const To *B, const int ldb, /* batches share B */     const int B_multi_stride,
-                                  Tr *C, const int ldc, const int C_batch_stride, const int C_multi_stride,
-                            const Tr *bias, /* no row or batch stride needed */   const int bias_multi_stride) {
-        _Aptr = A;
-        _lda = lda;
-        _A_batch_stride = A_batch_stride;
-        _A_multi_stride = A_multi_stride;
-        _Bptr = B;
-        _ldb = ldb;
-        _B_multi_stride = B_multi_stride;
-        _Cptr = C;
-        _ldc = ldc;
-        _C_batch_stride = C_batch_stride;
-        _C_multi_stride = C_multi_stride;
-        _bias = bias;
-        _bias_multi_stride = bias_multi_stride;
-    }
-
-    /* Implementation of the void * overload which casts its arguments to the appropriate type. */
-    void set_arrays_generic(const void *A, const int lda, const int A_batch_stride, const int A_multi_stride,
-                            const void *B, const int ldb, /* batches share B */     const int B_multi_stride,
-                                  void *C, const int ldc, const int C_batch_stride, const int C_multi_stride,
-                            const void *bias, /* no row or batch stride needed */   const int bias_multi_stride) override {
-        set_arrays(static_cast<const To *>(A), lda, A_batch_stride, A_multi_stride,
-                   static_cast<const To *>(B), ldb, B_multi_stride,
-                   static_cast<Tr *>(C), ldc, C_batch_stride, C_multi_stride,
-                   static_cast<const Tr *>(bias), bias_multi_stride);
-    }
-
-    /*** "Pretransposed" interface ***/
-
-    /* Perform pretranspose - the void * passed in must remain allocated for the duration of any execute calls. */
-    /* Arguments are: output buffer pointer, source pointer, source row stride, source multi stride */
-    virtual void pretranspose_B_array(void *, const To *, const int, const int) { };
-
-    /* Implementation of the void * overload which casts its arguments to the appropriate type. */
-    void pretranspose_B_array_generic(void *out, const void *in, const int row_stride, const int multi_stride) override {
-        pretranspose_B_array(out, static_cast<const To *>(in), row_stride, multi_stride);
-    }
-};
-
-template<typename GemmKernel>
-inline
-int unsigned get_total_window_size(const GemmKernel& kernel)
-{
-    auto window=kernel.get_window_size();
-
-    unsigned int total = 1;
-    for(unsigned i = 0; i != arm_gemm::ndrange_max; ++i)
-    {
-        total *= window.get_size(i);
-    }
-
-    return total;
-}
-
-} // namespace arm_gemm
diff --git a/arm_compute/core/NEON/kernels/convolution/common/activation.hpp b/arm_compute/core/NEON/kernels/convolution/common/activation.hpp
deleted file mode 100644
index 091b1652c9..0000000000
--- a/arm_compute/core/NEON/kernels/convolution/common/activation.hpp
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#pragma once
-
-namespace neon_convolution_kernels
-{
-
-enum class ActivationFunction
-{
-  None,
-  ReLU,
-  ReLU6,
-};
-
-}
diff --git a/arm_compute/core/NEON/kernels/convolution/common/alloc.hpp b/arm_compute/core/NEON/kernels/convolution/common/alloc.hpp
deleted file mode 100644
index 799e95d3e6..0000000000
--- a/arm_compute/core/NEON/kernels/convolution/common/alloc.hpp
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#pragma once
-
-#ifdef ALLOC_ALIGN
-#define ALLOCATE(x) aligned_alloc(ALLOC_ALIGN, x)
-#else
-#define ALLOCATE(x) malloc(x)
-#endif
diff --git a/arm_compute/core/NEON/kernels/convolution/common/arm.hpp b/arm_compute/core/NEON/kernels/convolution/common/arm.hpp
deleted file mode 100644
index 90e7828553..0000000000
--- a/arm_compute/core/NEON/kernels/convolution/common/arm.hpp
+++ /dev/null
@@ -1,39 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-/** Sets the macro __arm_any__ if compiling for Aarch32 or Aarch64.
- *  Includes `arm_neon.h` if compiling for either architecture.
- */
-
-#ifdef __arm__
-#define __arm_any__
-#endif  // __arm__
-
-#ifdef __aarch64__
-#define __arm_any__
-#endif  // __aarch64__
-
-#ifdef __arm_any__
-#include <arm_neon.h>
-#endif  // __arm_any__
diff --git a/arm_compute/core/NEON/kernels/convolution/common/convolution.hpp b/arm_compute/core/NEON/kernels/convolution/common/convolution.hpp
deleted file mode 100644
index 2ab2597785..0000000000
--- a/arm_compute/core/NEON/kernels/convolution/common/convolution.hpp
+++ /dev/null
@@ -1,29 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#pragma once
-
-enum PaddingType {
-  PADDING_SAME, PADDING_VALID
-};
diff --git a/arm_compute/core/NEON/kernels/convolution/common/padding.hpp b/arm_compute/core/NEON/kernels/convolution/common/padding.hpp
deleted file mode 100644
index 97b21e0ff5..0000000000
--- a/arm_compute/core/NEON/kernels/convolution/common/padding.hpp
+++ /dev/null
@@ -1,91 +0,0 @@
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#pragma once
-
-#include <cstddef>
-
-// Utilities for copying tensor tiles and adding/removing padding.
-namespace padding
-{
-
-/* Copy a tile and apply padding to the output copy.
- */
-template <typename T>
-void copy_and_pad_tile(
-  unsigned int tile_rows,
-  unsigned int tile_cols,
-  unsigned int n_channels,
-  const T *inptr,
-  unsigned int in_row_stride,
-  unsigned int in_col_stride,
-  T* outptr,
-  unsigned int out_row_stride,
-  unsigned int out_col_stride,
-  unsigned int pad_top,
-  unsigned int pad_left,
-  unsigned int pad_bottom,
-  unsigned int pad_right,
-  T pad_value=static_cast<T>(0)
-);
-
-/** Copy a tile and remove padding elements in the output.
- */
-template <unsigned int TileRows, unsigned int TileCols>
-class CopyCropped
-{
-  public:
-    static void execute(
-      size_t size,  // Amount of data to copy
-      const void *inptr,
-      size_t in_row_stride,
-      size_t in_col_stride,
-      void *outptr,
-      size_t out_row_stride,
-      size_t out_col_stride,
-      unsigned int pad_top,
-      unsigned int pad_left,
-      unsigned int pad_bottom,
-      unsigned int pad_right
-    );
-};
-
-template <typename T>
-void crop_and_copy_tile(
-  unsigned int tile_rows,
-  unsigned int tile_cols,
-  unsigned int n_channels,
-  const T *inptr,
-  unsigned int in_row_stride,
-  unsigned int in_col_stride,
-  T *outptr,
-  unsigned int out_row_stride,
-  unsigned int out_col_stride,
-  unsigned int crop_top,
-  unsigned int crop_left,
-  unsigned int crop_bottom,
-  unsigned int crop_right
-);
-
-}
diff --git a/arm_compute/core/NEON/kernels/convolution/common/perf.h b/arm_compute/core/NEON/kernels/convolution/common/perf.h
deleted file mode 100644
index 3c0d36646d..0000000000
--- a/arm_compute/core/NEON/kernels/convolution/common/perf.h
+++ /dev/null
@@ -1,32 +0,0 @@
-/*
- * Copyright (c) 2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-/* Prototypes from perf.c */
-
-void start_counter(int fd);
-long long get_counter(int fd);
-long long stop_counter(int fd);
-int open_instruction_counter(void);
-int open_cycle_counter(void);
diff --git a/arm_compute/core/NEON/kernels/convolution/common/qasymm8.hpp b/arm_compute/core/NEON/kernels/convolution/common/qasymm8.hpp
deleted file mode 100644
index 6029cb67e3..0000000000
--- a/arm_compute/core/NEON/kernels/convolution/common/qasymm8.hpp
+++ /dev/null
@@ -1,54 +0,0 @@
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#pragma once
-#include <cstdint>
-
-namespace qasymm8
-{
-
-struct QAsymm8Params
-{
-  uint8_t quantize(float value) const;
-  float dequantize(uint8_t value) const;
-
-  uint8_t offset;
-  float scale;
-};
-
-struct QAsymm8RescaleParams
-{
-  static QAsymm8RescaleParams make_rescale_params(
-    const QAsymm8Params& weight_quant,
-    const QAsymm8Params& input_quant,
-    const QAsymm8Params& output_quant
-  );
-
-  QAsymm8RescaleParams(int32_t shift, int32_t multiplier, float rescale);
-
-  const int32_t shift, multiplier;
-  const float rescale;
-};
-
-}
diff --git a/arm_compute/core/NEON/kernels/convolution/common/qsymm8.hpp b/arm_compute/core/NEON/kernels/convolution/common/qsymm8.hpp
deleted file mode 100644
index 41bfbe4d8a..0000000000
--- a/arm_compute/core/NEON/kernels/convolution/common/qsymm8.hpp
+++ /dev/null
@@ -1,76 +0,0 @@
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#pragma once
-#include <cstdint>
-#include <vector>
-#include "qasymm8.hpp"
-
-
-namespace qsymm8 {
-
-struct QSymm8Params {
-  int8_t quantize(float value) const;
-  float dequantize(int8_t value) const;
-
-  float scale;
-};
-
-struct QSymm8RescaleParams {
-  static QSymm8RescaleParams
-  make_rescale_params(const QSymm8Params &weight_quant,
-                      const QSymm8Params &input_quant,
-                      const QSymm8Params &output_quant);
-
-  QSymm8RescaleParams(int32_t shift, int32_t multiplier, float rescale);
-
-  const int32_t shift, multiplier;
-  const float rescale;
-};
-
-struct QSymm8PerChannelParams {
-  int8_t quantize(float value, float scale) const;
-  float dequantize(int8_t value, float scale) const;
-
-  std::vector<float> scales;
-};
-
-struct QSymm8PerChannelRescaleParams {
-  static QSymm8PerChannelRescaleParams
-  make_rescale_params(const QSymm8PerChannelParams &weight_quant,
-                      const QSymm8PerChannelParams &input_quant,
-                      const QSymm8PerChannelParams &output_quant);
-
-  static QSymm8PerChannelRescaleParams
-  make_rescale_params(const QSymm8PerChannelParams &weight_quant,
-                      const qasymm8::QAsymm8Params &input_quant,
-                      const qasymm8::QAsymm8Params &output_quant);
-
-  QSymm8PerChannelRescaleParams(std::vector<int32_t>& shift, std::vector<int32_t>& multiplier, std::vector<float>& rescale);
-
-  std::vector<int32_t>  shifts, multipliers;
-  std::vector<float> rescales;
-};
-
-} // namespace qsymm8
diff --git a/arm_compute/core/NEON/kernels/convolution/common/shims.hpp b/arm_compute/core/NEON/kernels/convolution/common/shims.hpp
deleted file mode 100644
index 243d305e19..0000000000
--- a/arm_compute/core/NEON/kernels/convolution/common/shims.hpp
+++ /dev/null
@@ -1,749 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#pragma once
-#ifndef DOXYGEN_SKIP_THIS
-#include <cstdint>
-#endif /* DOXYGEN_SKIP_THIS */
-#include "arm.hpp"
-
-namespace reorder {
-/** Re-order a tensor from NCHW format to NHWC.
- *
- * @note The stride parameters are optional and are provided to allow padding in either input or output tensors.
- *
- * @param[in] in Input tensor in NCHW format.
- * @param[out] out Output tensor, to be written in NHWC format.
- * @param n_batches Number of batches in the tensors.
- * @param n_channels Number of channels in the tensors
- * @param n_rows Height of the tensor
- * @param n_cols Width of the tensor
- * @param in_batch_stride Stride over batches in the input tensor. If `0` defaults to `n_channels * in_channel_stride`.
- * @param in_channel_stride Stride over channels in the input tensor. If `0` defaults to `n_rows * in_row_stride`.
- * @param in_row_stride Stride over rows in the input tensor. If `0` defaults to `n_cols`.
- * @param out_batch_stride Stride over batches in the output tensor. If `0` defaults to `n_rows * out_row_stride`.
- * @param out_row_stride Stride over rows in the output tensor. If `0` defaults to `n_cols * out_col_stride`.
- * @param out_col_stride Stride over columns in the output tensor. If `0` defaults to `n_channels`.
- */
-template <typename T>
-inline void nchw_to_nhwc(
-  const T* const in,
-  T* const out,
-  const int n_batches,
-  const int n_channels,
-  const int n_rows,
-  const int n_cols,
-  int in_batch_stride=0,
-  int in_channel_stride=0,
-  int in_row_stride=0,
-  int out_batch_stride=0,
-  int out_row_stride=0,
-  int out_col_stride=0
-);
-
-/** Re-order a tensor from NHWC format to NCHW.
- *
- * @note The stride parameters are optional and are provided to allow padding in either input or output tensors.
- *
- * @param[in] in Input tensor in NHWC format.
- * @param[out] out Output tensor, to be written in NCHW format.
- * @param n_batches Number of batches in the tensors.
- * @param n_rows Height of the tensor
- * @param n_cols Width of the tensor
- * @param n_channels Number of channels in the tensors
- * @param in_batch_stride Stride over batches in the input tensor. If `0` defaults to `n_rows * in_row_stride`.
- * @param in_row_stride Stride over rows in the input tensor. If `0` defaults to `n_cols * in_col_stride`.
- * @param in_col_stride Stride over columns in the input tensor. If `0` defaults to `n_channels`.
- * @param out_batch_stride Stride over batches in the output tensor. If `0` defaults to `n_channels * out_channel_stride`.
- * @param out_channel_stride Stride over channels in the output tensor. If `0` defaults to `n_rows * out_row_stride`.
- * @param out_row_stride Stride over rows in the output tensor. If `0` defaults to `n_cols`.
- */
-template <typename T>
-inline void nhwc_to_nchw(
-  const T* const in,  // Input data in NHWC form
-  T* const out,       // Output data in NCHW form
-  const int n_batches,
-  const int n_rows,
-  const int n_cols,
-  const int n_channels,
-  int in_batch_stride=0,
-  int in_row_stride=0,
-  int in_col_stride=0,
-  int out_batch_stride=0,
-  int out_channel_stride=0,
-  int out_row_stride=0
-);
-
-/** Re-order a weight tensor from [Output feature map x Input feature map x
- *  Height x Width] format to [Height x Width x Input feature map x Output
- *  feature map] format.
- */
-template <typename T>
-inline void ofm_ifm_h_w_to_h_w_ifm_ofm(
-  const T* const in,  // Input in [Output x Input x Height x Width] form
-  T* const out,       // Output in [Height x Width x Input x Output] form
-  const int n_output_feature_maps,
-  const int n_input_feature_maps,
-  const int n_rows,
-  const int n_cols,
-  int in_output_feature_map_stride=0,
-  int in_input_feature_map_stride=0,
-  int in_row_stride=0,
-  int out_row_stride=0,
-  int out_col_stride=0,
-  int out_input_feature_map_stride=0
-);
-
-/** Re-order a weight tensor from [Height x Width x Input feature map x Output
- *  feature map] format to [Output feature map x Input feature map x Height x
- *  Width] format.
- */
-template <typename T>
-inline void h_w_ifm_ofm_to_ofm_ifm_h_w(
-  const T* const in,  // Input in [Height x Width x Input x Output] form
-  T* const out,       // Output in [Output x Input x Height x Width] form
-  const int n_rows,
-  const int n_cols,
-  const int n_input_feature_maps,
-  const int n_output_feature_maps,
-  int in_row_stride=0,
-  int in_col_stride=0,
-  int in_input_feature_map_stride=0,
-  int out_output_feature_map_stride=0,
-  int out_input_feature_map_stride=0,
-  int out_row_stride=0
-);
-
-/*****************************************************************************/
-/* 32-bit implementation : NCHW -> NHWC
- */
-template <>
-inline void nchw_to_nhwc(
-  const int32_t* const in,
-  int32_t* const out,
-  const int n_batches,
-  const int n_channels,
-  const int n_rows,
-  const int n_cols,
-  int in_batch_stride,
-  int in_channel_stride,
-  int in_row_stride,
-  int out_batch_stride,
-  int out_row_stride,
-  int out_col_stride
-)
-{
-  typedef int32_t T;
-
-  // Fill in the stride values
-  in_row_stride = (in_row_stride) ? in_row_stride : n_cols;
-  in_channel_stride = (in_channel_stride) ? in_channel_stride
-                                          : n_rows * in_row_stride;
-  in_batch_stride = (in_batch_stride) ? in_batch_stride
-                                      : n_channels * in_channel_stride;
-
-  out_col_stride = (out_col_stride) ? out_col_stride : n_channels;
-  out_row_stride = (out_row_stride) ? out_row_stride : n_cols * out_col_stride;
-  out_batch_stride = (out_batch_stride) ? out_batch_stride
-                                        : n_rows * out_row_stride;
-
-  // Perform the re-ordering
-  for (int n = 0; n < n_batches; n++)
-  {
-    const T* const in_batch = in + n*in_batch_stride;
-    T* const out_batch = out + n*out_batch_stride;
-
-    for (int i = 0; i < n_rows; i++)
-    {
-      const T* const in_row = in_batch + i*in_row_stride;
-      T* const out_row = out_batch + i*out_row_stride;
-
-      int j = 0, j_remaining = n_cols;
-#ifdef __arm_any__
-      for (; j_remaining >= 4; j += 4, j_remaining -= 4)
-      {
-        int c = 0, c_remaining = n_channels;
-        for (; c_remaining >= 4; c += 4, c_remaining -= 4)
-        {
-          // Read 4 channels worth of 4 columns, then zip to produce 4 columns
-          // worth of 4 channels.
-          int32x4_t channel_pixels[4];
-          channel_pixels[0] = vld1q_s32(in_row + (c + 0)*in_channel_stride + j);
-          channel_pixels[1] = vld1q_s32(in_row + (c + 1)*in_channel_stride + j);
-          channel_pixels[2] = vld1q_s32(in_row + (c + 2)*in_channel_stride + j);
-          channel_pixels[3] = vld1q_s32(in_row + (c + 3)*in_channel_stride + j);
-
-          const auto zip1 = vzipq_s32(channel_pixels[0], channel_pixels[2]);
-          const auto zip2 = vzipq_s32(channel_pixels[1], channel_pixels[3]);
-          const auto out_0 = vzipq_s32(zip1.val[0], zip2.val[0]);
-          const auto out_1 = vzipq_s32(zip1.val[1], zip2.val[1]);
-
-          vst1q_s32(out_row + (j + 0)*out_col_stride + c, out_0.val[0]);
-          vst1q_s32(out_row + (j + 1)*out_col_stride + c, out_0.val[1]);
-          vst1q_s32(out_row + (j + 2)*out_col_stride + c, out_1.val[0]);
-          vst1q_s32(out_row + (j + 3)*out_col_stride + c, out_1.val[1]);
-        }
-        for (; c_remaining; c++, c_remaining--)
-        {
-          for (int _j = 0; _j < 4; _j++)
-          {
-            const T* const in_col = in_row + j + _j;
-            T* const out_col = out_row + (j + _j)*out_col_stride;
-            const T* const in_channel = in_col + c*in_channel_stride;
-            out_col[c] = *(in_channel);
-          }
-        }
-      }
-      for (; j_remaining >= 2; j += 2, j_remaining -= 2)
-      {
-        int c = 0, c_remaining = n_channels;
-        for (; c_remaining >= 2; c += 2, c_remaining -= 2)
-        {
-          // Read 2 channels worth of 2 columns, then zip to produce 2 columns
-          // worth of 2 channels.
-          int32x2_t channel_pixels[2];
-          channel_pixels[0] = vld1_s32(in_row + (c + 0)*in_channel_stride + j);
-          channel_pixels[1] = vld1_s32(in_row + (c + 1)*in_channel_stride + j);
-
-          const auto output = vzip_s32(channel_pixels[0], channel_pixels[1]);
-
-          vst1_s32(out_row + (j + 0)*out_col_stride + c, output.val[0]);
-          vst1_s32(out_row + (j + 1)*out_col_stride + c, output.val[1]);
-        }
-        for (; c_remaining; c++, c_remaining--)
-        {
-          for (int _j = 0; _j < 2; _j++)
-          {
-            const T* const in_col = in_row + j + _j;
-            T* const out_col = out_row + (j + _j)*out_col_stride;
-            const T* const in_channel = in_col + c*in_channel_stride;
-            out_col[c] = *(in_channel);
-          }
-        }
-      }
-#endif  // __arm_any__
-      for (; j_remaining; j++, j_remaining--)
-      {
-        const T* const in_col = in_row + j;
-        T* const out_col = out_row + j*out_col_stride;
-
-        for (int c = 0; c < n_channels; c++)
-        {
-          const T* const in_channel = in_col + c*in_channel_stride;
-          out_col[c] = *(in_channel);
-        }
-      }
-    }
-  }
-}
-
-template <>
-inline void nchw_to_nhwc(
-  const uint32_t* const in,
-  uint32_t* const out,
-  const int n_batches,
-  const int n_channels,
-  const int n_rows,
-  const int n_cols,
-  int in_batch_stride,
-  int in_channel_stride,
-  int in_row_stride,
-  int out_batch_stride,
-  int out_row_stride,
-  int out_col_stride
-)
-{
-  nchw_to_nhwc(
-    reinterpret_cast<const int32_t*>(in),
-    reinterpret_cast<int32_t*>(out),
-    n_batches, n_channels, n_rows, n_cols,
-    in_batch_stride, in_channel_stride, in_row_stride,
-    out_batch_stride, out_row_stride, out_col_stride
-  );
-}
-
-template <>
-inline void nchw_to_nhwc(
-  const float* const in,
-  float* const out,
-  const int n_batches,
-  const int n_channels,
-  const int n_rows,
-  const int n_cols,
-  int in_batch_stride,
-  int in_channel_stride,
-  int in_row_stride,
-  int out_batch_stride,
-  int out_row_stride,
-  int out_col_stride
-)
-{
-  nchw_to_nhwc(
-    reinterpret_cast<const int32_t*>(in),
-    reinterpret_cast<int32_t*>(out),
-    n_batches, n_channels, n_rows, n_cols,
-    in_batch_stride, in_channel_stride, in_row_stride,
-    out_batch_stride, out_row_stride, out_col_stride
-  );
-}
-
-/*****************************************************************************/
-/* Generic implementation : NCHW -> NHWC
- */
-template <typename T>
-inline void nchw_to_nhwc(
-  const T* const in,
-  T* const out,
-  const int n_batches,
-  const int n_channels,
-  const int n_rows,
-  const int n_cols,
-  int in_batch_stride,
-  int in_channel_stride,
-  int in_row_stride,
-  int out_batch_stride,
-  int out_row_stride,
-  int out_col_stride
-)
-{
-  // Fill in the stride values
-  in_row_stride = (in_row_stride) ? in_row_stride : n_cols;
-  in_channel_stride = (in_channel_stride) ? in_channel_stride
-                                          : n_rows * in_row_stride;
-  in_batch_stride = (in_batch_stride) ? in_batch_stride
-                                      : n_channels * in_channel_stride;
-
-  out_col_stride = (out_col_stride) ? out_col_stride : n_channels;
-  out_row_stride = (out_row_stride) ? out_row_stride : n_cols * out_col_stride;
-  out_batch_stride = (out_batch_stride) ? out_batch_stride
-                                        : n_rows * out_row_stride;
-
-  // Perform the re-ordering
-  for (int n = 0; n < n_batches; n++)
-  {
-    const T* const in_batch = in + n*in_batch_stride;
-    T* const out_batch = out + n*out_batch_stride;
-
-    for (int i = 0; i < n_rows; i++)
-    {
-      const T* const in_row = in_batch + i*in_row_stride;
-      T* const out_row = out_batch + i*out_row_stride;
-
-      for (int j = 0; j < n_cols; j++)
-      {
-        const T* const in_col = in_row + j;
-        T* const out_col = out_row + j*out_col_stride;
-
-        for (int c = 0; c < n_channels; c++)
-        {
-          const T* const in_channel = in_col + c*in_channel_stride;
-          out_col[c] = *(in_channel);
-        }
-      }
-    }
-  }
-}
-
-/*****************************************************************************/
-/* 32-bit implementation : NHWC -> NCHW
- */
-template <>
-inline void nhwc_to_nchw(
-  const int32_t* const in,  // Input data in NHWC form
-  int32_t* const out,       // Output data in NCHW form
-  const int n_batches,
-  const int n_rows,
-  const int n_cols,
-  const int n_channels,
-  int in_batch_stride,
-  int in_row_stride,
-  int in_col_stride,
-  int out_batch_stride,
-  int out_channel_stride,
-  int out_row_stride
-)
-{
-  typedef int32_t T;
-
-  // Fill in stride values
-  in_col_stride = (in_col_stride) ? in_col_stride : n_channels;
-  in_row_stride = (in_row_stride) ? in_row_stride : n_cols * in_col_stride;
-  in_batch_stride = (in_batch_stride) ? in_batch_stride
-                                      : n_rows * in_row_stride;
-
-  out_row_stride = (out_row_stride) ? out_row_stride : n_cols;
-  out_channel_stride = (out_channel_stride) ? out_channel_stride
-                                            : n_rows * out_row_stride;
-  out_batch_stride = (out_batch_stride) ? out_batch_stride
-                                        : n_channels * out_channel_stride;
-
-  // Perform the re-ordering
-  // For every batch
-  for (int n = 0; n < n_batches; n++)
-  {
-    const T* const in_batch = in + n*in_batch_stride;
-    T* const out_batch = out + n*out_batch_stride;
-
-    // For every row
-    for (int i = 0; i < n_rows; i++)
-    {
-      const T* const in_i = in_batch + i*in_row_stride;
-      T* const out_i = out_batch + i*out_row_stride;
-
-      // For every column, beginning with chunks of 4
-      int j = 0, j_remaining = n_cols;
-#ifdef __arm_any__
-      for (; j_remaining >= 4; j += 4, j_remaining -=4)
-      {
-        // For every channel, beginning with chunks of 4
-        int c = 0, c_remaining = n_channels;
-        for (; c_remaining >= 4; c += 4, c_remaining -= 4)
-        {
-          // Read 4 columns worth of 4 channels then zip to produce 4 channels
-          // worth of 4 columns.
-          int32x4_t pixel_channels[4];
-          pixel_channels[0] = vld1q_s32(in_i + (j + 0)*in_col_stride + c);
-          pixel_channels[1] = vld1q_s32(in_i + (j + 1)*in_col_stride + c);
-          pixel_channels[2] = vld1q_s32(in_i + (j + 2)*in_col_stride + c);
-          pixel_channels[3] = vld1q_s32(in_i + (j + 3)*in_col_stride + c);
-
-          const auto zip1 = vzipq_s32(pixel_channels[0], pixel_channels[2]);
-          const auto zip2 = vzipq_s32(pixel_channels[1], pixel_channels[3]);
-          const auto out_0 = vzipq_s32(zip1.val[0], zip2.val[0]);
-          const auto out_1 = vzipq_s32(zip1.val[1], zip2.val[1]);
-
-          vst1q_s32(out_i + j + (c + 0)*out_channel_stride, out_0.val[0]);
-          vst1q_s32(out_i + j + (c + 1)*out_channel_stride, out_0.val[1]);
-          vst1q_s32(out_i + j + (c + 2)*out_channel_stride, out_1.val[0]);
-          vst1q_s32(out_i + j + (c + 3)*out_channel_stride, out_1.val[1]);
-        }
-        for (; c_remaining; c++, c_remaining--)
-        {
-          for (int _j = 0; _j < 4; _j++)
-          {
-            const T* const in_j = in_i + (j + _j)*in_col_stride;
-            T* const out_j = out_i + (j + _j);
-
-            const T* const in_channel = in_j + c;
-            T* const out_channel = out_j + c*out_channel_stride;
-            *(out_channel) = *(in_channel);
-          }
-        }
-      }
-      for (; j_remaining >= 2; j += 2, j_remaining -=2)
-      {
-        int c = 0, c_remaining = n_channels;
-        for (; c_remaining >= 2; c += 2, c_remaining -= 2)
-        {
-          // Read 2 columns worth of 2 channels then zip to produce 2 channels
-          // worth of 2 columns.
-          int32x2_t pixel_channels[2];
-          pixel_channels[0] = vld1_s32(in_i + (j + 0)*in_col_stride + c);
-          pixel_channels[1] = vld1_s32(in_i + (j + 1)*in_col_stride + c);
-
-          const auto output = vzip_s32(pixel_channels[0], pixel_channels[1]);
-
-          vst1_s32(out_i + j + (c + 0)*out_channel_stride, output.val[0]);
-          vst1_s32(out_i + j + (c + 1)*out_channel_stride, output.val[1]);
-        }
-        for (; c_remaining; c++, c_remaining--)
-        {
-          for (int _j = 0; _j < 2; _j++)
-          {
-            const T* const in_j = in_i + (j + _j)*in_col_stride;
-            T* const out_j = out_i + (j + _j);
-
-            const T* const in_channel = in_j + c;
-            T* const out_channel = out_j + c*out_channel_stride;
-            *(out_channel) = *(in_channel);
-          }
-        }
-      }
-#endif  // __arm_any__
-      for (; j_remaining; j++, j_remaining--)
-      {
-        const T* const in_j = in_i + j*in_col_stride;
-        T* const out_j = out_i + j;
-
-        // For every channel
-        for (int c = 0; c < n_channels; c++)
-        {
-          const T* const in_channel = in_j + c;
-          T* const out_channel = out_j + c*out_channel_stride;
-          *(out_channel) = *(in_channel);
-        }
-      }
-    }
-  }
-}
-
-template <>
-inline void nhwc_to_nchw(
-  const uint32_t* const in,  // Input data in NHWC form
-  uint32_t* const out,       // Output data in NCHW form
-  const int n_batches,
-  const int n_rows,
-  const int n_cols,
-  const int n_channels,
-  int in_batch_stride,
-  int in_row_stride,
-  int in_col_stride,
-  int out_batch_stride,
-  int out_channel_stride,
-  int out_row_stride
-)
-{
-  // Redirect to generic 32-bit implementation
-  nhwc_to_nchw(
-    reinterpret_cast<const int32_t*>(in),
-    reinterpret_cast<int32_t*>(out),
-    n_batches, n_rows, n_cols, n_channels,
-    in_batch_stride, in_row_stride, in_col_stride,
-    out_batch_stride, out_channel_stride, out_row_stride
-  );
-}
-
-template <>
-inline void nhwc_to_nchw(
-  const float* const in,  // Input data in NHWC form
-  float* const out,       // Output data in NCHW form
-  const int n_batches,
-  const int n_rows,
-  const int n_cols,
-  const int n_channels,
-  int in_batch_stride,
-  int in_row_stride,
-  int in_col_stride,
-  int out_batch_stride,
-  int out_channel_stride,
-  int out_row_stride
-)
-{
-  // Redirect to generic 32-bit implementation
-  nhwc_to_nchw(
-    reinterpret_cast<const int32_t*>(in),
-    reinterpret_cast<int32_t*>(out),
-    n_batches, n_rows, n_cols, n_channels,
-    in_batch_stride, in_row_stride, in_col_stride,
-    out_batch_stride, out_channel_stride, out_row_stride
-  );
-}
-
-/*****************************************************************************/
-/* Generic implementation : NHWC -> NCHW
- */
-template <typename T>
-inline void nhwc_to_nchw(
-  const T* const in,  // Input data in NHWC form
-  T* const out,       // Output data in NCHW form
-  const int n_batches,
-  const int n_rows,
-  const int n_cols,
-  const int n_channels,
-  int in_batch_stride,
-  int in_row_stride,
-  int in_col_stride,
-  int out_batch_stride,
-  int out_channel_stride,
-  int out_row_stride
-)
-{
-  // Fill in stride values
-  in_col_stride = (in_col_stride) ? in_col_stride : n_channels;
-  in_row_stride = (in_row_stride) ? in_row_stride : n_cols * in_col_stride;
-  in_batch_stride = (in_batch_stride) ? in_batch_stride
-                                      : n_rows * in_row_stride;
-
-  out_row_stride = (out_row_stride) ? out_row_stride : n_cols;
-  out_channel_stride = (out_channel_stride) ? out_channel_stride
-                                            : n_rows * out_row_stride;
-  out_batch_stride = (out_batch_stride) ? out_batch_stride
-                                        : n_channels * out_channel_stride;
-
-  // Perform the re-ordering
-  // For every batch
-  for (int n = 0; n < n_batches; n++)
-  {
-    const T* const in_batch = in + n*in_batch_stride;
-    T* const out_batch = out + n*out_batch_stride;
-
-    // For every row
-    for (int i = 0; i < n_rows; i++)
-    {
-      const T* const in_i = in_batch + i*in_row_stride;
-      T* const out_i = out_batch + i*out_row_stride;
-
-      // For every column
-      for (int j = 0; j < n_cols; j++)
-      {
-        const T* const in_j = in_i + j*in_col_stride;
-        T* const out_j = out_i + j;
-
-        // For every channel
-        for (int c = 0; c < n_channels; c++)
-        {
-          const T* const in_channel = in_j + c;
-          T* const out_channel = out_j + c*out_channel_stride;
-          *(out_channel) = *(in_channel);
-        }
-      }
-    }
-  }
-}
-
-/*****************************************************************************/
-/* Generic weight re-order implementation.
- */
-template <typename T>
-inline void ofm_ifm_h_w_to_h_w_ifm_ofm(
-  const T* const in,  // Input in [Output x Input x Height x Width] form
-  T* const out,       // Output in [Height x Width x Input x Output] form
-  const int n_output_feature_maps,
-  const int n_input_feature_maps,
-  const int n_rows,
-  const int n_cols,
-  int in_output_feature_map_stride,
-  int in_input_feature_map_stride,
-  int in_row_stride,
-  int out_row_stride,
-  int out_col_stride,
-  int out_input_feature_map_stride
-)
-{
-  // Fill in stride values
-  in_row_stride = (in_row_stride)
-    ? in_row_stride
-    : n_cols;
-  in_input_feature_map_stride = (in_input_feature_map_stride)
-    ? in_input_feature_map_stride
-    : n_rows * in_row_stride;
-  in_output_feature_map_stride = (in_output_feature_map_stride)
-    ? in_output_feature_map_stride
-    : n_input_feature_maps * in_input_feature_map_stride;
-
-  out_input_feature_map_stride = (out_input_feature_map_stride)
-    ? out_input_feature_map_stride
-    : n_output_feature_maps;
-  out_col_stride = (out_col_stride)
-    ? out_col_stride
-    : n_input_feature_maps * out_input_feature_map_stride;
-  out_row_stride = (out_row_stride)
-    ? out_row_stride
-    : n_cols * out_col_stride;
-
-  // Perform the re-ordering
-  for (int i = 0; i < n_rows; i++)
-  {
-    const T* const in_row = in + i * in_row_stride;
-    T* out_row = out + i * out_row_stride;
-
-    for (int j = 0; j < n_cols; j++)
-    {
-      const T* const in_col = in_row + j;
-      T* const out_col = out_row + j * out_col_stride;
-
-      for (int ifm = 0; ifm < n_input_feature_maps; ifm++)
-      {
-        const T* const in_ifm = in_col + ifm * in_input_feature_map_stride;
-        T* const out_ifm = out_col + ifm * out_input_feature_map_stride;
-
-        for (int ofm = 0; ofm < n_output_feature_maps; ofm++)
-        {
-          const T* const in_ofm = in_ifm + ofm * in_output_feature_map_stride;
-          T* const out_ofm = out_ifm + ofm;
-          *(out_ofm) = *(in_ofm);
-        }
-      }
-    }
-  }
-}
-
-/*****************************************************************************/
-/* Generic weight re-order implementation.
- */
-template <typename T>
-inline void h_w_ifm_ofm_to_ofm_ifm_h_w(
-  const T* const in,  // Input in [Height x Width x Input x Output] form
-  T* const out,       // Output in [Output x Input x Height x Width] form
-  const int n_rows,
-  const int n_cols,
-  const int n_input_feature_maps,
-  const int n_output_feature_maps,
-  int in_row_stride,
-  int in_col_stride,
-  int in_input_feature_map_stride,
-  int out_output_feature_map_stride,
-  int out_input_feature_map_stride,
-  int out_row_stride
-)
-{
-  // Fill in the stride values
-  in_input_feature_map_stride = (in_input_feature_map_stride)
-    ? in_input_feature_map_stride
-    : n_output_feature_maps;
-  in_col_stride = (in_col_stride)
-    ? in_col_stride
-    : n_input_feature_maps * in_input_feature_map_stride;
-  in_row_stride = (in_row_stride)
-    ? in_row_stride
-    : n_cols * in_col_stride;
-
-  out_row_stride = (out_row_stride)
-    ? out_row_stride
-    : n_cols;
-  out_input_feature_map_stride = (out_input_feature_map_stride)
-    ? out_input_feature_map_stride
-    : n_rows * out_row_stride;
-  out_output_feature_map_stride = (out_output_feature_map_stride)
-    ? out_output_feature_map_stride
-    : n_input_feature_maps * out_input_feature_map_stride;
-
-  // Perform the re-ordering
-  for (int i = 0; i < n_rows; i++)
-  {
-    const T* const in_row = in + i * in_row_stride;
-    T* const out_row = out + i * out_row_stride;
-
-    for (int j = 0; j < n_cols; j++)
-    {
-      const T* const in_col = in_row + j * in_col_stride;
-      T* const out_col = out_row + j;
-
-      for (int ifm = 0; ifm < n_input_feature_maps; ifm++)
-      {
-        const T* const in_ifm = in_col + ifm * in_input_feature_map_stride;
-        T* const out_ifm = out_col + ifm * out_input_feature_map_stride;
-
-        for (int ofm = 0; ofm < n_output_feature_maps; ofm++)
-        {
-          const T* const in_ofm = in_ifm + ofm;
-          T* const out_ofm = out_ifm + ofm * out_output_feature_map_stride;
-          *(out_ofm) = *(in_ofm);
-        }
-      }
-    }
-  }
-}
-
-}  // namespace reorder
diff --git a/arm_compute/core/NEON/kernels/convolution/common/tensor.hpp b/arm_compute/core/NEON/kernels/convolution/common/tensor.hpp
deleted file mode 100644
index ad0a677a8f..0000000000
--- a/arm_compute/core/NEON/kernels/convolution/common/tensor.hpp
+++ /dev/null
@@ -1,178 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#pragma once
-#include <cstdlib>
-#include <random>
-
-#include "alloc.hpp"
-
-enum TensorOrder
-{
-  NHWC,  ///< [Batch x Height x Width x Channels]
-  NCHW,  ///< [Batch x Channels x Height x Width]
-};
-
-struct Tensor4DShape
-{
-  int n_batches, n_rows, n_cols, n_channels;
-  TensorOrder ordering;
-
-  // Create a new tensor with the default (NHWC) ordering
-  inline Tensor4DShape(
-    const int n_batches,
-    const int n_rows,
-    const int n_cols,
-    const int n_channels,
-    const TensorOrder ordering=NHWC
-  ) : n_batches(n_batches),
-      n_rows(n_rows),
-      n_cols(n_cols),
-      n_channels(n_channels),
-      ordering(ordering)
-  {
-  }
-
-  inline int index(const int n, const int i, const int j, const int c) const
-  {
-    if (this->ordering == NHWC)
-    {
-      return ((n*this->n_rows + i)*this->n_cols + j)*this->n_channels + c;
-    }
-    else  // NCHW
-    {
-      return ((n*this->n_channels + c)*this->n_rows + i)*this->n_cols + j;
-    }
-  }
-
-  inline int size() const
-  {
-    return n_batches * n_rows * n_cols * n_channels;
-  }
-
-  inline bool TestEq(const Tensor4DShape& other) const
-  {
-    return (n_batches == other.n_batches &&
-            n_rows == other.n_rows &&
-            n_cols == other.n_cols &&
-            n_channels == other.n_channels);
-  }
-};
-
-
-enum WeightOrder
-{
-  HWIO,  ///< [Height x Width x Input channels x Output channels]
-  OIHW,  ///< [Output channels x Input channels x Height x Width]
-};
-
-struct KernelShape
-{
-  int n_output_channels, n_rows, n_cols, n_input_channels;
-  WeightOrder ordering;
-
-  inline KernelShape(
-    const int n_output_channels,
-    const int n_rows,
-    const int n_cols,
-    const int n_input_channels,
-    const WeightOrder ordering=HWIO
-  ) : n_output_channels(n_output_channels),
-      n_rows(n_rows),
-      n_cols(n_cols),
-      n_input_channels(n_input_channels),
-      ordering(ordering)
-  {
-  }
-
-  inline int index(int oc, int i, int j, int ic) const
-  {
-    if (this->ordering == HWIO)
-    {
-      return ((i*this->n_cols + j)*this->n_input_channels + ic)*this->n_output_channels + oc;
-    }
-    else  // OIHW
-    {
-      return ((oc*this->n_input_channels + ic)*this->n_rows + i)*this->n_cols + j;
-    }
-  }
-
-  inline int size(void) const
-  {
-    return n_output_channels * n_rows * n_cols * n_input_channels;
-  }
-};
-
-
-template <typename ShapeT, typename T>
-class Tensor4D final
-{
-  public:
-    Tensor4D(ShapeT shape) :
-      shape(shape),
-      _data(reinterpret_cast<T*>(ALLOCATE(size_bytes())))
-    {
-        Clear();
-    }
-
-    Tensor4D(const Tensor4D<ShapeT, T>&) = delete;
-    Tensor4D operator=(const Tensor4D<ShapeT, T>&) = delete;
-
-    ~Tensor4D() {
-      free(_data);
-    }
-
-    inline T* ptr() const {
-      return _data;
-    }
-
-    inline size_t size_bytes() const {
-      return shape.size() * sizeof(T);
-    }
-
-    /* Extract an element of the tensor.
-     *
-     * If the shape is a Tensor4DShape then the index is given as batch, row,
-     * column and channel.  If the shape is a KernelShape then the index is
-     * given as output channel, row, column and input channel.
-     */
-    inline T& element(const int a, const int b, const int c, const int d) const
-    {
-      return _data[shape.index(a, b, c, d)];
-    }
-
-    inline void Clear() {
-      Fill(static_cast<T>(0));
-    }
-
-    inline void Fill(T val) {
-      for (int i = 0; i < shape.size(); i++)
-        _data[i] = val;
-    }
-
-    const ShapeT shape;
-
-  private:
-    T* const _data;
-};
diff --git a/arm_compute/core/NEON/kernels/convolution/common/tensor_utils.hpp b/arm_compute/core/NEON/kernels/convolution/common/tensor_utils.hpp
deleted file mode 100644
index 0c234431b1..0000000000
--- a/arm_compute/core/NEON/kernels/convolution/common/tensor_utils.hpp
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#pragma once
-#include "tensor.hpp"
-
-// Methods to print tensors and weights
-void PrintTensor(const Tensor4D<Tensor4DShape, float>& tensor);
-void PrintWeights(const Tensor4D<KernelShape, float>& weights);
-
-// Test the equivalence of two tensors
-// Counts the instances that |a - b|/|a| > max_err
-bool CmpTensors(
-  const Tensor4D<Tensor4DShape, float>& a,
-  const Tensor4D<Tensor4DShape, float>& b,
-  const float max_err=0.0f
-);
-
-// Fill the tensor with a test pattern
-void TestPattern(Tensor4D<Tensor4DShape, float>& tensor);
-void TestPattern(Tensor4D<KernelShape, float>& weights);
-
-// Fill the tensor with random values
-void Randomise(Tensor4D<Tensor4DShape, float>& tensor, const int seed=0);
-void Randomise(Tensor4D<KernelShape, float>& weights, const int seed=0);
diff --git a/arm_compute/core/NEON/kernels/convolution/common/utils.hpp b/arm_compute/core/NEON/kernels/convolution/common/utils.hpp
deleted file mode 100644
index 99b2282f7e..0000000000
--- a/arm_compute/core/NEON/kernels/convolution/common/utils.hpp
+++ /dev/null
@@ -1,60 +0,0 @@
-/*
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#pragma once
-
-#include <limits>
-
-void PrintMatrix(const float *const m, const int M, const int N, const int row_stride);
-
-constexpr inline int iceildiv(const int a, const int b)
-{
-    return (a + b - 1) / b;
-}
-
-template <typename T>
-inline T roundup(const T a, const T b)
-{
-    return b * iceildiv(a, b);
-}
-
-template<typename T>
-struct TypeBounds
-{
-    static constexpr T lower() noexcept { return std::numeric_limits<T>::has_infinity
-                                                 ? -std::numeric_limits<T>::infinity()
-                                                 : std::numeric_limits<T>::lowest(); };
-    static constexpr T upper() noexcept { return std::numeric_limits<T>::has_infinity
-                                                 ? std::numeric_limits<T>::infinity()
-                                                 : std::numeric_limits<T>::max(); };
-};
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-template<>
-struct TypeBounds<__fp16>
-{
-    static constexpr __fp16 lower() noexcept { return -std::numeric_limits<float>::infinity(); };
-    static constexpr __fp16 upper() noexcept { return std::numeric_limits<float>::infinity(); }
-};
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
diff --git a/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise.hpp b/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise.hpp
deleted file mode 100644
index a4a833d90a..0000000000
--- a/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise.hpp
+++ /dev/null
@@ -1,551 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#pragma once
-
-#include <arm_neon.h>
-#include "activation.hpp"
-#include "padding.hpp"
-
-namespace depthwise
-{
-
-namespace nck = neon_convolution_kernels;
-
-class IDepthwiseConvolution
-{
-  public:
-    virtual ~IDepthwiseConvolution() = default;
-
-    virtual int output_size(
-      int dim_size,
-      unsigned int padding_before,
-      unsigned int padding_after
-    ) const = 0;
-
-    /* Set input tensor and stride. */
-    virtual void set_input(const void *inptr) = 0;
-    virtual void set_input(const void *inptr, int column_stride) = 0;
-    virtual void set_input(const void *inptr, int row_stride, int column_stride) = 0;
-    virtual void set_input(const void *inptr, int batch_stride, int row_stride, int column_stride) = 0;
-
-    /* Set output tensor and stride. */
-    virtual void set_output(void *outptr) = 0;
-    virtual void set_output(void *outptr, int column_stride) = 0;
-    virtual void set_output(void *outptr, int row_stride, int column_stride) = 0;
-    virtual void set_output(void *outptr, int batch_stride, int row_stride, int column_stride) = 0;
-
-    /* Weights and biases are re-ordered to improve memory access patterns. Use
-     * these methods to determine the size of the re-pack buffer and to set the
-     * address (and implicitly reorder the weights and biases into) the buffer.
-     */
-    virtual size_t get_packed_params_size(void) const = 0;
-    virtual void set_packed_params_buffer(void *) = 0;
-
-    virtual void pack_params(const void *weights, const void *biases=nullptr) const = 0;
-    virtual void pack_params(void *buffer, const void *weights, const void *biases=nullptr) const = 0;
-    virtual void pack_params(
-      void *buffer,
-      const void* weights,
-      unsigned int weight_row_stride,
-      unsigned int weight_col_stride,
-      const void *biases=nullptr
-    ) const = 0;
-
-    /* Working space is used to pad tensors on the fly. Before running any
-     * inference check the amount of space required, allocate and provide a
-     * pointer to the convolution engine.
-     */
-    virtual size_t get_working_space_size(unsigned int nthreads=1) const = 0;
-    virtual void set_working_space(void *) = 0;
-
-    virtual unsigned int get_window(void) const = 0;
-    virtual void run(
-      unsigned int start,
-      unsigned int stop,
-      unsigned int threadid=0
-    ) = 0;
-};
-
-template <
-  unsigned int OutputTileRows, unsigned int OutputTileCols,
-  unsigned int KernelRows, unsigned int KernelCols,
-  unsigned int StrideRows, unsigned int StrideCols,
-  typename TIn, typename TBias, typename TOut,
-  typename Derived
->
-class DepthwiseConvolutionBase : public IDepthwiseConvolution
-{
-  public:
-    // Information about the specific convolution instance
-    using InputType = TIn;
-    using BiasType = TBias;
-    using OutputType = TOut;
-    static constexpr int output_tile_rows = OutputTileRows;
-    static constexpr int output_tile_cols = OutputTileCols;
-    static constexpr int kernel_rows = KernelRows;
-    static constexpr int kernel_cols = KernelCols;
-    static constexpr int stride_rows = StrideRows;
-    static constexpr int stride_cols = StrideCols;
-    static constexpr int inner_tile_rows = stride_rows * (output_tile_rows - 1) + kernel_rows;
-    static constexpr int inner_tile_cols = stride_cols * (output_tile_cols - 1) + kernel_cols;
-
-    /** Create a new depthwise convolution engine.
-     *
-     * @param[in] n_batches Number of batches tensors.
-     * @param[in] n_input_rows Number of rows in input tensor.
-     * @param[in] n_input_cols Number of columns in input tensor.
-     * @param[in] n_channels Number of channels in input and output tensors.
-     */
-    DepthwiseConvolutionBase(
-      int n_batches, int n_input_rows, int n_input_cols, int n_channels,
-      nck::ActivationFunction activation,
-      unsigned int padding_top,
-      unsigned int padding_left,
-      unsigned int padding_bottom,
-      unsigned int padding_right
-    );
-
-    /** Create a new depthwise convolution engine.
-     *
-     * @param[in] n_batches Number of batches tensors.
-     * @param[in] n_input_rows Number of rows in input tensor.
-     * @param[in] n_input_cols Number of columns in input tensor.
-     * @param[in] n_channels Number of channels in input and output tensors.
-     */
-    DepthwiseConvolutionBase(
-      int n_batches, int n_input_rows, int n_input_cols, int n_channels,
-      int n_output_rows, int n_output_cols,
-      nck::ActivationFunction activation,
-      unsigned int padding_top,
-      unsigned int padding_left,
-      unsigned int padding_bottom,
-      unsigned int padding_right
-    );
-
-    // Cannot copy or move a DepthwiseConvolution.
-    DepthwiseConvolutionBase(DepthwiseConvolutionBase&) = delete;
-    DepthwiseConvolutionBase operator=(DepthwiseConvolutionBase&) = delete;
-
-    /* Set input tensor and stride. */
-    void set_input(const void *inptr) override;
-    void set_input(const void *inptr, int column_stride) override;
-    void set_input(const void *inptr, int row_stride, int column_stride) override;
-    void set_input(const void *inptr, int batch_stride, int row_stride, int column_stride) override;
-
-    /* Set output tensor and stride. */
-    void set_output(void *outptr) override;
-    void set_output(void *outptr, int column_stride) override;
-    void set_output(void *outptr, int row_stride, int column_stride) override;
-    void set_output(void *outptr, int batch_stride, int row_stride, int column_stride) override;
-
-    /** Get the number of output rows/columns.
-     *
-     * @param[in] dim_size Number of elements in the dimension (rows/columns)
-     * @param[in] same_padding True if the padding is SAME, otherwise false.
-     */
-    static int get_output_size(
-      int dim_size, unsigned int padding_before, unsigned int padding_after
-    );
-
-    int output_size(
-      int dim_size, unsigned int padding_before, unsigned int padding_after
-    ) const override;
-
-    /* Determine how much memory is required to store the packed weights and
-     * biases.
-     */
-    size_t get_packed_params_size(void) const override;
-
-    /* Set the buffer for the packed weights and biases, and perform the
-     * packing.
-     */
-    void set_packed_params_buffer(void *buffer) override;
-
-    void pack_params(const void *weights, const void *biases=nullptr) const override;
-
-    void pack_params(
-      void *buffer,
-      const void *weights,
-      const void *biases=nullptr
-    ) const override;
-
-    void pack_params(
-      void *buffer,
-      const void *weights,
-      unsigned int weight_row_stride,
-      unsigned int weight_col_stride,
-      const void *biases=nullptr
-    ) const override;
-
-    /** Query the amount of working space required.
-     * @param[in] The largest number of threads which will be used to execute
-     *            the kernel.
-     */
-    size_t get_working_space_size(unsigned int n_threads=1) const override;
-
-    /** Set the working space buffer.
-     */
-    void set_working_space(void *buffer) override;
-
-    /** Get the window of work to be performed by an instance of the operator.
-     */
-    unsigned int get_window(void) const override;
-
-    /** Perform a portion of the work associated with the operator.
-     *
-     * Will perform the window of work described by $[start, stop)$.
-     *
-     * @param[in] start Start of the window of work to perform.
-     * @param[in] stop End of the work to perform.
-     * @param[in] ID of the thread performing the work.
-     */
-    void run(
-      unsigned int start,
-      unsigned int stop,
-      unsigned int threadid=0
-    ) override;
-
-  protected:
-    /** Get the value to use to pad the tensor.
-     */
-    TIn _input_padding_value(void) const;
-
-    /** Implementation of the parameter packing.
-     */
-    void _pack_params(
-      void *buffer,
-      const void *weights,
-      unsigned int weight_row_stride,
-      unsigned int weight_col_stride,
-      const void *biases=nullptr
-    ) const;
-
-    /** Process a tile-row of the tensors.
-     */
-    void process_tile_row(
-      unsigned int threadid,
-      int n_channels,
-      const void* packed_params,
-      const InputType* inptr,
-      OutputType* outptr,
-      int row_pad_in_top,
-      int row_pad_in_left,
-      int row_pad_in_bottom,
-      int row_pad_out_bottom,
-      int n_tiles,
-      int n_input_cols,
-      int n_output_cols
-    );
-
-    /** Process a single tile of the tensor.
-     *
-     * This method will apply input/output padding (if required) and call the
-     * depthwise tile implementation.
-     */
-    void process_tile(
-      unsigned int threadid,
-      int n_channels,
-      const void* packed_params,
-      const InputType* inptr,
-      OutputType* outptr,
-      int pad_in_top,
-      int pad_in_left,
-      int pad_in_bottom,
-      int pad_in_right,
-      int pad_out_bottom,
-      int pad_out_right
-    );
-
-    /** Perform depthwise convolution on a single tile.
-     */
-    template <nck::ActivationFunction Activation>
-    void execute_tile(
-      int n_channels,
-      const void* packed_params,
-      const InputType* inptr,
-      unsigned int in_row_stride,
-      unsigned int in_col_stride,
-      OutputType* outptr,
-      unsigned int out_row_stride,
-      unsigned int out_col_stride
-    );
-
-    template <nck::ActivationFunction Activation>
-    void execute_tile(
-      int n_channels,
-      const void* packed_params,
-      const InputType* inptrs[inner_tile_rows][inner_tile_cols],
-      OutputType* outptrs[output_tile_rows][output_tile_cols]
-    );
-
-    int n_channels(void) const;
-
-  private:
-    // Member variables of instances of a convolution engine.
-    const InputType* _input;
-    OutputType* _output;
-    void* _packed_parameters;
-    void* _working_space;  // Per-thread working space
-    const int _n_batches, _n_input_rows, _n_input_cols, _n_channels,
-              _n_output_rows, _n_output_cols, _n_tile_rows, _n_tile_cols;
-    const unsigned int _padding_top, _padding_left, _padding_bottom, _padding_right;
-    const nck::ActivationFunction _activation;
-
-    // Stride information for a convolution instance
-    int _input_col_stride, _input_row_stride, _input_batch_stride;
-    int _output_col_stride, _output_row_stride, _output_batch_stride;
-
-    // Methods for getting access to working space
-    size_t _get_input_working_space_size(void) const;
-    size_t _get_output_working_space_size(void) const;
-
-    void *_get_input_working_space(unsigned int threadid) const;
-    void *_get_output_working_space(unsigned int threadid) const;
-};
-
-
-template <
-  unsigned int OutputTileRows, unsigned int OutputTileCols,
-  unsigned int KernelRows, unsigned int KernelCols,
-  unsigned int StrideRows, unsigned int StrideCols,
-  typename TIn, typename TBias, typename TOut
->
-class DepthwiseConvolution : public DepthwiseConvolutionBase<
-  OutputTileRows, OutputTileCols,
-  KernelRows, KernelCols,
-  StrideRows, StrideCols,
-  TIn, TBias, TOut,
-  DepthwiseConvolution<
-    OutputTileRows, OutputTileCols,
-    KernelRows, KernelCols,
-    StrideRows, StrideCols,
-    TIn, TBias, TOut
-  >
->
-{
-  using Base = DepthwiseConvolutionBase<
-    OutputTileRows, OutputTileCols,
-    KernelRows, KernelCols,
-    StrideRows, StrideCols,
-    TIn, TBias, TOut,
-    DepthwiseConvolution<
-      OutputTileRows, OutputTileCols,
-      KernelRows, KernelCols,
-      StrideRows, StrideCols,
-      TIn, TBias, TOut
-  > >;
-  friend Base;
-  using InputType = typename Base::InputType;
-  using OutputType = typename Base::OutputType;
-
-  public:
-    using Base::DepthwiseConvolutionBase;
-
-  protected:
-    template <nck::ActivationFunction Activation>
-    void execute_tile(
-      int n_channels,
-      const void* packed_params,
-      const TIn* inptr,
-      unsigned int in_row_stride,
-      unsigned int in_col_stride,
-      TOut* outptr,
-      unsigned int out_row_stride,
-      unsigned int out_col_stride
-    );
-
-    template <nck::ActivationFunction Activation>
-    void execute_tile(
-      int n_channels,
-      const void* packed_params,
-      const InputType* inptrs[Base::inner_tile_rows][Base::inner_tile_cols],
-      OutputType* outptrs[Base::output_tile_rows][Base::output_tile_cols]
-    );
-};
-
-
-template <
-  unsigned int OutputTileRows, unsigned int OutputTileCols,
-  unsigned int KernelRows, unsigned int KernelCols,
-  unsigned int StrideRows, unsigned int StrideCols
->
-class DepthwiseConvolution<
-  OutputTileRows, OutputTileCols,
-  KernelRows, KernelCols,
-  StrideRows, StrideCols,
-  float, float, float
-> : public DepthwiseConvolutionBase<
-  OutputTileRows, OutputTileCols,
-  KernelRows, KernelCols,
-  StrideRows, StrideCols,
-  float, float, float,
-  DepthwiseConvolution<
-    OutputTileRows, OutputTileCols,
-    KernelRows, KernelCols,
-    StrideRows, StrideCols,
-    float, float, float
-  >
->
-{
-  using Base = DepthwiseConvolutionBase<
-    OutputTileRows, OutputTileCols,
-    KernelRows, KernelCols,
-    StrideRows, StrideCols,
-    float, float, float,
-    DepthwiseConvolution<
-      OutputTileRows, OutputTileCols,
-      KernelRows, KernelCols,
-      StrideRows, StrideCols,
-      float, float, float
-  > >;
-  friend Base;
-  using InputType = typename Base::InputType;
-  using OutputType = typename Base::OutputType;
-
-  public:
-    DepthwiseConvolution(
-      int n_batches, int n_input_rows, int n_input_cols, int n_channels,
-      nck::ActivationFunction activation,
-      unsigned int padding_top,
-      unsigned int padding_left,
-      unsigned int padding_bottom,
-      unsigned int padding_right
-    );
-
-    DepthwiseConvolution(
-      int n_batches, int n_input_rows, int n_input_cols, int n_channels,
-      int n_output_rows, int n_output_cols,
-      nck::ActivationFunction activation,
-      unsigned int padding_top,
-      unsigned int padding_left,
-      unsigned int padding_bottom,
-      unsigned int padding_right
-    );
-
-  protected:
-    template <nck::ActivationFunction Activation>
-    void execute_tile(
-      int n_channels,
-      const void* packed_params,
-      const float* inptr,
-      unsigned int in_row_stride,
-      unsigned int in_col_stride,
-      float* outptr,
-      unsigned int out_row_stride,
-      unsigned int out_col_stride
-    );
-
-    template <nck::ActivationFunction Activation>
-    void execute_tile(
-      int n_channels,
-      const void* packed_params,
-      const float* inptrs[Base::inner_tile_rows][Base::inner_tile_cols],
-      float* outptrs[Base::output_tile_rows][Base::output_tile_cols]
-    );
-};
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-template <
-  unsigned int OutputTileRows, unsigned int OutputTileCols,
-  unsigned int KernelRows, unsigned int KernelCols,
-  unsigned int StrideRows, unsigned int StrideCols
->
-class DepthwiseConvolution<
-  OutputTileRows, OutputTileCols,
-  KernelRows, KernelCols,
-  StrideRows, StrideCols,
-  float16_t, float16_t, float16_t
-> : public DepthwiseConvolutionBase<
-  OutputTileRows, OutputTileCols,
-  KernelRows, KernelCols,
-  StrideRows, StrideCols,
-  float16_t, float16_t, float16_t,
-  DepthwiseConvolution<
-    OutputTileRows, OutputTileCols,
-    KernelRows, KernelCols,
-    StrideRows, StrideCols,
-    float16_t, float16_t, float16_t
-  >
->
-{
-  using Base = DepthwiseConvolutionBase<
-    OutputTileRows, OutputTileCols,
-    KernelRows, KernelCols,
-    StrideRows, StrideCols,
-    float16_t, float16_t, float16_t,
-    DepthwiseConvolution<
-      OutputTileRows, OutputTileCols,
-      KernelRows, KernelCols,
-      StrideRows, StrideCols,
-      float16_t, float16_t, float16_t
-  > >;
-  friend Base;
-  using InputType = typename Base::InputType;
-  using OutputType = typename Base::OutputType;
-
-  public:
-    DepthwiseConvolution(
-      int n_batches, int n_input_rows, int n_input_cols, int n_channels,
-      nck::ActivationFunction activation,
-      unsigned int padding_top,
-      unsigned int padding_left,
-      unsigned int padding_bottom,
-      unsigned int padding_right
-    );
-
-    DepthwiseConvolution(
-      int n_batches, int n_input_rows, int n_input_cols, int n_channels,
-      int n_output_rows, int n_output_cols,
-      nck::ActivationFunction activation,
-      unsigned int padding_top,
-      unsigned int padding_left,
-      unsigned int padding_bottom,
-      unsigned int padding_right
-    );
-
-  protected:
-    template <nck::ActivationFunction Activation>
-    void execute_tile(
-      int n_channels,
-      const void* packed_params,
-      const float16_t* inptr,
-      unsigned int in_row_stride,
-      unsigned int in_col_stride,
-      float16_t* outptr,
-      unsigned int out_row_stride,
-      unsigned int out_col_stride
-    );
-
-    template <nck::ActivationFunction Activation>
-    void execute_tile(
-      int n_channels,
-      const void* packed_params,
-      const float16_t* inptrs[Base::inner_tile_rows][Base::inner_tile_cols],
-      float16_t* outptrs[Base::output_tile_rows][Base::output_tile_cols]
-    );
-};
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-}  // namespace depthwise
diff --git a/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_dilated.hpp b/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_dilated.hpp
deleted file mode 100644
index e0d7f0c7f1..0000000000
--- a/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_dilated.hpp
+++ /dev/null
@@ -1,156 +0,0 @@
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#pragma once
-
-#include <deque>
-#include <functional>
-#include <memory>
-
-#include "depthwise.hpp"
-
-namespace depthwise
-{
-
-template <
-  unsigned int OutputTileRows, unsigned int OutputTileCols,
-  unsigned int KernelRows, unsigned int KernelCols,
-  unsigned int StrideRows, unsigned int StrideCols,
-  typename TIn, typename TBias, typename TOut
->
-class DilatedDepthwiseConvolution : public IDepthwiseConvolution
-{
-  public:
-    /** Create a new dilated depthwise convolution engine.
-     */
-    DilatedDepthwiseConvolution(
-      int n_batches, int n_input_rows, int n_input_cols, int n_channels,
-      int dilation_factor,
-      nck::ActivationFunction activation,
-      unsigned int padding_top,
-      unsigned int padding_left,
-      unsigned int padding_bottom,
-      unsigned int padding_right
-    );
-
-    /** Create a new dilated depthwise convolution engine.
-     */
-    DilatedDepthwiseConvolution(
-      int n_batches, int n_input_rows, int n_input_cols, int n_channels,
-      int dilation_factor, int n_output_rows, int n_output_cols,
-      nck::ActivationFunction activation,
-      unsigned int padding_top,
-      unsigned int padding_left,
-      unsigned int padding_bottom,
-      unsigned int padding_right
-    );
-
-    // Cannot copy or move a DilatedDepthwiseConvolution.
-    DilatedDepthwiseConvolution(DilatedDepthwiseConvolution&) = delete;
-    DilatedDepthwiseConvolution operator=(DilatedDepthwiseConvolution&) = delete;
-
-    /* Set input tensor and stride. */
-    void set_input(const void *inptr) override;
-    void set_input(const void *inptr, int column_stride) override;
-    void set_input(const void *inptr, int row_stride, int column_stride) override;
-    void set_input(const void *inptr, int batch_stride, int row_stride, int column_stride) override;
-
-    /* Set output tensor and stride. */
-    void set_output(void *outptr) override;
-    void set_output(void *outptr, int column_stride) override;
-    void set_output(void *outptr, int row_stride, int column_stride) override;
-    void set_output(void *outptr, int batch_stride, int row_stride, int column_stride) override;
-
-    static int get_output_size(
-      int dim_size,
-      unsigned int padding_before,
-      unsigned int padding_after,
-      int dilation_factor
-    );
-
-    int output_size(
-      int dim_size, unsigned int padding_before, unsigned int padding_after
-    ) const override;
-
-    /* Weights and biases are re-ordered to improve memory access patterns. Use
-     * these methods to determine the size of the re-pack buffer and to set the
-     * address (and implicitly reorder the weights and biases into) the buffer.
-     */
-    size_t get_packed_params_size(void) const override;
-    void set_packed_params_buffer(void *) override;
-
-    void pack_params(const void *weights, const void *biases=nullptr) const override;
-    void pack_params(void *buffer, const void *weights, const void *biases=nullptr) const override;
-    void pack_params(
-      void *buffer,
-      const void* weights,
-      unsigned int weight_row_stride,
-      unsigned int weight_col_stride,
-      const void *biases=nullptr
-    ) const override;
-
-    /* Working space is used to pad tensors on the fly. Before running any
-     * inference check the amount of space required, allocate and provide a
-     * pointer to the convolution engine.
-     */
-    size_t get_working_space_size(unsigned int nthreads=1) const override;
-    void set_working_space(void *) override;
-
-    unsigned int get_window(void) const override;
-    void run(unsigned int start, unsigned int stop, unsigned int threadid=0) override;
-
-  protected:
-    /** Protected constructor which also accepts a function to construct a new
-     * subconvolution
-     */
-    DilatedDepthwiseConvolution(
-      int n_batches, int n_input_rows, int n_input_cols, int n_channels,
-      int dilation_factor, int n_output_rows, int n_output_cols,
-      nck::ActivationFunction activation,
-      unsigned int padding_top,
-      unsigned int padding_left,
-      unsigned int padding_bottom,
-      unsigned int padding_right,
-      std::function<IDepthwiseConvolution *(int, int, int, int, int, int, nck::ActivationFunction, unsigned int, unsigned int, unsigned int, unsigned int)> subconvfn
-    );
-
-    const int _dilation_factor;
-    const int _n_input_rows, _n_input_cols, _n_channels;
-    const int _padding_top, _padding_left;
-    const int _n_output_rows, _n_output_cols;
-
-    /* Dilated depthwise convolution is performed through repeated calls to
-     * non-dilated convolutions. If the dilation factor is $n$, then we perform
-     * $(n + 1)^2$ depthwise convolutions.
-     */
-    using BaseDepthwise = DepthwiseConvolution<
-      OutputTileRows, OutputTileCols,
-      KernelRows, KernelCols,
-      StrideRows, StrideCols,
-      TIn, TBias, TOut
-    >;
-    std::deque<std::deque<std::unique_ptr<IDepthwiseConvolution>>> _convs;
-};
-
-}  // namespace depthwise
diff --git a/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_quantized.hpp b/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_quantized.hpp
deleted file mode 100644
index 37c1f1bc84..0000000000
--- a/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_quantized.hpp
+++ /dev/null
@@ -1,291 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#pragma once
-#include "depthwise.hpp"
-#include "qasymm8.hpp"
-#include "qsymm8.hpp"
-#pragma once
-
-using namespace neon_convolution_kernels;
-using namespace qasymm8;
-
-inline int32x4_t saturating_doubling_high_mul(const int32x4_t& a, const int32x4_t& b)
-{
-  return vqrdmulhq_s32(a, b);
-}
-
-inline int32x4_t saturating_doubling_high_mul(const int32x4_t& a, const int32_t& b)
-{
-  return vqrdmulhq_n_s32(a, b);
-}
-
-inline int32_t saturating_doubling_high_mul(const int32_t& a, const int32_t& b)
-{
-  return vget_lane_s32(vqrdmulh_n_s32(vdup_n_s32(a), b), 0);
-}
-
-inline int32x4_t rounding_divide_by_exp2(const int32x4_t& x, const int32x4_t shift)
-{
-  const int32x4_t fixup = vshrq_n_s32(vandq_s32(x, shift), 31);
-  const int32x4_t fixed = vqaddq_s32(x, fixup);
-  return vrshlq_s32(fixed, shift);
-}
-
-inline int32x4_t rounding_divide_by_exp2(const int32x4_t& x, const int exponent)
-{
-  const int32x4_t shift = vdupq_n_s32(-exponent);
-  const int32x4_t fixup = vshrq_n_s32(vandq_s32(x, shift), 31);
-  const int32x4_t fixed = vqaddq_s32(x, fixup);
-  return vrshlq_s32(fixed, shift);
-}
-
-inline int32x2_t rounding_divide_by_exp2(const int32x2_t& x, const int exponent)
-{
-  const int32x2_t shift = vdup_n_s32(-exponent);
-  const int32x2_t fixup = vshr_n_s32(vand_s32(x, shift), 31);
-  const int32x2_t fixed = vqadd_s32(x, fixup);
-  return vrshl_s32(fixed, shift);
-}
-
-inline int32_t rounding_divide_by_exp2(const int32_t& x, const int exponent)
-{
-  const int32x2_t xs = vdup_n_s32(x);
-  return vget_lane_s32(rounding_divide_by_exp2(xs, exponent), 0);
-}
-
-namespace depthwise
-{
-
-namespace nck = neon_convolution_kernels;
-
-template <
-  unsigned int OutputTileRows, unsigned int OutputTileCols,
-  unsigned int KernelRows, unsigned int KernelCols,
-  unsigned int StrideRows, unsigned int StrideCols
->
-class QAsymm8DepthwiseConvolution : public DepthwiseConvolutionBase<
-  OutputTileRows, OutputTileCols,
-  KernelRows, KernelCols,
-  StrideRows, StrideCols,
-  uint8_t, int32_t, uint8_t,
-  QAsymm8DepthwiseConvolution<OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols>
->
-{
-  using Base = DepthwiseConvolutionBase<
-    OutputTileRows, OutputTileCols,
-    KernelRows, KernelCols,
-    StrideRows, StrideCols,
-    uint8_t, int32_t, uint8_t,
-    QAsymm8DepthwiseConvolution<OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols>
-  >;
-  friend Base;
-  using InputType = typename Base::InputType;
-  using OutputType = typename Base::OutputType;
-
-  public:
-    QAsymm8DepthwiseConvolution(
-      int n_batches, int n_input_rows, int n_input_cols, int n_channels,
-      nck::ActivationFunction activation,
-      const qasymm8::QAsymm8Params& weight_quantisation,
-      const qasymm8::QAsymm8Params& input_quantisation,
-      const qasymm8::QAsymm8Params& output_quantisation,
-      unsigned int padding_top,
-      unsigned int padding_left,
-      unsigned int padding_bottom,
-      unsigned int padding_right
-    );
-
-    QAsymm8DepthwiseConvolution(
-      int n_batches, int n_input_rows, int n_input_cols, int n_channels,
-      int n_output_rows, int n_output_cols,
-      nck::ActivationFunction activation,
-      const qasymm8::QAsymm8Params& weight_quantisation,
-      const qasymm8::QAsymm8Params& input_quantisation,
-      const qasymm8::QAsymm8Params& output_quantisation,
-      unsigned int padding_top,
-      unsigned int padding_left,
-      unsigned int padding_bottom,
-      unsigned int padding_right
-    );
-
-    QAsymm8DepthwiseConvolution(
-      int n_batches, int n_input_rows, int n_input_cols, int n_channels,
-      nck::ActivationFunction activation,
-      const qasymm8::QAsymm8Params& weight_quantisation,
-      const qasymm8::QAsymm8Params& input_quantisation,
-      const qasymm8::QAsymm8Params& output_quantisation,
-      const qasymm8::QAsymm8RescaleParams& rescale_parameters,
-      unsigned int padding_top,
-      unsigned int padding_left,
-      unsigned int padding_bottom,
-      unsigned int padding_right
-    );
-
-    QAsymm8DepthwiseConvolution(
-      int n_batches, int n_input_rows, int n_input_cols, int n_channels,
-      int n_output_rows, int n_output_cols,
-      nck::ActivationFunction activation,
-      const qasymm8::QAsymm8Params& weight_quantisation,
-      const qasymm8::QAsymm8Params& input_quantisation,
-      const qasymm8::QAsymm8Params& output_quantisation,
-      const qasymm8::QAsymm8RescaleParams& rescale_parameters,
-      unsigned int padding_top,
-      unsigned int padding_left,
-      unsigned int padding_bottom,
-      unsigned int padding_right
-    );
-
-  protected:
-    uint8_t _input_padding_value(void) const;
-
-    void _pack_params(
-      void *buffer,
-      const void *weights,
-      unsigned int weight_row_stride,
-      unsigned int weight_col_stride,
-      const void *biases=nullptr
-    ) const;
-
-    template <nck::ActivationFunction Activation>
-    void execute_tile(
-      int n_channels,
-      const void* packed_params,
-      const uint8_t* inptr,
-      unsigned int in_row_stride,
-      unsigned int in_col_stride,
-      uint8_t* outptr,
-      unsigned int out_row_stride,
-      unsigned int out_col_stride
-    );
-
-    template <nck::ActivationFunction Activation>
-    void execute_tile(
-      int n_channels,
-      const void* packed_params,
-      const uint8_t* inptrs[Base::inner_tile_rows][Base::inner_tile_cols],
-      uint8_t* outptrs[Base::output_tile_rows][Base::output_tile_cols]
-    );
-
-  private:
-    // Quantization parameters
-    const qasymm8::QAsymm8Params _weights_quant, _inputs_quant, _output_quant;
-    const qasymm8::QAsymm8RescaleParams rescale_parameters;
-};
-
-template <
-  unsigned int OutputTileRows, unsigned int OutputTileCols,
-  unsigned int KernelRows, unsigned int KernelCols,
-  unsigned int StrideRows, unsigned int StrideCols
->
-class QSymm8HybridPerChannelDepthwiseConvolution : public DepthwiseConvolutionBase<
-  OutputTileRows, OutputTileCols,
-  KernelRows, KernelCols,
-  StrideRows, StrideCols,
-  uint8_t, int32_t, uint8_t,
-  QSymm8HybridPerChannelDepthwiseConvolution<OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols>
->
-{
-  using Base = DepthwiseConvolutionBase<
-    OutputTileRows, OutputTileCols,
-    KernelRows, KernelCols,
-    StrideRows, StrideCols,
-    uint8_t, int32_t, uint8_t,
-    QSymm8HybridPerChannelDepthwiseConvolution<OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols>
-  >;
-  friend Base;
-  using InputType = typename Base::InputType;
-  using OutputType = typename Base::OutputType;
-
-  public:
-  QSymm8HybridPerChannelDepthwiseConvolution(
-      int n_batches, int n_input_rows, int n_input_cols, int n_channels,
-      nck::ActivationFunction activation,
-      const qsymm8::QSymm8PerChannelParams& weight_quantisation,
-      const qasymm8::QAsymm8Params& input_quantisation,
-      const qasymm8::QAsymm8Params& output_quantisation,
-      unsigned int padding_top,
-      unsigned int padding_left,
-      unsigned int padding_bottom,
-      unsigned int padding_right
-    );
-
-  QSymm8HybridPerChannelDepthwiseConvolution(
-      int n_batches, int n_input_rows, int n_input_cols, int n_channels,
-      nck::ActivationFunction activation,
-      const qsymm8::QSymm8PerChannelParams& weight_quantisation,
-      const qasymm8::QAsymm8Params& input_quantisation,
-      const qasymm8::QAsymm8Params& output_quantisation,
-      const qsymm8::QSymm8PerChannelRescaleParams& rescale_parameters,
-      unsigned int padding_top,
-      unsigned int padding_left,
-      unsigned int padding_bottom,
-      unsigned int padding_right
-    );
-
-  size_t get_packed_params_size(void) const override
-  {
-      return this->n_channels() * (sizeof(int8_t)*KernelRows*KernelCols + 3*sizeof(int32_t));
-
-  }
-
-  protected:
-    uint8_t _input_padding_value(void) const;
-
-    void _pack_params(
-      void *buffer,
-      const void *weights,
-      unsigned int weight_row_stride,
-      unsigned int weight_col_stride,
-      const void *biases=nullptr
-    ) const;
-
-    template <nck::ActivationFunction Activation>
-    void execute_tile(
-      int n_channels,
-      const void* packed_params,
-      const uint8_t* inptr,
-      unsigned int in_row_stride,
-      unsigned int in_col_stride,
-      uint8_t* outptr,
-      unsigned int out_row_stride,
-      unsigned int out_col_stride
-    );
-
-    template <nck::ActivationFunction Activation>
-    void execute_tile(
-      int n_channels,
-      const void* packed_params,
-      const uint8_t* inptrs[Base::inner_tile_rows][Base::inner_tile_cols],
-      uint8_t* outptrs[Base::output_tile_rows][Base::output_tile_cols]
-    );
-
-  private:
-    // Quantization parameters
-    const qsymm8::QSymm8PerChannelParams _weights_quant;
-    const qasymm8::QAsymm8Params _input_quant, _output_quant;
-    const qsymm8::QSymm8PerChannelRescaleParams _rescale_parameters;
-};
-
-}  // namespace depthwise
diff --git a/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_quantized_dilated.hpp b/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_quantized_dilated.hpp
deleted file mode 100644
index cf1c6f581f..0000000000
--- a/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_quantized_dilated.hpp
+++ /dev/null
@@ -1,88 +0,0 @@
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#pragma once
-#include "depthwise_dilated.hpp"
-#include "depthwise_quantized.hpp"
-
-namespace depthwise {
-
-template <unsigned int OutputTileRows, unsigned int OutputTileCols,
-          unsigned int KernelRows, unsigned int KernelCols,
-          unsigned int StrideRows, unsigned int StrideCols>
-class QAsymm8DilatedDepthwiseConvolution
-    : public DilatedDepthwiseConvolution<
-          OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows,
-          StrideCols, uint8_t, int32_t, uint8_t> {
-public:
-  /** Create a new dilated depthwise convolution engine.
-   */
-  QAsymm8DilatedDepthwiseConvolution(
-      int n_batches, int n_input_rows, int n_input_cols, int n_channels,
-      int dilation_factor, nck::ActivationFunction activation,
-      const qasymm8::QAsymm8Params &weight_quantisation,
-      const qasymm8::QAsymm8Params &input_quantisation,
-      const qasymm8::QAsymm8Params &output_quantisation,
-      unsigned int padding_top, unsigned int padding_left,
-      unsigned int padding_bottom, unsigned int padding_right);
-
-  /** Create a new dilated depthwise convolution engine.
-   */
-  QAsymm8DilatedDepthwiseConvolution(
-      int n_batches, int n_input_rows, int n_input_cols, int n_channels,
-      int dilation_factor, int n_output_rows, int n_output_cols,
-      nck::ActivationFunction activation,
-      const qasymm8::QAsymm8Params &weight_quantisation,
-      const qasymm8::QAsymm8Params &input_quantisation,
-      const qasymm8::QAsymm8Params &output_quantisation,
-      unsigned int padding_top, unsigned int padding_left,
-      unsigned int padding_bottom, unsigned int padding_right);
-
-  /** Create a new dilated depthwise convolution engine.
-   */
-  QAsymm8DilatedDepthwiseConvolution(
-      int n_batches, int n_input_rows, int n_input_cols, int n_channels,
-      int dilation_factor, nck::ActivationFunction activation,
-      const qasymm8::QAsymm8Params &weight_quantisation,
-      const qasymm8::QAsymm8Params &input_quantisation,
-      const qasymm8::QAsymm8Params &output_quantisation,
-      const qasymm8::QAsymm8RescaleParams &rescale_parameters,
-      unsigned int padding_top, unsigned int padding_left,
-      unsigned int padding_bottom, unsigned int padding_right);
-
-  /** Create a new dilated depthwise convolution engine.
-   */
-  QAsymm8DilatedDepthwiseConvolution(
-      int n_batches, int n_input_rows, int n_input_cols, int n_channels,
-      int dilation_factor, int n_output_rows, int n_output_cols,
-      nck::ActivationFunction activation,
-      const qasymm8::QAsymm8Params &weight_quantisation,
-      const qasymm8::QAsymm8Params &input_quantisation,
-      const qasymm8::QAsymm8Params &output_quantisation,
-      const qasymm8::QAsymm8RescaleParams& rescale_parameters,
-      unsigned int padding_top, unsigned int padding_left,
-      unsigned int padding_bottom, unsigned int padding_right);
-};
-
-}  // namespace depthwise
diff --git a/arm_compute/core/NEON/kernels/convolution/winograd/winograd.hpp b/arm_compute/core/NEON/kernels/convolution/winograd/winograd.hpp
deleted file mode 100644
index bc0d9d4296..0000000000
--- a/arm_compute/core/NEON/kernels/convolution/winograd/winograd.hpp
+++ /dev/null
@@ -1,621 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#pragma once
-
-#include "arm_compute/core/NEON/kernels/assembly/arm_gemm.hpp"
-
-#include <cstddef>
-#include <utility>
-
-namespace winograd
-{
-
-class ITransform
-{
-  public:
-    virtual ~ITransform() = default;
-
-    /**
-     * Get the working space required to perform the transformation.
-     *
-     * Note, the working space is only required when performing the
-     * transformation - hence it can be reused whenever the transformation is
-     * not running.
-     *
-     * @param nthreads The greatest number of threads that will be used to execute the transform.
-     * @return Size of working space required in bytes.
-     */
-    virtual size_t get_working_space_size(unsigned int nthreads=1) const = 0;
-
-    /**
-     * Set the working space to be used by the transformation.
-     *
-     * Note, the working space is only required when performing the
-     * transformation - hence it can be reused whenever the transformation is
-     * not running.
-     *
-     * @param Pointer to the working space.
-     */
-    virtual void set_working_space(void *buffer) = 0;
-
-    /**
-     * Get the window of work a given operator can perform.
-     */
-    virtual unsigned int get_window() const = 0;
-
-    /**
-     * Perform work upon a window of the transform.
-     */
-    virtual void run(unsigned int start, unsigned int stop, unsigned int threadid=0) = 0;
-};
-
-class IInputTransform : public ITransform
-{
-  public:
-    virtual ~IInputTransform() = default;
-
-    /**
-     * Set the pointer to the (NHWC-ordered) tensor to be transformed.
-     */
-    virtual void set_input_tensor(const void *input) = 0;
-
-    /**
-     * Set the pointer to the (NHWC-ordered) tensor to be transformed.
-     * @param col_stride Stride between columns of the tensor, measured in elements (not bytes).
-     */
-    virtual void set_input_tensor(const void *input, int col_stride) = 0;
-
-    /**
-     * Set the pointer to the (NHWC-ordered) tensor to be transformed.
-     * @param row_stride Stride between rows of the tensor, measured in elements (not bytes).
-     * @param col_stride Stride between columns of the tensor, measured in elements (not bytes).
-     */
-    virtual void set_input_tensor(const void *input, int row_stride, int col_stride) = 0;
-
-    /**
-     * Set the pointer to the (NHWC-ordered) tensor to be transformed.
-     * @param batch_stride Stride between batches of the tensor, measured in elements (not bytes).
-     * @param row_stride Stride between rows of the tensor, measured in elements (not bytes).
-     * @param col_stride Stride between columns of the tensor, measured in elements (not bytes).
-     */
-    virtual void set_input_tensor(const void *input, int batch_stride, int row_stride, int col_stride) = 0;
-
-    /**
-     * Set pointers to the matrices written by the transform.
-     * @param matrices Pointer to the start of the first matrix representing the transformed input.
-     * @param inter_matrix_stride Stride (in elements) between matrices.
-     * @param matrix_row_stride Stride (in elements) between the rows within a single matrix.
-     */
-    virtual void set_output_matrices(void *matrices, int inter_matrix_stride, int matrix_row_stride) = 0;
-};
-
-class IOutputTransform : public ITransform
-{
-  public:
-    virtual ~IOutputTransform() = default;
-
-    /**
-     * Set pointers to the matrices written by the transform.
-     * @param matrices Pointer to the start of the first matrix representing the input to the transform.
-     * @param inter_matrix_stride Stride (in elements) between matrices.
-     * @param matrix_row_stride Stride (in elements) between the rows within a single matrix.
-     */
-    virtual void set_input_matrices(const void *matrices, int inter_matrix_stride, int matrix_row_stride) = 0;
-
-    /**
-     * Set pointer to the bias tensor (can be ignored or called with nullptr for no bias.
-     */
-    virtual void set_bias(const void *bias=nullptr) = 0;
-
-    /**
-     * Set pointer to the output tensor produced by the transform.
-     */
-    virtual void set_output_tensor(void *output) = 0;
-
-    /**
-     * Set pointer to the output tensor produced by the transform.
-     * @param col_stride Stride between columns of the tensor, measured in elements (not bytes).
-     */
-    virtual void set_output_tensor(void *output, int col_stride) = 0;
-
-    /**
-     * Set pointer to the output tensor produced by the transform.
-     * @param row_stride Stride between rows of the tensor, measured in elements (not bytes).
-     * @param col_stride Stride between columns of the tensor, measured in elements (not bytes).
-     */
-    virtual void set_output_tensor(void *output, int row_stride, int col_stride) = 0;
-
-    /**
-     * Set pointer to the output tensor produced by the transform.
-     * @param batch_stride Stride between batches of the tensor, measured in elements (not bytes).
-     * @param row_stride Stride between rows of the tensor, measured in elements (not bytes).
-     * @param col_stride Stride between columns of the tensor, measured in elements (not bytes).
-     */
-    virtual void set_output_tensor(void *output, int batch_stride, int row_stride, int col_stride) = 0;
-};
-
-class IWeightTransform : public ITransform
-{
-  public:
-    virtual ~IWeightTransform() = default;
-
-    /** Set pointer to the weight tensor read by the transform. */
-    virtual void set_weight_tensor(const void *weights) = 0;
-
-    /**
-     * Set pointers to the matrices written by the transform.
-     * @param matrices Pointer to the start of the first matrix representing the transformed input.
-     * @param inter_matrix_stride Stride (in elements) between matrices.
-     * @param matrix_row_stride Stride (in elements) between the rows within a single matrix.
-     */
-    virtual void set_output_matrices(void *matrices, int inter_matrix_stride, int matrix_row_stride) = 0;
-};
-
-enum class WinogradRoots
-{
-  Integers,
-};
-
-template <int InnerTileRows, int InnerTileCols, typename TIn, typename TOut, WinogradRoots Roots>
-class InputTransform : public IInputTransform
-{
-  public:
-    /** Create an InputTransform operator fixed on a given problem and set of
-     * pointers.
-     */
-    InputTransform(
-        int kernel_rows,     /**< Number of rows in the kernel */
-        int kernel_cols,     /**< Number of columns in the kernel */
-        int n_batches,       /**< Number of batches in input tensor. */
-        int n_rows,          /**< Number of rows in input tensor. */
-        int n_cols,          /**< Number of columns in input tensor. */
-        int n_channels,      /**< Number of channels in input tensor. */
-        int padding_top,     /**< Padding to apply to the top of the image. */
-        int padding_left,    /**< Padding to apply to the left of the image. */
-        int padding_bottom,  /**< Padding to apply to the bottom of the image. */
-        int padding_right    /**< Padding to apply to the right of the image. */
-    );
-
-    InputTransform(InputTransform&) = delete;
-    InputTransform operator=(InputTransform&) = delete;
-
-    /** Set pointers to the input tensor read by the transform. */
-    void set_input_tensor(const void *input) override;
-    void set_input_tensor(const void *input, int col_stride) override;
-    void set_input_tensor(const void *input, int row_stride, int col_stride) override;
-    void set_input_tensor(const void *input, int batch_stride, int row_stride, int col_stride) override;
-
-    /** Set pointers to the matrices written by the transform. */
-    void set_output_matrices(void *matrices, int iter_matrix_stride, int matrix_row_stride) override;
-
-    /** Get the working space required to perform the transformation. */
-    size_t get_working_space_size(unsigned int nthreads=1) const override;
-    void set_working_space(void *buffer) override;
-
-    /** Get the window of work a given operator can perform. */
-    unsigned int get_window() const override;
-    static constexpr unsigned int WINDOW_BLOCK = 16;  // Base size of window
-
-    /** Perform work upon a window of the input. */
-    void run(unsigned int start, unsigned int stop, unsigned int threadid=0) override;
-
-  protected:
-    const int _n_batches, _n_rows, _n_cols, _n_channels;
-
-  private:
-    void transform_unpadded_tile(
-      unsigned int threadid,
-      int n_channels,
-      TOut *outptr,
-      const TIn *inptr
-    );
-
-    void transform_padded_tile(
-      unsigned int threadid,
-      int n_channels,
-      TOut *outptr,
-      const TIn *inptr,
-      int padding_top,
-      int padding_left,
-      int padding_bottom,
-      int padding_right
-    );
-    
-    /* Tile implementation */
-    static void transform_tile(
-      int n_channels,         /** @param[in] Number of channels in the tensor. */
-      const TIn* inptr_base,  /** @param[in] Pointer to the base of the input tile. */
-      int input_row_stride,   /** @param[in] Stride between rows of the input tensor. */
-      int input_col_stride,   /** @param[in] Stride between columns of the input tensor. */
-      TOut* mptr_base,        /** @param[out] Base pointer to transformed input matrices. */
-      int matrix_stride       /** @param[in] Stride between matrices in the input space. */
-    );
-
-    /** Get the working space for a thread. */
-    void * get_working_space(unsigned int threadid) const;
-
-    const TIn* _inptr;
-    TOut* _outptr;
-
-    const int _overlap_rows, _overlap_cols;
-    const int _padding_top, _padding_left, _padding_bottom, _padding_right;
-    const int _tiles_M, _tiles_N;
-    int _matrix_stride, _matrix_row_stride, _matrix_batch_stride;
-    int _in_col_stride, _in_row_stride, _in_batch_stride;
-
-    const int _working_space_col_stride, _working_space_row_stride;
-    TIn *_working_space;
-};
-
-template <int InnerTileRows, typename TIn, typename TOut, WinogradRoots Roots>
-class InputTransform<InnerTileRows, 1, TIn, TOut, Roots> :
-  public InputTransform<1, InnerTileRows, TIn, TOut, Roots>
-{
-  using Base = InputTransform<1, InnerTileRows, TIn, TOut, Roots>;
-
-  public:
-    InputTransform(
-      int kernel_rows,     /**< Number of rows in the kernel. */
-      int kernel_cols,     /**< Number of columns in the kernel. */
-      int n_batches,       /**< Number of batches in input tensor. */
-      int n_rows,          /**< Number of rows in input tensor. */
-      int n_cols,          /**< Number of columns in input tensor. */
-      int n_channels,      /**< Number of channels in input tensor. */
-      int padding_top,     /**< Padding to apply to the top of the image. */
-      int padding_left,    /**< Padding to apply to the left of the image. */
-      int padding_bottom,  /**< Padding to apply to the bottom of the image. */
-      int padding_right    /**< Padding to apply to the right of the image. */
-    );
-
-    /** Set pointers to the input tensor read by the transform. */
-    void set_input_tensor(const void *input) override;
-    void set_input_tensor(const void *input, int col_stride) override;
-    void set_input_tensor(const void *input, int row_stride, int col_stride) override;
-    void set_input_tensor(const void *input, int batch_stride, int row_stride, int col_stride) override;
-};
-
-template <
-  int KernelRows, int KernelCols,
-  int InnerTileRows, int InnerTileCols,
-  typename TIn, typename TOut,
-  WinogradRoots Roots
->
-class OutputTransform : public IOutputTransform
-{
-  public:
-    OutputTransform(
-      int n_batches,  /**< Number of batches in output tensor. */
-      int n_rows,     /**< Number of rows in output tensor. */
-      int n_cols,     /**< Number of columns in output tensor. */
-      int n_channels, /**< Number of channels in output tensor. */
-      const arm_gemm::Activation &activation
-    );
-
-    OutputTransform(OutputTransform&) = delete;
-    OutputTransform operator=(OutputTransform&) = delete;
-
-    /** Set pointers to the matrices read by the transform. */
-    void set_input_matrices(const void *matrices, int iter_matrix_stride, int matrix_row_stride) override;
-
-    /** Set pointer to the bias tensor (can be ignored or called with nullptr for no bias */
-    void set_bias(const void *bias=nullptr) override;
-
-    /** Set pointers to the output tensor written by the transform. */
-    void set_output_tensor(void *output) override;
-    void set_output_tensor(void *output, int col_stride) override;
-    void set_output_tensor(void *output, int row_stride, int col_stride) override;
-    void set_output_tensor(void *output, int batch_stride, int row_stride, int col_stride) override;
-
-    /** Get the working space required to perform the transformation. */
-    size_t get_working_space_size(unsigned int nthreads=1) const override;
-    void set_working_space(void *buffer) override;
-
-    /** Get the window of work a given operator can perform. */
-    unsigned int get_window() const override;
-    static constexpr unsigned int WINDOW_BLOCK = 16;  // Base size of window
-
-    /** Perform work upon a window of the input. */
-    void run(unsigned int start, unsigned int stop, unsigned int threadid=0) override;
-
-  protected:
-    static constexpr int inner_tile_rows = InnerTileRows;
-    static constexpr int inner_tile_cols = InnerTileCols;
-    static constexpr int output_tile_rows = InnerTileRows - KernelRows + 1;
-    static constexpr int output_tile_cols = InnerTileCols - KernelCols + 1;
-
-    const int _n_batches, _n_rows, _n_cols, _n_channels;
-    const TOut _output_min, _output_max;
-
-  private:
-    void transform_uncropped_tile(
-      unsigned int threadid,
-      int n_channels,
-      TOut *outptr,
-      const TIn *inptr,
-      const TOut *biases
-    );
-
-    void transform_cropped_tile(
-      unsigned int threadid,
-      int n_channels,
-      TOut *outptr,
-      const TIn *inptr,
-      const TOut *biases,
-      int pad_bottom,
-      int pad_right
-    );
-
-    /** Implementation of the tile transformation method. */
-    static void transform_tile(
-      int n_channels,
-      const TIn* matrix_base,
-      int matrix_stride,
-      const TOut* biases,
-      TOut* output,
-      int output_row_stride,
-      int output_col_stride,
-      TOut output_min,
-      TOut output_max
-    );
-
-    /** Get the working space for a thread. */
-    void * get_working_space(unsigned int threadid) const;
-
-    const TIn* _matrix_base;
-    const TOut* _biases;
-    int _matrix_stride, _matrix_row_stride, _matrix_batch_stride;
-    TOut* _outptr;
-    const int _tiles_M, _tiles_N;
-    int _out_col_stride, _out_row_stride, _out_batch_stride;
-
-    const int _working_space_col_stride, _working_space_row_stride;
-    TOut *_working_space;
-};
-
-template <
-  int KernelRows,
-  int InnerTileRows,
-  typename TIn, typename TOut,
-  WinogradRoots Roots
->
-class OutputTransform<KernelRows, 1, InnerTileRows, 1, TIn, TOut, Roots> :
-  public OutputTransform<1, KernelRows, 1, InnerTileRows, TIn, TOut, Roots>
-{
-  using Base = OutputTransform<1, KernelRows, 1, InnerTileRows, TIn, TOut, Roots>;
-
-  public:
-    OutputTransform(
-      int n_batches,  /**< Number of batches in output tensor. */
-      int n_rows,     /**< Number of rows in output tensor. */
-      int n_cols,     /**< Number of columns in output tensor. */
-      int n_channels, /**< Number of channels in output tensor. */
-      const arm_gemm::Activation &activation
-    );
-
-    /** Set pointers to the output tensor written by the transform. */
-    void set_output_tensor(void *output) override;
-    void set_output_tensor(void *output, int col_stride) override;
-    void set_output_tensor(void *output, int row_stride, int col_stride) override;
-    void set_output_tensor(void *output, int batch_stride, int row_stride, int col_stride) override;
-};
-
-template <
-  int KernelRows, int KernelCols,
-  int InnerTileRows, int InnerTileCols,
-  typename TIn, typename TOut,
-  WinogradRoots Roots
->
-class WeightTransform : public IWeightTransform
-{
-  public:
-    WeightTransform(
-      int n_output_channels,  /**< Number of output channels in the kernel. */
-      int n_input_channels    /**< Number of input channels in the kernel. */
-    );
-
-    WeightTransform(WeightTransform&) = delete;
-    WeightTransform operator=(WeightTransform&) = delete;
-
-    /** Set pointer to the weight tensor read by the transform. */
-    void set_weight_tensor(const void *weights) override;
-
-    /** Set pointer to the matrices written by the transform. */
-    void set_output_matrices(void *matrices, int inter_matrix_stride, int matrix_row_stride) override;
-
-    /** Get the working space required to perform the transformation. */
-    size_t get_working_space_size(unsigned int nthreads=1) const override;
-    void set_working_space(void *buffer) override;
-
-    /** Get the window of work a given operator can perform. */
-    unsigned int get_window() const override;
-    static constexpr unsigned int WINDOW_BLOCK = 16;  // Base size of window
-
-    /** Perform work upon a window of the input. */
-    void run(unsigned int start, unsigned int stop, unsigned int threadid=0) override;
-
-  protected:
-    static const int kernel_rows = KernelRows;
-    static const int kernel_cols = KernelCols;
-    static const int inner_tile_rows = InnerTileRows;
-    static const int inner_tile_cols = InnerTileCols;
-
-  private:
-    /** Apply the transform to a tensor. */
-    static void execute(
-      int n_output_channels,
-      int n_input_channels,
-      const TIn* input,
-      TOut* output,
-      int matrix_stride,
-      int matrix_row_stride
-    );
-
-    const int _n_output_channels, _n_input_channels;
-    TOut *_matrices;
-    int _matrix_stride, _matrix_row_stride;
-    const TIn *_weights;
-};
-
-template <int KernelRows, int InnerTileRows, typename TIn, typename TOut, WinogradRoots Roots>
-class WeightTransform<KernelRows, 1, InnerTileRows, 1, TIn, TOut, Roots> :
-  public WeightTransform<1, KernelRows, 1, InnerTileRows, TIn, TOut, Roots>
-{
-  public:
-    using WeightTransform<1, KernelRows, 1, InnerTileRows, TIn, TOut, Roots>::WeightTransform;
-};
-
-template <int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols, WinogradRoots Roots>
-class WinogradGEMM
-{
-  public:
-    // Information about the specific Winograd instance
-    static constexpr int output_tile_rows = OutputTileRows;
-    static constexpr int output_tile_cols = OutputTileCols;
-    static constexpr int kernel_rows = KernelRows;
-    static constexpr int kernel_cols = KernelCols;
-    static constexpr int inner_tile_rows = output_tile_rows + kernel_rows - 1;
-    static constexpr int inner_tile_cols = output_tile_cols + kernel_cols - 1;
-    static constexpr int N_GEMMS = inner_tile_rows * inner_tile_cols;
-
-    /** Transform weights from the spatial to the Winograd domain. */
-    template <typename TIn, typename TOut>
-    using WeightsTransform = WeightTransform<
-      KernelRows, KernelCols, inner_tile_rows, inner_tile_cols,
-      TIn, TOut, Roots
-    >;
-
-    /** Transform input feature maps from the spatial to the Winograd domain.
-     */
-    template <typename TIn, typename TOut>
-    using InputTransform = InputTransform<
-      inner_tile_rows, inner_tile_cols, TIn, TOut, Roots
-    >;
-
-    /** Transform output feature maps from the Winograd to the spatial domain.
-     */
-    template <typename TIn, typename TOut>
-    using OutputTransform = OutputTransform<
-      KernelRows, KernelCols, inner_tile_rows, inner_tile_cols,
-      TIn, TOut, Roots
-    >;
-
-    /** Perform a convolution.
-     */
-    template <typename TOut, typename TIn, typename TInGEMM=TIn, typename TOutGEMM=TOut>
-    class Convolution
-    {
-      public:
-        // Information about the typed Winograd instance
-        typedef TOut OutputType;
-        typedef TOutGEMM GemmOutputType;
-        typedef TInGEMM GemmInputType;
-        typedef TIn InputType;
-
-        /** Get the output shape of a convolution. */
-        static std::pair<unsigned int, unsigned int> get_output_shape(
-            const std::pair<unsigned int, unsigned int> input_shape,
-            bool padding_same);
-
-        /** Get the memory required to store the kernel transformed into the
-         * Winograd domain.
-         */
-        static size_t get_kernel_storage_size(unsigned int n_input_channels,
-                                              unsigned int n_output_channels);
-
-        /** Get the memory required to store the input tensor transformed into
-         * the Winograd domain.
-         */
-        static size_t get_input_storage_size(
-            unsigned int n_batches,  // Number of batches
-            unsigned int n_rows,     // Number of input rows
-            unsigned int n_cols,     // Number of input columns
-            unsigned int n_channels, // Number of input channels
-            bool padding_same);
-
-        /** Get the memory required to store the output tensor in the Winograd
-         * domain.
-         */
-        static size_t get_output_storage_size(
-            unsigned int n_batches, // Number of batches
-            unsigned int n_rows,    // Number of output rows
-            unsigned int n_cols,    // Number of output columns
-            unsigned int n_channels // Number of output channels
-            );
-
-        /** Get the memory required to apply a Winograd operator to some input.
-         */
-        static size_t get_working_space_size(
-            unsigned int n_batches,
-            unsigned int n_rows,            // Number of input rows
-            unsigned int n_cols,            // Number of input columns
-            unsigned int n_input_channels,  // Number of input channels
-            unsigned int n_output_channels, // Number of output channels
-            bool padding_same);
-
-        /* Get the memory required by a single "input" matrix.
-         */
-        static size_t get_input_matrix_size(
-            unsigned int n_batches,  // Number of batches
-            unsigned int n_rows,     // Number of input rows
-            unsigned int n_cols,     // Number of input columns
-            unsigned int n_channels, // Number of input channels
-            bool padding_same);
-
-        static int get_input_matrix_stride(
-            unsigned int n_batches,  // Number of batches
-            unsigned int n_rows,     // Number of input rows
-            unsigned int n_cols,     // Number of input columns
-            unsigned int n_channels, // Number of input channels
-            bool padding_same);
-
-        /* Get the memory required by a single "output" matrix.
-         */
-        static size_t get_output_matrix_size(
-            unsigned int n_batches, // Number of batches
-            unsigned int n_rows,    // Number of output rows
-            unsigned int n_cols,    // Number of output columns
-            unsigned int n_channels // Number of output channels
-            );
-
-        static int get_output_matrix_stride(
-            unsigned int n_batches, // Number of batches
-            unsigned int n_rows,    // Number of output rows
-            unsigned int n_cols,    // Number of output columns
-            unsigned int n_channels // Number of output channels
-            );
-
-        /* Get the memory required by a single "kernel" matrix.
-         */
-        static size_t get_kernel_matrix_size(unsigned int n_input_channels,
-                                             unsigned int n_output_channels);
-        static int get_kernel_matrix_stride(unsigned int n_input_channels,
-                                            unsigned int n_output_channels);
-
-        static constexpr int M_BLOCK = 4;   /** Size of block used by GEMM. */
-        static constexpr int N_BLOCK = 16;  /** Size of block used by GEMM. */
-    };
-};
-
-}  // namespace winograd
diff --git a/arm_compute/core/NEON/kernels/convolution/winograd/winograd_layer.hpp b/arm_compute/core/NEON/kernels/convolution/winograd/winograd_layer.hpp
deleted file mode 100644
index ed8fede385..0000000000
--- a/arm_compute/core/NEON/kernels/convolution/winograd/winograd_layer.hpp
+++ /dev/null
@@ -1,207 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#pragma once
-#include "arm_gemm_local.hpp"
-#include "arm_gemm.hpp"
-#include "winograd.hpp"
-
-namespace winograd
-{
-
-
-class IWinogradConvolutionLayer
-{
-  public:
-    virtual ~IWinogradConvolutionLayer() = default;
-
-    virtual unsigned int weight_transform_get_window(void) const = 0;
-    virtual void weight_transform_run(unsigned int start, unsigned int stop) = 0;
-
-    virtual IInputTransform& input_transform(void) = 0; // Expose the input transform
-    virtual IOutputTransform& output_transform(void) = 0;  // Expose the output transform
-    virtual arm_gemm::IGemmCommon *gemm(void) = 0;  // Expose the underlying GEMM
-};
-
-/** Example of how to construct an ACL-like interface.
- *
- * Use `get_weight_storage_size`, `get_input_storage_size` and
- * `get_output_storage_size` to allocate memory for the convolution engine.
- * Then create a `WinogradConvolutionLayer`.
- *
- * Initialise the weights using `weights_transform.run(...)`.
- *
- * For each inference:
- *   1. Transform the inputs to the Winograd domain using `input_transform.run(...)`
- *   2. Perform a number of GEMMs using `gemms.run(...)`
- *   3. Transform the output to the spatial domain using `output_transform.run(...)`
- */
-template <int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols,
-          typename TIn, typename TInGEMM, typename TOutGEMM, typename TOut,
-          WinogradRoots Roots>
-class WinogradConvolutionLayer : public IWinogradConvolutionLayer
-{
-  public:
-    using WinogradBase = winograd::WinogradGEMM<OutputTileRows, OutputTileCols, KernelRows, KernelCols, Roots>;
-    using WeightsTransform = typename WinogradBase::template WeightsTransform<TIn, TInGEMM>;
-    using InputTransform = typename WinogradBase::template InputTransform<TIn, TInGEMM>;
-    using WinogradConv = typename WinogradBase::template Convolution<TOut, TIn, TInGEMM, TOutGEMM>;
-    using OutputTransform = typename WinogradBase::template OutputTransform<TOutGEMM, TOut>;
-
-  private:
-    static constexpr int InnerTileRows = OutputTileRows + KernelRows - 1;
-    static constexpr int InnerTileCols = OutputTileCols + KernelCols - 1;
-    static constexpr int N_GEMMS = InnerTileRows * InnerTileCols;
-
-    const int _n_output_rows, _n_output_cols;
-    const int _kernel_matrix_stride, _kernel_matrix_row_stride;
-    const int _input_matrix_stride, _input_matrix_row_stride;
-    const int _output_matrix_stride, _output_matrix_row_stride;
-    const int _tile_rows, _tile_cols;
-    const int _m, _k, _n;
-
-    WeightsTransform weights_transform;  /** Operator to transform weights to Winograd domain. */
-    InputTransform _input_transform;      /** Operator to transform input to Winograd domain. */
-    const arm_gemm::GemmArgs gemm_args;
-    arm_gemm::UniqueGemmCommon<TInGEMM, TOutGEMM> gemms;    /** Operator to perform multiple GEMMs. */
-    OutputTransform _output_transform;    /** Operator to transform output from Winograd domain. */
-
-  public:
-
-    /** Determine how much memory (in units of TIn) to allocate for the
-     * transformed weights.
-     */
-    static unsigned int get_weight_storage_size(
-      const int n_output_channels,  /** Number of output feature maps. */
-      const int n_input_channels    /** Number of input feature maps. */
-    );
-
-    static unsigned int get_weight_stride(
-      const int n_output_channels,  /** Number of output feature maps. */
-      const int n_input_channels    /** Number of input feature maps. */
-    );
-
-    static unsigned int get_weight_multi_stride(
-      const int n_output_channels,  /** Number of output feature maps. */
-      const int n_input_channels    /** Number of input feature maps. */
-    );
-
-    /** Determine how much memory (in units of TIn) to allocate for the
-     * transformed input.
-     */
-    static unsigned int get_input_storage_size(
-      const int n_batches,     /** Number of batches in the input tensor. */
-      const int n_channels,    /** Number of feature maps in the input tensor. */
-      const int n_rows,        /** Number of rows in each feature map. */
-      const int n_cols,        /** Number of columns in each feature map. */
-      const bool same_padding  /** Use "SAME" padding, otherwise use "VALID". */
-    );
-
-    /** Get the row stride for the A matrix in the Winograd domain. */
-    static unsigned int get_input_stride(
-      const int n_batches,     /** Number of batches in the input tensor. */
-      const int n_channels,    /** Number of feature maps in the input tensor. */
-      const int n_rows,        /** Number of rows in each feature map. */
-      const int n_cols,        /** Number of columns in each feature map. */
-      const bool same_padding  /** Use "SAME" padding, otherwise use "VALID". */
-    );
-
-    /** Get the stride between A matrices in the Winograd domain. */
-    static unsigned int get_input_multi_stride(
-      const int n_batches,     /** Number of batches in the input tensor. */
-      const int n_channels,    /** Number of feature maps in the input tensor. */
-      const int n_rows,        /** Number of rows in each feature map. */
-      const int n_cols,        /** Number of columns in each feature map. */
-      const bool same_padding  /** Use "SAME" padding, otherwise use "VALID". */
-    );
-
-    /** Determine how much memory (in units of TOut) to allocate for the
-     * (Winograd domain) output.
-     */
-    static unsigned int get_output_storage_size(
-      const int n_batches,          /** Number of batches in the output tensor. */
-      const int n_rows,             /** Number of rows in each feature map of the input tensor. */
-      const int n_cols,             /** Number of columns in each feature map of the input tensor. */
-      const int n_output_channels,  /** Number of feature maps in the output tensor. */
-      const bool same_padding       /** Use "SAME" padding, otherwise use "VALID". */
-    );
-
-    static unsigned int get_output_stride(
-      const int n_batches,          /** Number of batches in the output tensor. */
-      const int n_rows,             /** Number of rows in each feature map of the input tensor. */
-      const int n_cols,             /** Number of columns in each feature map of the input tensor. */
-      const int n_output_channels,  /** Number of feature maps in the output tensor. */
-      const bool same_padding       /** Use "SAME" padding, otherwise use "VALID". */
-    );
-
-    static unsigned int get_output_multi_stride(
-      const int n_batches,          /** Number of batches in the output tensor. */
-      const int n_rows,             /** Number of rows in each feature map of the input tensor. */
-      const int n_cols,             /** Number of columns in each feature map of the input tensor. */
-      const int n_output_channels,  /** Number of feature maps in the output tensor. */
-      const bool same_padding       /** Use "SAME" padding, otherwise use "VALID". */
-    );
-
-    /** Get the shape (rows, cols) of a feature map of the output tensor. */
-    static std::pair<int, int> get_output_feature_map_shape(
-      const int n_input_rows,  /** Number of rows in the input feature map. */
-      const int n_input_cols,  /** Number of columns in the input feature map. */
-      const bool same_padding  /** Use "SAME" padding, otherwise use "VALID". */
-    );
-
-    /** Create a new Winograd convolution layer.
-     */
-    WinogradConvolutionLayer(
-      const arm_gemm::CPUInfo &cpuinfo,       /** Describes CPU properties. */
-      const int n_threads,          /** Maximum number of threads used to execute the convolution. */
-      const int n_batches,          /** Number of batches in the input and output tensors. */
-      const int n_input_channels,   /** Number of feature maps in a batch of the input tensor. */
-      const int n_input_rows,       /** Number of rows in a feature map of the input tensor. */
-      const int n_input_cols,       /** Number of columns in a feature map of the input tensor. */
-      const int n_output_channels,  /** Number of feature maps in the output tensor. */
-      const bool same_padding,      /** Use "SAME" padding, otherwise use "VALID". */
-      const arm_gemm::Activation &activation,
-      const TIn* const weights,     /** Pointer to weight tensor in spatial domain. Must be ordered as "Height x Rows x Input Feature Maps x Output Feature Maps. */
-      TInGEMM* const weights_storage,  /** Pointer to storage for weight tensor in the Winograd domain. Must be at least the size returned by `get_weight_storage_size`. */
-      const TIn* const input,       /** Pointer to NHWC ordered input tensor, in the spatial domain. */
-      TInGEMM* const winograd_input,    /** Pointer to working space for the input tensor in the Winograd domain. Must be at least the size returned by `get_input_storage_size`. */
-      const TOut* const biases,     /** Pointer to biases vector. Pass nullptr if no bias is provided. */
-      TOut* const output,           /** Pointer to NHWC ordered output tensor, in the spatial domain. */
-      TOutGEMM* const winograd_output,  /** Pointer to working space for the output tensor in the Winograd domain. Must be at least the size returned by `get_output_storage_size`. */
-      const bool pretranspose_B=true,         /** Hint that the B matrix can be pretransposed. */
-      arm_gemm::GemmConfig *gemm_cfg=nullptr  /** Pointer to GEMM configuration. */
-    );
-
-    /* Utility methods for interacting with the layer. */
-    unsigned int weight_transform_get_window(void) const;
-    void weight_transform_run(const unsigned int start, const unsigned int stop);
-
-    IInputTransform& input_transform(void);
-    IOutputTransform& output_transform(void);
-
-    /* Get a pointer to the GEMM underlying the Winograd transform. */
-    arm_gemm::IGemmCommon *gemm(void);
-};
-
-}
diff --git a/arm_compute/core/NEON/kernels/detail/NEActivationFunctionDetail.h b/arm_compute/core/NEON/kernels/detail/NEActivationFunctionDetail.h
deleted file mode 100644
index 4861559695..0000000000
--- a/arm_compute/core/NEON/kernels/detail/NEActivationFunctionDetail.h
+++ /dev/null
@@ -1,247 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_DETAIL_NEACTIVATION_FUNCTION_DETAIL_H
-#define ARM_COMPUTE_DETAIL_NEACTIVATION_FUNCTION_DETAIL_H
-
-#include "arm_compute/core/NEON/wrapper/wrapper.h"
-
-namespace arm_compute
-{
-namespace detail
-{
-/** Dummy activation object */
-template <typename T, int S>
-struct dummy
-{
-    /** NEON vector type. */
-    using ExactType = typename wrapper::traits::neon_vector<T, S>::type;
-
-    /** Construct a dummy activation object.
-     *
-     * @param[in] act_info Activation layer information.
-     */
-    explicit dummy(ActivationLayerInfo act_info)
-    {
-        ARM_COMPUTE_UNUSED(act_info);
-    }
-    /** Run activation function.
-     *
-     * @param[in] vval Vector of values.
-     */
-    void operator()(ExactType &vval)
-    {
-        ARM_COMPUTE_UNUSED(vval);
-    }
-};
-/** Linear activation object */
-template <typename T, int S>
-struct linear
-{
-    /** NEON vector type. */
-    using ExactType = typename wrapper::traits::neon_vector<T, S>::type;
-    /** NEON vector tag type. */
-    using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
-
-    /** Construct a Linear activation object.
-     *
-     * @param[in] act_info Activation layer information.
-     */
-    explicit linear(ActivationLayerInfo act_info)
-        : valpha(wrapper::vdup_n(static_cast<T>(act_info.a()), ExactTagType{})),
-          vbeta(wrapper::vdup_n(static_cast<T>(act_info.b()), ExactTagType{}))
-    {
-    }
-
-    /** Run activation function.
-     *
-     * @param[in] vval Vector of values.
-     */
-    void operator()(ExactType &vval)
-    {
-        vval = wrapper::vmla(vval, valpha, vbeta);
-    }
-
-    /** Vector of alphas. */
-    const ExactType valpha;
-    /** Vector of betas. */
-    const ExactType vbeta;
-};
-/** Square activation object */
-template <typename T, int S>
-struct square
-{
-    /** NEON vector type. */
-    using ExactType = typename wrapper::traits::neon_vector<T, S>::type;
-    /** NEON vector tag type. */
-    using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
-
-    /** Construct a Square activation object.
-     *
-     * @param[in] act_info Activation layer information.
-     */
-    explicit square(ActivationLayerInfo act_info)
-    {
-        ARM_COMPUTE_UNUSED(act_info);
-    }
-
-    /** Run activation function.
-     *
-     * @param[in] vval Vector of values.
-     */
-    void operator()(ExactType &vval)
-    {
-        vval = wrapper::vmul(vval, vval);
-    }
-};
-/** Logistic activation object */
-template <typename T, int S>
-struct logistic
-{
-    /** NEON vector type. */
-    using ExactType = typename wrapper::traits::neon_vector<T, S>::type;
-    /** NEON vector tag type. */
-    using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
-
-    /** Construct a Logistic activation object.
-     *
-     * @param[in] act_info Activation layer information.
-     */
-    explicit logistic(ActivationLayerInfo act_info)
-        : vone(wrapper::vdup_n(static_cast<T>(1.f), ExactTagType{}))
-    {
-        ARM_COMPUTE_UNUSED(act_info);
-    }
-
-    /** Run activation function.
-     *
-     * @param[in] vval Vector of values.
-     */
-    void operator()(ExactType &vval)
-    {
-        vval = wrapper::vinv(wrapper::vadd(vone, wrapper::vexpq(wrapper::vneg(vval))));
-    }
-
-    /** Vector of ones. */
-    const ExactType vone;
-};
-/** RELU activation object */
-template <typename T, int S>
-struct relu
-{
-    /** NEON vector type. */
-    using ExactType = typename wrapper::traits::neon_vector<T, S>::type;
-    /** NEON vector tag type. */
-    using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
-
-    /** Construct a RELU activation object.
-     *
-     * @param[in] act_info Activation layer information.
-     */
-    explicit relu(ActivationLayerInfo act_info)
-        : vzero(wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{}))
-    {
-        ARM_COMPUTE_UNUSED(act_info);
-    }
-
-    /** Run activation function.
-     *
-     * @param[in] vval Vector of values.
-     */
-    void operator()(ExactType &vval)
-    {
-        vval = wrapper::vmax(vzero, vval);
-    }
-
-    /** Vector of zeroes. */
-    const ExactType vzero;
-};
-/** Bounded RELU activation object */
-template <typename T, int S>
-struct brelu
-{
-    /** NEON vector type. */
-    using ExactType = typename wrapper::traits::neon_vector<T, S>::type;
-    /** NEON vector tag type. */
-    using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
-
-    /** Construct a bounded RELU activation object.
-     *
-     * @param[in] act_info Activation layer information.
-     */
-    explicit brelu(ActivationLayerInfo act_info)
-        : vzero(wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{})),
-          valpha(wrapper::vdup_n(static_cast<T>(act_info.a()), ExactTagType{}))
-    {
-    }
-
-    /** Run activation function.
-     *
-     * @param[in] vval Vector of values.
-     */
-    void operator()(ExactType &vval)
-    {
-        vval = wrapper::vmin(valpha, wrapper::vmax(vzero, vval));
-    }
-
-    /** Vector of zeroes. */
-    const ExactType vzero;
-    /** Vector of alphas. */
-    const ExactType valpha;
-};
-/** Lower-Upper Bounded RELU activation object */
-template <typename T, int S>
-struct lubrelu
-{
-    /** NEON vector type. */
-    using ExactType = typename wrapper::traits::neon_vector<T, S>::type;
-    /** NEON vector tag type. */
-    using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
-
-    /** Construct a lower-upper bounded RELU activation object.
-     *
-     * @param[in] act_info Activation layer information.
-     */
-    explicit lubrelu(ActivationLayerInfo act_info)
-        : valpha(wrapper::vdup_n(static_cast<T>(act_info.a()), ExactTagType{})),
-          vbeta(wrapper::vdup_n(static_cast<T>(act_info.b()), ExactTagType{}))
-    {
-    }
-
-    /** Run activation function.
-     *
-     * @param[in] vval Vector of values.
-     */
-    void operator()(ExactType &vval)
-    {
-        vval = wrapper::vmin(valpha, wrapper::vmax(vbeta, vval));
-    }
-
-    /** Vector of alphas. */
-    const ExactType valpha;
-    /** Vector of betas. */
-    const ExactType vbeta;
-};
-} // namespace detail
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_DETAIL_NEACTIVATION_FUNCTION_DETAIL_H */
diff --git a/arm_compute/core/NEON/kernels/detail/NEDirectConvolution3x3.h b/arm_compute/core/NEON/kernels/detail/NEDirectConvolution3x3.h
deleted file mode 100644
index d756a9a98f..0000000000
--- a/arm_compute/core/NEON/kernels/detail/NEDirectConvolution3x3.h
+++ /dev/null
@@ -1,170 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef ARM_COMPUTE_NECONVOLUTIONKERNEL3x3_H
-#define ARM_COMPUTE_NECONVOLUTIONKERNEL3x3_H
-
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace detail
-{
-inline float32x4x3_t load_matrix_row(const float *ptr)
-{
-    const float32x4x3_t r =
-    {
-        {
-            vld1q_dup_f32(ptr),
-            vld1q_dup_f32(1 + ptr),
-            vld1q_dup_f32(2 + ptr)
-        }
-    };
-    return r;
-}
-
-template <unsigned int stridex>
-float32x4x2_t convolve_3x3(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2);
-
-template <>
-inline float32x4x2_t convolve_3x3<1>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2)
-{
-    const float32x4x3_t vtop =
-    {
-        {
-            vld1q_f32(in_top),
-            vld1q_f32(in_top + 4),
-            vld1q_f32(in_top + 8)
-        }
-    };
-    const float32x4x3_t vmid =
-    {
-        {
-            vld1q_f32(in_mid),
-            vld1q_f32(in_mid + 4),
-            vld1q_f32(in_mid + 8)
-        }
-    };
-    const float32x4x3_t vlow =
-    {
-        {
-            vld1q_f32(in_low),
-            vld1q_f32(in_low + 4),
-            vld1q_f32(in_low + 8)
-        }
-    };
-    float32x4x2_t out =
-    {
-        {
-            vmulq_f32(vtop.val[0], m0.val[0]),
-            vmulq_f32(vtop.val[1], m0.val[0])
-        }
-    };
-    out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 1), m0.val[1]);
-    out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 2), m0.val[2]);
-
-    out.val[0] = vmlaq_f32(out.val[0], vmid.val[0], m1.val[0]);
-    out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vmid.val[0], vmid.val[1], 1), m1.val[1]);
-    out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vmid.val[0], vmid.val[1], 2), m1.val[2]);
-
-    out.val[0] = vmlaq_f32(out.val[0], vlow.val[0], m2.val[0]);
-    out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vlow.val[0], vlow.val[1], 1), m2.val[1]);
-    out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vlow.val[0], vlow.val[1], 2), m2.val[2]);
-
-    out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vtop.val[1], vtop.val[2], 1), m0.val[1]);
-    out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vtop.val[1], vtop.val[2], 2), m0.val[2]);
-
-    out.val[1] = vmlaq_f32(out.val[1], vmid.val[1], m1.val[0]);
-    out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vmid.val[1], vmid.val[2], 1), m1.val[1]);
-    out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vmid.val[1], vmid.val[2], 2), m1.val[2]);
-
-    out.val[1] = vmlaq_f32(out.val[1], vlow.val[1], m2.val[0]);
-    out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vlow.val[1], vlow.val[2], 1), m2.val[1]);
-    out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vlow.val[1], vlow.val[2], 2), m2.val[2]);
-    return out;
-}
-
-template <>
-inline float32x4x2_t convolve_3x3<2>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2)
-{
-    float32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2);
-    out.val[0]        = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 2), out.val[0], 1);
-    out.val[0]        = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 0), out.val[0], 2);
-    out.val[0]        = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 2), out.val[0], 3);
-    return out;
-}
-
-template <>
-inline float32x4x2_t convolve_3x3<3>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2)
-{
-    float32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2);
-    out.val[0]        = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 3), out.val[0], 1);
-    return out;
-}
-
-template <unsigned int stridex>
-void store_results(float *buffer, const float32x4x2_t &values);
-
-template <>
-void store_results<1>(float *buffer, const float32x4x2_t &values)
-{
-    vst1q_f32(buffer, values.val[0]);
-    vst1q_f32(buffer + 4, values.val[1]);
-}
-
-template <>
-void store_results<2>(float *buffer, const float32x4x2_t &values)
-{
-    vst1q_f32(buffer, values.val[0]);
-}
-
-template <>
-void store_results<3>(float *buffer, const float32x4x2_t &values)
-{
-    vst1_f32(buffer, vget_low_f32(values.val[0]));
-}
-
-template <unsigned int stridex>
-int get_input_num_elems_processed(unsigned int num_elems_written_per_iteration);
-
-template <>
-int get_input_num_elems_processed<1>(unsigned int num_elems_written_per_iteration)
-{
-    return num_elems_written_per_iteration;
-}
-
-template <>
-int get_input_num_elems_processed<2>(unsigned int num_elems_written_per_iteration)
-{
-    return num_elems_written_per_iteration << 1;
-}
-
-template <>
-int get_input_num_elems_processed<3>(unsigned int num_elems_written_per_iteration)
-{
-    return num_elems_written_per_iteration * 3;
-}
-}
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NECONVOLUTIONKERNEL3x3_H */
-\ No newline at end of file
diff --git a/arm_compute/core/NEON/kernels/detail/NEDirectConvolutionDetail.h b/arm_compute/core/NEON/kernels/detail/NEDirectConvolutionDetail.h
deleted file mode 100644
index d4cbc7f4af..0000000000
--- a/arm_compute/core/NEON/kernels/detail/NEDirectConvolutionDetail.h
+++ /dev/null
@@ -1,965 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef ARM_COMPUTE_NEDIRECTCONVOLUTIONDETAIL_H
-#define ARM_COMPUTE_NEDIRECTCONVOLUTIONDETAIL_H
-
-#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/NEON/NEFixedPoint.h"
-#include "arm_compute/core/NEON/wrapper/wrapper.h"
-#include "arm_compute/core/utils/misc/Requires.h"
-
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace detail
-{
-/** Loads a 3x3 matrix as a row  (float).
- *
- * @param[in] ptr            Pointer to a float 3x3 matrix.
- * @param[in] weights_offset (Optional) Weights quantization offset.
- *
- * @return The loaded matrix.
- */
-inline float32x4x3_t load_matrix_row(const float *ptr, int weights_offset = 0)
-{
-    ARM_COMPUTE_UNUSED(weights_offset);
-    const float32x4x3_t r =
-    {
-        {
-            vld1q_dup_f32(ptr),
-            vld1q_dup_f32(1 + ptr),
-            vld1q_dup_f32(2 + ptr)
-        }
-    };
-    return r;
-}
-
-/** Loads a 3x3 matrix as a row (uint8_t/int8_t).
- *
- * @param[in] ptr            Pointer to a uint8_t/int8_t 3x3 matrix.
- * @param[in] weights_offset (Optional) Weights quantization offset.
- *
- * @return The loaded matrix.
- */
-template < typename T, REQUIRES_TA(std::is_same<T, uint8_t>::value || std::is_same<T, int8_t>::value) >
-inline int32x4x3_t load_matrix_row(const T *ptr, int weights_offset = 0)
-{
-    const int32x4_t v_weights_offset = vdupq_n_s32(weights_offset);
-
-    /* ptr is a pointer to a row in a 3x3 matrix, the function returns 3 vectors holding exactly the same value in all lanes:
-       r.val[0] contains the first element, r.val[1] the second element and r.val[2] the third element (in all lanes) */
-    int32x4x3_t r =
-    {
-        {
-            vaddq_s32(v_weights_offset, vdupq_n_s32(*ptr)),
-            vaddq_s32(v_weights_offset, vdupq_n_s32(*(ptr + 1))),
-            vaddq_s32(v_weights_offset, vdupq_n_s32(*(ptr + 2)))
-        }
-    };
-    return r;
-}
-
-/** Stores a float32x4x2_t array into a memory location.
- *
- * @param[in] buffer Pointer to the memory location where the values will be stored.
- * @param[in] values Values that will be stored.
- *
- */
-template <unsigned int stridex>
-void store_results(float *buffer, const float32x4x2_t &values);
-
-template <>
-inline void store_results<1>(float *buffer, const float32x4x2_t &values)
-{
-    vst1q_f32(buffer, values.val[0]);
-    vst1q_f32(buffer + 4, values.val[1]);
-}
-
-template <>
-inline void store_results<2>(float *buffer, const float32x4x2_t &values)
-{
-    vst1q_f32(buffer, values.val[0]);
-}
-
-template <>
-inline void store_results<3>(float *buffer, const float32x4x2_t &values)
-{
-    vst1_f32(buffer, vget_low_f32(values.val[0]));
-}
-
-/** Stores a uint32_t array into a memory location.
- *
- * @param[in] buffer Pointer to the memory location where the values will be stored.
- * @param[in] values Values that will be stored.
- *
- */
-template <unsigned int stridex>
-void store_results(int32_t *buffer, const int32x4x2_t &values);
-
-template <>
-inline void store_results<1>(int32_t *buffer, const int32x4x2_t &values)
-{
-    vst1q_s32(buffer, values.val[0]);
-    vst1q_s32(buffer + 4, values.val[1]);
-}
-
-template <>
-inline void store_results<2>(int32_t *buffer, const int32x4x2_t &values)
-{
-    vst1q_s32(buffer, values.val[0]);
-}
-
-template <>
-inline void store_results<3>(int32_t *buffer, const int32x4x2_t &values)
-{
-    vst1_s32(buffer, vget_low_s32(values.val[0]));
-}
-
-template <unsigned int stridex>
-inline void accumulate_results(float *buffer, const float32x4x2_t &values);
-
-template <>
-inline void accumulate_results<1>(float *buffer, const float32x4x2_t &values)
-{
-    vst1q_f32(buffer, vaddq_f32(vld1q_f32(buffer), values.val[0]));
-    vst1q_f32(buffer + 4, vaddq_f32(vld1q_f32(buffer + 4), values.val[1]));
-}
-
-template <>
-inline void accumulate_results<2>(float *buffer, const float32x4x2_t &values)
-{
-    vst1q_f32(buffer, vaddq_f32(vld1q_f32(buffer), values.val[0]));
-}
-
-template <>
-inline void accumulate_results<3>(float *buffer, const float32x4x2_t &values)
-{
-    vst1_f32(buffer, vadd_f32(vld1_f32(buffer), vget_low_f32(values.val[0])));
-}
-
-template <unsigned int stridex>
-void accumulate_results(int32_t *buffer, const int32x4x2_t &values);
-
-template <>
-inline void accumulate_results<1>(int32_t *buffer, const int32x4x2_t &values)
-{
-    vst1q_s32(buffer, vaddq_s32(vld1q_s32(buffer), values.val[0]));
-    vst1q_s32(buffer + 4, vaddq_s32(vld1q_s32(buffer + 4), values.val[1]));
-}
-
-template <>
-inline void accumulate_results<2>(int32_t *buffer, const int32x4x2_t &values)
-{
-    vst1q_s32(buffer, vaddq_s32(vld1q_s32(buffer), values.val[0]));
-}
-
-template <>
-inline void accumulate_results<3>(int32_t *buffer, const int32x4x2_t &values)
-{
-    vst1_s32(buffer, vadd_s32(vld1_s32(buffer), vget_low_s32(values.val[0])));
-}
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-/** Stores a float16x8x2_t array into a memory location.
- *
- * @param[in] buffer Pointer to the memory location where the values will be stored.
- * @param[in] values Values that will be stored.
- *
- */
-template <unsigned int stridex>
-void store_results(float16_t *buffer, const float16x8x2_t &values);
-
-template <>
-inline void store_results<1>(float16_t *buffer, const float16x8x2_t &values)
-{
-    vst1q_f16(buffer, values.val[0]);
-    vst1q_f16(buffer + 8, values.val[1]);
-}
-
-template <>
-inline void store_results<2>(float16_t *buffer, const float16x8x2_t &values)
-{
-    vst1q_f16(buffer, values.val[0]);
-}
-
-template <>
-inline void store_results<3>(float16_t *buffer, const float16x8x2_t &values)
-{
-    vst1_f16(buffer, vget_low_f16(values.val[0]));
-}
-
-template <unsigned int stridex>
-inline void accumulate_results(float16_t *buffer, const float16x8x2_t &values);
-
-template <>
-inline void accumulate_results<1>(float16_t *buffer, const float16x8x2_t &values)
-{
-    vst1q_f16(buffer, vaddq_f16(vld1q_f16(buffer), values.val[0]));
-    vst1q_f16(buffer + 8, vaddq_f16(vld1q_f16(buffer + 8), values.val[1]));
-}
-
-template <>
-inline void accumulate_results<2>(float16_t *buffer, const float16x8x2_t &values)
-{
-    vst1q_f16(buffer, vaddq_f16(vld1q_f16(buffer), values.val[0]));
-}
-
-template <>
-inline void accumulate_results<3>(float16_t *buffer, const float16x8x2_t &values)
-{
-    vst1_f16(buffer, vadd_f16(vld1_f16(buffer), vget_low_f16(values.val[0])));
-}
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-
-/** Perform a 3x3 convolution for 4 consecutive elements on float32 when dilation.x() or dilation.y() is not 1.
- *
- * @param[in] in_top       Pointer to the first row of the input.
- * @param[in] in_mid       Pointer to the second row of the input.
- * @param[in] in_low       Pointer to the third row of the input.
- * @param[in] m0           First row of the filter.
- * @param[in] m1           Second row of the filter.
- * @param[in] m2           Third row of the filter.
- * @param[in] dilation_x   Dilation, in elements across x.
- * @param[in] input_offset (Optional) Input quantization offset.
- *
- */
-inline float32x4_t single_convolve_3x3_dilation(const float *in_top, const float *in_mid, const float *in_low,
-                                                const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2,
-                                                const size_t dilation_x, int input_offset)
-{
-    ARM_COMPUTE_UNUSED(input_offset);
-
-    const float32x4x3_t vtop =
-    {
-        {
-            vld1q_f32(in_top),
-            vld1q_f32(in_top + dilation_x),
-            vld1q_f32(in_top + 2 * dilation_x)
-        }
-    };
-    const float32x4x3_t vmid =
-    {
-        {
-            vld1q_f32(in_mid),
-            vld1q_f32(in_mid + dilation_x),
-            vld1q_f32(in_mid + 2 * dilation_x)
-        }
-    };
-    const float32x4x3_t vlow =
-    {
-        {
-            vld1q_f32(in_low),
-            vld1q_f32(in_low + dilation_x),
-            vld1q_f32(in_low + 2 * dilation_x)
-        }
-    };
-    float32x4_t out = vmulq_f32(vtop.val[0], m0.val[0]);
-    out             = vmlaq_f32(out, vtop.val[1], m0.val[1]);
-    out             = vmlaq_f32(out, vtop.val[2], m0.val[2]);
-
-    out = vmlaq_f32(out, vmid.val[0], m1.val[0]);
-    out = vmlaq_f32(out, vmid.val[1], m1.val[1]);
-    out = vmlaq_f32(out, vmid.val[2], m1.val[2]);
-
-    out = vmlaq_f32(out, vlow.val[0], m2.val[0]);
-    out = vmlaq_f32(out, vlow.val[1], m2.val[1]);
-    out = vmlaq_f32(out, vlow.val[2], m2.val[2]);
-
-    return out;
-}
-
-/** Perform a 3x3 convolution for 8 consecutive elements on float32 when dilation.x() or dilation.y() is not 1.
- *
- * @param[in] in_top       Pointer to the first row of the input.
- * @param[in] in_mid       Pointer to the second row of the input.
- * @param[in] in_low       Pointer to the third row of the input.
- * @param[in] m0           First row of the filter.
- * @param[in] m1           Second row of the filter.
- * @param[in] m2           Third row of the filter.
- * @param[in] dilation_x   Dilation, in elements across x.
- * @param[in] stridex      Stride value in elements across x.
- * @param[in] input_offset (Optional) Input quantization offset.
- *
- */
-inline float32x4x2_t convolve_3x3_dilation(const float *in_top, const float *in_mid, const float *in_low,
-                                           const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2,
-                                           const size_t dilation_x, unsigned int stridex, int input_offset = 0)
-{
-    ARM_COMPUTE_ERROR_ON(stridex > 3);
-    float32x4x2_t out =
-    {
-        {
-            single_convolve_3x3_dilation(in_top, in_mid, in_low, m0, m1, m2, dilation_x, input_offset),
-            single_convolve_3x3_dilation(in_top + 4, in_mid + 4, in_low + 4, m0, m1, m2, dilation_x, input_offset)
-        }
-    };
-
-    if(stridex == 2)
-    {
-        out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 2), out.val[0], 1);
-        out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 0), out.val[0], 2);
-        out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 2), out.val[0], 3);
-    }
-    else if(stridex == 3)
-    {
-        out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 3), out.val[0], 1);
-    }
-
-    return out;
-}
-
-/** Perform a convolve3x3 on float32.
- *
- * @param[in]  in_top       Pointer to the first row of the input.
- * @param[in]  in_mid       Pointer to the second row of the input.
- * @param[in]  in_low       Pointer to the third row of the input.
- * @param[out] out_ptr      Pointer to the output.
- * @param[in]  m0           First row of the filter.
- * @param[in]  m1           Second row of the filter.
- * @param[in]  m2           Third row of the filter.
- * @param[in]  stridex      Stride value in elements across x.
- * @param[in]  input_offset (Optional) Input quantization offset.
- *
- */
-template <bool accumulate>
-void convolve_3x3(const float *in_top, const float *in_mid, const float *in_low, float *out_ptr,
-                  const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2,
-                  unsigned int stridex, int input_offset = 0);
-
-template <bool accumulate>
-inline void convolve_3x3(const float *in_top, const float *in_mid, const float *in_low, float *out_ptr,
-                         const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2,
-                         unsigned int stridex, int input_offset)
-{
-    ARM_COMPUTE_UNUSED(input_offset);
-    ARM_COMPUTE_ERROR_ON(stridex > 3);
-
-    float32x4x2_t out =
-    {
-        {
-            vdupq_n_f32(0.f),
-            vdupq_n_f32(0.f)
-        }
-    };
-    if(stridex == 2)
-    {
-        const float32x4x2_t vtop     = vld2q_f32(in_top);
-        const float32x4x2_t vmid     = vld2q_f32(in_mid);
-        const float32x4x2_t vlow     = vld2q_f32(in_low);
-        const float32x4_t   vtop_end = vld1q_f32(in_top + 8);
-        const float32x4_t   vmid_end = vld1q_f32(in_mid + 8);
-        const float32x4_t   vlow_end = vld1q_f32(in_low + 8);
-
-        out.val[0] = vmulq_f32(vtop.val[0], m0.val[0]);
-
-        out.val[0] = vmlaq_f32(out.val[0], vtop.val[1], m0.val[1]);
-        out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop_end, 1), m0.val[2]);
-
-        out.val[0] = vmlaq_f32(out.val[0], vmid.val[0], m1.val[0]);
-        out.val[0] = vmlaq_f32(out.val[0], vmid.val[1], m1.val[1]);
-        out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vmid.val[0], vmid_end, 1), m1.val[2]);
-
-        out.val[0] = vmlaq_f32(out.val[0], vlow.val[0], m2.val[0]);
-        out.val[0] = vmlaq_f32(out.val[0], vlow.val[1], m2.val[1]);
-        out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vlow.val[0], vlow_end, 1), m2.val[2]);
-
-        accumulate ? accumulate_results<2>(out_ptr, out) : store_results<2>(out_ptr, out);
-    }
-    else
-    {
-        const float32x4x3_t vtop =
-        {
-            {
-                vld1q_f32(in_top),
-                vld1q_f32(in_top + 4),
-                vld1q_f32(in_top + 8)
-            }
-        };
-        const float32x4x3_t vmid =
-        {
-            {
-                vld1q_f32(in_mid),
-                vld1q_f32(in_mid + 4),
-                vld1q_f32(in_mid + 8)
-            }
-        };
-        const float32x4x3_t vlow =
-        {
-            {
-                vld1q_f32(in_low),
-                vld1q_f32(in_low + 4),
-                vld1q_f32(in_low + 8)
-            }
-        };
-        out.val[0] = vmulq_f32(vtop.val[0], m0.val[0]);
-        out.val[1] = vmulq_f32(vtop.val[1], m0.val[0]);
-
-        out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 1), m0.val[1]);
-        out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 2), m0.val[2]);
-
-        out.val[0] = vmlaq_f32(out.val[0], vmid.val[0], m1.val[0]);
-        out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vmid.val[0], vmid.val[1], 1), m1.val[1]);
-        out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vmid.val[0], vmid.val[1], 2), m1.val[2]);
-
-        out.val[0] = vmlaq_f32(out.val[0], vlow.val[0], m2.val[0]);
-        out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vlow.val[0], vlow.val[1], 1), m2.val[1]);
-        out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vlow.val[0], vlow.val[1], 2), m2.val[2]);
-
-        out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vtop.val[1], vtop.val[2], 1), m0.val[1]);
-        out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vtop.val[1], vtop.val[2], 2), m0.val[2]);
-
-        out.val[1] = vmlaq_f32(out.val[1], vmid.val[1], m1.val[0]);
-        out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vmid.val[1], vmid.val[2], 1), m1.val[1]);
-        out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vmid.val[1], vmid.val[2], 2), m1.val[2]);
-
-        out.val[1] = vmlaq_f32(out.val[1], vlow.val[1], m2.val[0]);
-        out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vlow.val[1], vlow.val[2], 1), m2.val[1]);
-        out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vlow.val[1], vlow.val[2], 2), m2.val[2]);
-
-        if(stridex == 3)
-        {
-            out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 3), out.val[0], 1);
-            accumulate ? accumulate_results<3>(out_ptr, out) : store_results<3>(out_ptr, out);
-        }
-        else
-        {
-            accumulate ? accumulate_results<1>(out_ptr, out) : store_results<1>(out_ptr, out);
-        }
-    }
-}
-
-/** Perform a 3x3 convolution for 4 consecutive 8-bit elements when dilation.x() or dilation.y() is not 1.
- *
- * @param[in] in_top       Pointer to the first row of the input.
- * @param[in] in_mid       Pointer to the second row of the input.
- * @param[in] in_low       Pointer to the third row of the input.
- * @param[in] m0           First row of the filter.
- * @param[in] m1           Second row of the filter.
- * @param[in] m2           Third row of the filter.
- * @param[in] dilation_x   Dilation, in elements across x.
- * @param[in] input_offset Input quantization offset.
- *
- */
-template < typename T, REQUIRES_TA(std::is_same<T, uint8_t>::value || std::is_same<T, int8_t>::value) >
-inline int32x4_t single_convolve_3x3_dilation(const T *in_top, const T *in_mid, const T *in_low,
-                                              const int32x4x3_t &m0, const int32x4x3_t &m1, const int32x4x3_t &m2,
-                                              size_t dilation_x, int32_t input_offset)
-{
-    using VectorType    = typename std::conditional<std::is_same<T, uint8_t>::value, uint8x8x3_t, int8x8x3_t>::type;
-    using OutputTagType = typename wrapper::traits::neon_bitvector_tag_t<int32_t, wrapper::traits::BitWidth::W128>;
-
-    const int32x4_t v_input_offset = wrapper::vdup_n(input_offset, OutputTagType{});
-
-    const VectorType vtop =
-    {
-        {
-            wrapper::vload(in_top),
-            wrapper::vload(in_top + dilation_x),
-            wrapper::vload(in_top + 2 * dilation_x)
-        }
-    };
-    const VectorType vmid =
-    {
-        {
-            wrapper::vload(in_mid),
-            wrapper::vload(in_mid + dilation_x),
-            wrapper::vload(in_mid + 2 * dilation_x)
-        }
-    };
-    const VectorType vlow =
-    {
-        {
-            wrapper::vload(in_low),
-            wrapper::vload(in_low + dilation_x),
-            wrapper::vload(in_low + 2 * dilation_x)
-        }
-    };
-
-    const int32x4x3_t vtop_s32 =
-    {
-        {
-            wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[0])))),
-            wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[1])))),
-            wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[2])))),
-        }
-    };
-    const int32x4x3_t vmid_s32 =
-    {
-        {
-            wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[0])))),
-            wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[1])))),
-            wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[2])))),
-        }
-    };
-    const int32x4x3_t vlow_s32 =
-    {
-        {
-            wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[0])))),
-            wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[1])))),
-            wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[2])))),
-        }
-    };
-
-    int32x4_t out = wrapper::vmul(vtop_s32.val[0], m0.val[0]);
-    out           = wrapper::vmla(out, vtop_s32.val[1], m0.val[1]);
-    out           = wrapper::vmla(out, vtop_s32.val[2], m0.val[2]);
-
-    out = wrapper::vmla(out, vmid_s32.val[0], m1.val[0]);
-    out = wrapper::vmla(out, vmid_s32.val[1], m1.val[1]);
-    out = wrapper::vmla(out, vmid_s32.val[2], m1.val[2]);
-
-    out = wrapper::vmla(out, vlow_s32.val[0], m2.val[0]);
-    out = wrapper::vmla(out, vlow_s32.val[1], m2.val[1]);
-    out = wrapper::vmla(out, vlow_s32.val[2], m2.val[2]);
-
-    return out;
-}
-
-/** Perform a 3x3 convolution for 4 consecutive 8-bit elements when dilation.x() or dilation.y() is not 1.
- *
- * @param[in] in_top       Pointer to the first row of the input.
- * @param[in] in_mid       Pointer to the second row of the input.
- * @param[in] in_low       Pointer to the third row of the input.
- * @param[in] m0           First row of the filter.
- * @param[in] m1           Second row of the filter.
- * @param[in] m2           Third row of the filter.
- * @param[in] dilation_x   Dilation, in elements across x.
- * @param[in] stridex      Stride value in elements across x.
- * @param[in] input_offset Input quantization offset.
- *
- */
-template < typename T, REQUIRES_TA(std::is_same<T, uint8_t>::value || std::is_same<T, int8_t>::value) >
-inline int32x4x2_t convolve_3x3_dilation(const T *in_top, const T *in_mid, const T *in_low, const int32x4x3_t &m0, const int32x4x3_t &m1, const int32x4x3_t &m2,
-                                         const size_t dilation_x, unsigned int stridex, int input_offset)
-{
-    ARM_COMPUTE_ERROR_ON(stridex > 3);
-    int32x4x2_t out =
-    {
-        {
-            single_convolve_3x3_dilation(in_top, in_mid, in_low, m0, m1, m2, dilation_x, input_offset),
-            single_convolve_3x3_dilation(in_top + 4, in_mid + 4, in_low + 4, m0, m1, m2, dilation_x, input_offset)
-        }
-    };
-
-    if(stridex == 2)
-    {
-        out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[0], 2), out.val[0], 1);
-        out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[1], 0), out.val[0], 2);
-        out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[1], 2), out.val[0], 3);
-    }
-    else if(stridex == 3)
-    {
-        out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[0], 3), out.val[0], 1);
-    }
-    return out;
-}
-
-/** Perform a convolve3x3 on 8-bit elements
- *
- * @param[in]  in_top       Pointer to the first row of the input.
- * @param[in]  in_mid       Pointer to the second row of the input.
- * @param[in]  in_low       Pointer to the third row of the input.
- * @param[out] out_ptr      Pointer to the output.
- * @param[in]  m0           First row of the filter.
- * @param[in]  m1           Second row of the filter.
- * @param[in]  m2           Third row of the filter.
- * @param[in]  stridex      Stride value in elements across x.
- * @param[in]  input_offset Input quantization offset.
- *
- */
-template < bool accumulate, typename T1, typename T2, REQUIRES_TA(std::is_same<T1, uint8_t>::value || std::is_same<T1, int8_t>::value) >
-void convolve_3x3(const T1 *in_top, const T1 *in_mid, const T1 *in_low, T2 *out_ptr,
-                  const int32x4x3_t &m0, const int32x4x3_t &m1, const int32x4x3_t &m2,
-                  unsigned int stridex, int32_t input_offset)
-{
-    ARM_COMPUTE_ERROR_ON(stridex > 3);
-    using VectorType    = typename std::conditional<std::is_same<T1, uint8_t>::value, uint8x8x2_t, int8x8x2_t>::type;
-    using OutputTagType = typename wrapper::traits::neon_bitvector_tag_t<int32_t, wrapper::traits::BitWidth::W128>;
-
-    const int32x4_t v_input_offset = wrapper::vdup_n(input_offset, OutputTagType{});
-
-    const VectorType vtop =
-    {
-        {
-            wrapper::vload(in_top),
-            wrapper::vload(in_top + 8)
-        }
-    };
-    const VectorType vmid =
-    {
-        {
-            wrapper::vload(in_mid),
-            wrapper::vload(in_mid + 8)
-        }
-    };
-    const VectorType vlow =
-    {
-        {
-            wrapper::vload(in_low),
-            wrapper::vload(in_low + 8)
-        }
-    };
-
-    const int32x4x3_t vtop_s32 =
-    {
-        {
-            wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[0])))),
-            wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgethigh(wrapper::vmovl(vtop.val[0])))),
-            wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[1])))),
-        }
-    };
-    const int32x4x3_t vmid_s32 =
-    {
-        {
-            wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[0])))),
-            wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgethigh(wrapper::vmovl(vmid.val[0])))),
-            wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[1])))),
-        }
-    };
-    const int32x4x3_t vlow_s32 =
-    {
-        {
-            wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[0])))),
-            wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgethigh(wrapper::vmovl(vlow.val[0])))),
-            wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[1])))),
-        }
-    };
-
-    int32x4x2_t out
-    {
-        {
-            wrapper::vdup_n(static_cast<int32_t>(0), OutputTagType{}),
-            wrapper::vdup_n(static_cast<int32_t>(0), OutputTagType{}),
-        }
-    };
-
-    // 0
-    out.val[0] = wrapper::vmla(out.val[0], vtop_s32.val[0], m0.val[0]);
-    out.val[0] = wrapper::vmla(out.val[0], wrapper::vext_1(vtop_s32.val[0], vtop_s32.val[1]), m0.val[1]);
-    out.val[0] = wrapper::vmla(out.val[0], wrapper::vext_2(vtop_s32.val[0], vtop_s32.val[1]), m0.val[2]);
-
-    out.val[0] = wrapper::vmla(out.val[0], vmid_s32.val[0], m1.val[0]);
-    out.val[0] = wrapper::vmla(out.val[0], wrapper::vext_1(vmid_s32.val[0], vmid_s32.val[1]), m1.val[1]);
-    out.val[0] = wrapper::vmla(out.val[0], wrapper::vext_2(vmid_s32.val[0], vmid_s32.val[1]), m1.val[2]);
-
-    out.val[0] = wrapper::vmla(out.val[0], vlow_s32.val[0], m2.val[0]);
-    out.val[0] = wrapper::vmla(out.val[0], wrapper::vext_1(vlow_s32.val[0], vlow_s32.val[1]), m2.val[1]);
-    out.val[0] = wrapper::vmla(out.val[0], wrapper::vext_2(vlow_s32.val[0], vlow_s32.val[1]), m2.val[2]);
-
-    // 1
-    out.val[1] = wrapper::vmla(out.val[1], vtop_s32.val[1], m0.val[0]);
-    out.val[1] = wrapper::vmla(out.val[1], wrapper::vext_1(vtop_s32.val[1], vtop_s32.val[2]), m0.val[1]);
-    out.val[1] = wrapper::vmla(out.val[1], wrapper::vext_2(vtop_s32.val[1], vtop_s32.val[2]), m0.val[2]);
-
-    out.val[1] = wrapper::vmla(out.val[1], vmid_s32.val[1], m1.val[0]);
-    out.val[1] = wrapper::vmla(out.val[1], wrapper::vext_1(vmid_s32.val[1], vmid_s32.val[2]), m1.val[1]);
-    out.val[1] = wrapper::vmla(out.val[1], wrapper::vext_2(vmid_s32.val[1], vmid_s32.val[2]), m1.val[2]);
-
-    out.val[1] = wrapper::vmla(out.val[1], vlow_s32.val[1], m2.val[0]);
-    out.val[1] = wrapper::vmla(out.val[1], wrapper::vext_1(vlow_s32.val[1], vlow_s32.val[2]), m2.val[1]);
-    out.val[1] = wrapper::vmla(out.val[1], wrapper::vext_2(vlow_s32.val[1], vlow_s32.val[2]), m2.val[2]);
-
-    if(stridex == 1)
-    {
-        accumulate ? accumulate_results<1>(out_ptr, out) : store_results<1>(out_ptr, out);
-    }
-    else if(stridex == 2)
-    {
-        out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[0], 2), out.val[0], 1);
-        out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[1], 0), out.val[0], 2);
-        out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[1], 2), out.val[0], 3);
-
-        accumulate ? accumulate_results<2>(out_ptr, out) : store_results<2>(out_ptr, out);
-    }
-    else if(stridex == 3)
-    {
-        out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[0], 3), out.val[0], 1);
-        accumulate ? accumulate_results<3>(out_ptr, out) : store_results<3>(out_ptr, out);
-    }
-}
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-/** Loads a 3x3 matrix as a row (float16_t).
- *
- * @param[in] ptr Pointer to a float 3x3 matrix.
- *
- * @return The loaded matrix.
- */
-inline float16x8x3_t load_matrix_row(const float16_t *ptr, int weights_offset = 0)
-{
-    ARM_COMPUTE_UNUSED(weights_offset);
-    /* ptr is a pointer to a row in a 3x3 matrix, the function returns 3 vectors holding exactly the same value in all lanes:
-       r.val[0] contains the first element, r.val[1] the second element and r.val[2] the third element (in all lanes) */
-    const float16x8x3_t r =
-    {
-        {
-            vld1q_dup_f16(ptr),
-            vld1q_dup_f16(1 + ptr),
-            vld1q_dup_f16(2 + ptr)
-        }
-    };
-    return r;
-}
-
-/** Perform a 3x3 convolution for 8 consecutive elements on float16 when dilation.x() or dilation.y() is not 1.
- *
- * @param[in] in_top       Pointer to the first row of the input.
- * @param[in] in_mid       Pointer to the second row of the input.
- * @param[in] in_low       Pointer to the third row of the input.
- * @param[in] m0           First row of the filter.
- * @param[in] m1           Second row of the filter.
- * @param[in] m2           Third row of the filter.
- * @param[in] dilation_x   Dilation, in elements across x.
- * @param[in] input_offset (Optional)Input quantization offset.
- *
- */
-inline float16x8_t single_convolve_3x3_dilation(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low,
-                                                const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2,
-                                                const size_t dilation_x, int input_offset = 0)
-{
-    ARM_COMPUTE_UNUSED(input_offset);
-    const float16x8x3_t vtop =
-    {
-        {
-            vld1q_f16(in_top),
-            vld1q_f16(in_top + dilation_x),
-            vld1q_f16(in_top + 2 * dilation_x)
-        }
-    };
-    const float16x8x3_t vmid =
-    {
-        {
-            vld1q_f16(in_mid),
-            vld1q_f16(in_mid + dilation_x),
-            vld1q_f16(in_mid + 2 * dilation_x)
-        }
-    };
-    const float16x8x3_t vlow =
-    {
-        {
-            vld1q_f16(in_low),
-            vld1q_f16(in_low + dilation_x),
-            vld1q_f16(in_low + 2 * dilation_x)
-        }
-    };
-    float16x8_t out = vmulq_f16(vtop.val[0], m0.val[0]);
-    out             = vaddq_f16(out, vmulq_f16(vtop.val[1], m0.val[1]));
-    out             = vaddq_f16(out, vmulq_f16(vtop.val[2], m0.val[2]));
-
-    out = vaddq_f16(out, vmulq_f16(vmid.val[0], m1.val[0]));
-    out = vaddq_f16(out, vmulq_f16(vmid.val[1], m1.val[1]));
-    out = vaddq_f16(out, vmulq_f16(vmid.val[2], m1.val[2]));
-
-    out = vaddq_f16(out, vmulq_f16(vlow.val[0], m2.val[0]));
-    out = vaddq_f16(out, vmulq_f16(vlow.val[1], m2.val[1]));
-    out = vaddq_f16(out, vmulq_f16(vlow.val[2], m2.val[2]));
-
-    return out;
-}
-
-/** Perform a 3x3 convolution for 16 consecutive elements on float16 when dilation.x() or dilation.y() is not 1.
- *
- * @param[in] in_top       Pointer to the first row of the input.
- * @param[in] in_mid       Pointer to the second row of the input.
- * @param[in] in_low       Pointer to the third row of the input.
- * @param[in] m0           First row of the filter.
- * @param[in] m1           Second row of the filter.
- * @param[in] m2           Third row of the filter.
- * @param[in] dilation_x   Dilation, in elements across x.
- * @param[in] stridex      Stride value in elements across x.
- * @param[in] input_offset (Optional) Input quantization offset.
- *
- */
-inline float16x8x2_t convolve_3x3_dilation(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low,
-                                           const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2,
-                                           const size_t dilation_x, unsigned int stridex, int input_offset = 0)
-{
-    float16x8x2_t out =
-    {
-        {
-            single_convolve_3x3_dilation(in_top, in_mid, in_low, m0, m1, m2, dilation_x, input_offset),
-            single_convolve_3x3_dilation(in_top + 8, in_mid + 8, in_low + 8, m0, m1, m2, dilation_x, input_offset)
-        }
-    };
-
-    if(stridex == 2)
-    {
-        out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 2), out.val[0], 1);
-        out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 4), out.val[0], 2);
-        out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 6), out.val[0], 3);
-        out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[1], 0), out.val[0], 4);
-        out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[1], 2), out.val[0], 5);
-        out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[1], 4), out.val[0], 6);
-        out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[1], 6), out.val[0], 7);
-    }
-    else if(stridex == 3)
-    {
-        out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 3), out.val[0], 1);
-        out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 6), out.val[0], 2);
-        out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[1], 1), out.val[0], 3);
-    }
-
-    return out;
-}
-
-/** Perform a convolve3x3 on float16.
- *
- * @param[in]  in_top       Pointer to the first row of the input.
- * @param[in]  in_mid       Pointer to the second row of the input.
- * @param[in]  in_low       Pointer to the third row of the input.
- * @param[out] out_ptr      Pointer to the output.
- * @param[in]  m0           First row of the filter.
- * @param[in]  m1           Second row of the filter.
- * @param[in]  m2           Third row of the filter.
- * @param[in]  stridex      Stride value in elements across x.
- * @param[in]  input_offset (Optional) Input quantization offset.
- *
- */
-template <bool accumulate>
-inline void convolve_3x3(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low, float16_t *out_ptr,
-                         const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2,
-                         unsigned int stridex, int input_offset = 0)
-{
-    ARM_COMPUTE_UNUSED(input_offset);
-
-    float16x8x2_t out =
-    {
-        {
-            vdupq_n_f16(0),
-            vdupq_n_f16(0)
-        }
-    };
-    if(stridex == 2)
-    {
-        const float16x8x2_t vtop     = vld2q_f16(in_top);
-        const float16x8x2_t vmid     = vld2q_f16(in_mid);
-        const float16x8x2_t vlow     = vld2q_f16(in_low);
-        const float16x8_t   vtop_end = vld1q_f16(in_top + 16);
-        const float16x8_t   vmid_end = vld1q_f16(in_mid + 16);
-        const float16x8_t   vlow_end = vld1q_f16(in_low + 16);
-
-        out.val[0] = vmulq_f16(vtop.val[0], m0.val[0]);
-
-        out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vtop.val[1], m0.val[1]));
-        out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vtop.val[0], vtop_end, 1), m0.val[2]));
-
-        out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vmid.val[0], m1.val[0]));
-        out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vmid.val[1], m1.val[1]));
-        out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vmid.val[0], vmid_end, 1), m1.val[2]));
-
-        out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vlow.val[0], m2.val[0]));
-        out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vlow.val[1], m2.val[1]));
-        out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vlow.val[0], vlow_end, 1), m2.val[2]));
-
-        accumulate ? accumulate_results<2>(out_ptr, out) : store_results<2>(out_ptr, out);
-    }
-    else
-    {
-        const float16x8x3_t vtop =
-        {
-            {
-                vld1q_f16(in_top),
-                vld1q_f16(in_top + 8),
-                vld1q_f16(in_top + 16)
-            }
-        };
-        const float16x8x3_t vmid =
-        {
-            {
-                vld1q_f16(in_mid),
-                vld1q_f16(in_mid + 8),
-                vld1q_f16(in_mid + 16)
-            }
-        };
-        const float16x8x3_t vlow =
-        {
-            {
-                vld1q_f16(in_low),
-                vld1q_f16(in_low + 8),
-                vld1q_f16(in_low + 16)
-            }
-        };
-        out.val[0] = vmulq_f16(vtop.val[0], m0.val[0]);
-        out.val[1] = vmulq_f16(vtop.val[1], m0.val[0]);
-
-        out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vtop.val[0], vtop.val[1], 1), m0.val[1]));
-        out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vtop.val[0], vtop.val[1], 2), m0.val[2]));
-        out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vmid.val[0], m1.val[0]));
-        out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vmid.val[0], vmid.val[1], 1), m1.val[1]));
-        out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vmid.val[0], vmid.val[1], 2), m1.val[2]));
-        out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vlow.val[0], m2.val[0]));
-        out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vlow.val[0], vlow.val[1], 1), m2.val[1]));
-        out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vlow.val[0], vlow.val[1], 2), m2.val[2]));
-        out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vextq_f16(vtop.val[1], vtop.val[2], 1), m0.val[1]));
-        out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vextq_f16(vtop.val[1], vtop.val[2], 2), m0.val[2]));
-        out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vmid.val[1], m1.val[0]));
-        out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vextq_f16(vmid.val[1], vmid.val[2], 1), m1.val[1]));
-        out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vextq_f16(vmid.val[1], vmid.val[2], 2), m1.val[2]));
-        out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vlow.val[1], m2.val[0]));
-        out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vextq_f16(vlow.val[1], vlow.val[2], 1), m2.val[1]));
-        out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vextq_f16(vlow.val[1], vlow.val[2], 2), m2.val[2]));
-
-        if(stridex == 3)
-        {
-            out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 3), out.val[0], 1);
-            out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 6), out.val[0], 2);
-            out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[1], 1), out.val[0], 3);
-
-            accumulate ? accumulate_results<3>(out_ptr, out) : store_results<3>(out_ptr, out);
-        }
-        else
-        {
-            accumulate ? accumulate_results<1>(out_ptr, out) : store_results<1>(out_ptr, out);
-        }
-    }
-}
-#endif /** __ARM_FEATURE_FP16_VECTOR_ARITHMETIC **/
-
-/** Get the number of elements processed on 3x3 convolution.
- *
- * @param[in] num_elems_written_per_iteration Number of elements written per iteration on 3x3 convolution.
- * @param[in] stridex                         Stride value in elements across x.
- *
- * @return The number of elements processed.
- */
-inline int get_input_num_elems_processed(unsigned int num_elems_written_per_iteration, unsigned int stridex)
-{
-    switch(stridex)
-    {
-        case 1:
-            return num_elems_written_per_iteration;
-        case 2:
-            return num_elems_written_per_iteration << 1;
-        case 3:
-            return num_elems_written_per_iteration * 3;
-        default:
-            ARM_COMPUTE_ERROR("stridex not supported");
-            return 0;
-    }
-}
-}
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NEDIRECTCONVOLUTIONDETAIL_H */
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/abs.h b/arm_compute/core/NEON/wrapper/intrinsics/abs.h
deleted file mode 100644
index aff18166f5..0000000000
--- a/arm_compute/core/NEON/wrapper/intrinsics/abs.h
+++ /dev/null
@@ -1,75 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_WRAPPER_ABS_H
-#define ARM_COMPUTE_WRAPPER_ABS_H
-
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace wrapper
-{
-#define VABS_IMPL(stype, vtype, prefix, postfix) \
-    inline vtype vabs(const vtype &a)            \
-    {                                            \
-        return prefix##_##postfix(a);            \
-    }
-
-#define VQABS_IMPL(stype, vtype, prefix, postfix) \
-    inline vtype vqabs(const vtype &a)            \
-    {                                             \
-        return prefix##_##postfix(a);             \
-    }
-
-// Absolute: vabs{q}_<type>. Vd[i] = |Va[i]|
-VABS_IMPL(int8x8_t, int8x8_t, vabs, s8)
-VABS_IMPL(int16x4_t, int16x4_t, vabs, s16)
-VABS_IMPL(int32x2_t, int32x2_t, vabs, s32)
-VABS_IMPL(float32x2_t, float32x2_t, vabs, f32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VABS_IMPL(float16x4_t, float16x4_t, vabs, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-VABS_IMPL(int8x16_t, int8x16_t, vabsq, s8)
-VABS_IMPL(int16x8_t, int16x8_t, vabsq, s16)
-VABS_IMPL(int32x4_t, int32x4_t, vabsq, s32)
-VABS_IMPL(float32x4_t, float32x4_t, vabsq, f32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VABS_IMPL(float16x8_t, float16x8_t, vabsq, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-// Saturating absolute: vqabs{q}_<type>. Vd[i] = sat(|Va[i]|)
-VQABS_IMPL(int8x8_t, int8x8_t, vqabs, s8)
-VQABS_IMPL(int16x4_t, int16x4_t, vqabs, s16)
-VQABS_IMPL(int32x2_t, int32x2_t, vqabs, s32)
-
-VQABS_IMPL(int8x16_t, int8x16_t, vqabsq, s8)
-VQABS_IMPL(int16x8_t, int16x8_t, vqabsq, s16)
-VQABS_IMPL(int32x4_t, int32x4_t, vqabsq, s32)
-
-#undef VABS_IMPL
-#undef VQABS_IMPL
-} // namespace wrapper
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_WRAPPER_ABS_H */
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/add.h b/arm_compute/core/NEON/wrapper/intrinsics/add.h
deleted file mode 100644
index 776e136a56..0000000000
--- a/arm_compute/core/NEON/wrapper/intrinsics/add.h
+++ /dev/null
@@ -1,201 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_WRAPPER_ADD_H
-#define ARM_COMPUTE_WRAPPER_ADD_H
-
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace wrapper
-{
-#define VADD_IMPL(stype, vtype, prefix, postfix)      \
-    inline vtype vadd(const vtype &a, const vtype &b) \
-    {                                                 \
-        return prefix##_##postfix(a, b);              \
-    }
-
-VADD_IMPL(uint8x8_t, uint8x8_t, vadd, u8)
-VADD_IMPL(int8x8_t, int8x8_t, vadd, s8)
-VADD_IMPL(uint16x4_t, uint16x4_t, vadd, u16)
-VADD_IMPL(int16x4_t, int16x4_t, vadd, s16)
-VADD_IMPL(uint32x2_t, uint32x2_t, vadd, u32)
-VADD_IMPL(int32x2_t, int32x2_t, vadd, s32)
-VADD_IMPL(uint64x1_t, uint64x1_t, vadd, u64)
-VADD_IMPL(int64x1_t, int64x1_t, vadd, s64)
-VADD_IMPL(float32x2_t, float32x2_t, vadd, f32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VADD_IMPL(float16x4_t, float16x4_t, vadd, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-VADD_IMPL(uint8x16_t, uint8x16_t, vaddq, u8)
-VADD_IMPL(int8x16_t, int8x16_t, vaddq, s8)
-VADD_IMPL(uint16x8_t, uint16x8_t, vaddq, u16)
-VADD_IMPL(int16x8_t, int16x8_t, vaddq, s16)
-VADD_IMPL(uint32x4_t, uint32x4_t, vaddq, u32)
-VADD_IMPL(int32x4_t, int32x4_t, vaddq, s32)
-VADD_IMPL(uint64x2_t, uint64x2_t, vaddq, u64)
-VADD_IMPL(int64x2_t, int64x2_t, vaddq, s64)
-VADD_IMPL(float32x4_t, float32x4_t, vaddq, f32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VADD_IMPL(float16x8_t, float16x8_t, vaddq, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-#undef VADD_IMPL
-
-// VQADD: Vector saturating add (No notion of saturation for floating point)
-#define VQADD_IMPL(stype, vtype, prefix, postfix)      \
-    inline vtype vqadd(const vtype &a, const vtype &b) \
-    {                                                  \
-        return prefix##_##postfix(a, b);               \
-    }
-
-VQADD_IMPL(uint8x8_t, uint8x8_t, vqadd, u8)
-VQADD_IMPL(int8x8_t, int8x8_t, vqadd, s8)
-VQADD_IMPL(uint16x4_t, uint16x4_t, vqadd, u16)
-VQADD_IMPL(int16x4_t, int16x4_t, vqadd, s16)
-VQADD_IMPL(uint32x2_t, uint32x2_t, vqadd, u32)
-VQADD_IMPL(int32x2_t, int32x2_t, vqadd, s32)
-VQADD_IMPL(uint64x1_t, uint64x1_t, vqadd, u64)
-VQADD_IMPL(int64x1_t, int64x1_t, vqadd, s64)
-VQADD_IMPL(float32x2_t, float32x2_t, vadd, f32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VQADD_IMPL(float16x4_t, float16x4_t, vadd, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-VQADD_IMPL(uint8x16_t, uint8x16_t, vqaddq, u8)
-VQADD_IMPL(int8x16_t, int8x16_t, vqaddq, s8)
-VQADD_IMPL(uint16x8_t, uint16x8_t, vqaddq, u16)
-VQADD_IMPL(int16x8_t, int16x8_t, vqaddq, s16)
-VQADD_IMPL(uint32x4_t, uint32x4_t, vqaddq, u32)
-VQADD_IMPL(int32x4_t, int32x4_t, vqaddq, s32)
-VQADD_IMPL(uint64x2_t, uint64x2_t, vqaddq, u64)
-VQADD_IMPL(int64x2_t, int64x2_t, vqaddq, s64)
-VQADD_IMPL(float32x4_t, float32x4_t, vaddq, f32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VQADD_IMPL(float16x8_t, float16x8_t, vaddq, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-#undef VQADD_IMPL
-
-// VADDW: Vector widening add
-#define VADDW_IMPL(wtype, vtype, prefix, postfix)      \
-    inline wtype vaddw(const wtype &a, const vtype &b) \
-    {                                                  \
-        return prefix##_##postfix(a, b);               \
-    }
-
-VADDW_IMPL(uint16x8_t, uint8x8_t, vaddw, u8)
-VADDW_IMPL(int16x8_t, int8x8_t, vaddw, s8)
-VADDW_IMPL(uint32x4_t, uint16x4_t, vaddw, u16)
-VADDW_IMPL(int32x4_t, int16x4_t, vaddw, s16)
-VADDW_IMPL(uint64x2_t, uint32x2_t, vaddw, u32)
-VADDW_IMPL(int64x2_t, int32x2_t, vaddw, s32)
-#undef VADDW_IMPL
-
-// VADDL: Vector long add
-#define VADDL_IMPL(wtype, vtype, prefix, postfix)      \
-    inline wtype vaddl(const vtype &a, const vtype &b) \
-    {                                                  \
-        return prefix##_##postfix(a, b);               \
-    }
-
-VADDL_IMPL(uint16x8_t, uint8x8_t, vaddl, u8)
-VADDL_IMPL(int16x8_t, int8x8_t, vaddl, s8)
-VADDL_IMPL(uint32x4_t, uint16x4_t, vaddl, u16)
-VADDL_IMPL(int32x4_t, int16x4_t, vaddl, s16)
-VADDL_IMPL(uint64x2_t, uint32x2_t, vaddl, u32)
-VADDL_IMPL(int64x2_t, int32x2_t, vaddl, s32)
-#undef VADDL_IMPL
-
-#if defined(__aarch64__)
-// VADDV: Across vector add
-#define VADDV_IMPL(stype, vtype, prefix, postfix) \
-    inline stype vaddv(const vtype &a)            \
-    {                                             \
-        return prefix##_##postfix(a);             \
-    }
-
-VADDV_IMPL(uint8_t, uint8x8_t, vaddv, u8)
-VADDV_IMPL(int8_t, int8x8_t, vaddv, s8)
-VADDV_IMPL(uint16_t, uint16x4_t, vaddv, u16)
-VADDV_IMPL(int16_t, int16x4_t, vaddv, s16)
-VADDV_IMPL(uint32_t, uint32x2_t, vaddv, u32)
-VADDV_IMPL(int32_t, int32x2_t, vaddv, s32)
-VADDV_IMPL(float, float32x2_t, vaddv, f32)
-
-VADDV_IMPL(uint8_t, uint8x16_t, vaddvq, u8)
-VADDV_IMPL(int8_t, int8x16_t, vaddvq, s8)
-VADDV_IMPL(uint16_t, uint16x8_t, vaddvq, u16)
-VADDV_IMPL(int16_t, int16x8_t, vaddvq, s16)
-VADDV_IMPL(uint32_t, uint32x4_t, vaddvq, u32)
-VADDV_IMPL(int32_t, int32x4_t, vaddvq, s32)
-VADDV_IMPL(uint64_t, uint64x2_t, vaddvq, u64)
-VADDV_IMPL(int64_t, int64x2_t, vaddvq, s64)
-VADDV_IMPL(float, float32x4_t, vaddvq, f32)
-#undef VADDV_IMPL
-#endif // defined(__aarch64__)
-
-// VPADDL: Signed add long pairwise
-#define VPADDL_IMPL(ltype, vtype, prefix, postfix) \
-    inline ltype vpaddl(const vtype &a)            \
-    {                                              \
-        return prefix##_##postfix(a);              \
-    }
-
-VPADDL_IMPL(uint16x4_t, uint8x8_t, vpaddl, u8)
-VPADDL_IMPL(int16x4_t, int8x8_t, vpaddl, s8)
-VPADDL_IMPL(uint32x2_t, uint16x4_t, vpaddl, u16)
-VPADDL_IMPL(int32x2_t, int16x4_t, vpaddl, s16)
-VPADDL_IMPL(uint64x1_t, uint32x2_t, vpaddl, u32)
-VPADDL_IMPL(int64x1_t, int32x2_t, vpaddl, s32)
-
-VPADDL_IMPL(uint16x8_t, uint8x16_t, vpaddlq, u8)
-VPADDL_IMPL(int16x8_t, int8x16_t, vpaddlq, s8)
-VPADDL_IMPL(uint32x4_t, uint16x8_t, vpaddlq, u16)
-VPADDL_IMPL(int32x4_t, int16x8_t, vpaddlq, s16)
-VPADDL_IMPL(uint64x2_t, uint32x4_t, vpaddlq, u32)
-VPADDL_IMPL(int64x2_t, int32x4_t, vpaddlq, s32)
-#undef VPADDL_IMPL
-
-// VPADD: Add pairwise
-#define VPADD_IMPL(stype, vtype, prefix, postfix)      \
-    inline vtype vpadd(const vtype &a, const vtype &b) \
-    {                                                  \
-        return prefix##_##postfix(a, b);               \
-    }
-
-VPADD_IMPL(uint8x8_t, uint8x8_t, vpadd, u8)
-VPADD_IMPL(int8x8_t, int8x8_t, vpadd, s8)
-VPADD_IMPL(uint16x4_t, uint16x4_t, vpadd, u16)
-VPADD_IMPL(int16x4_t, int16x4_t, vpadd, s16)
-VPADD_IMPL(uint32x2_t, uint32x2_t, vpadd, u32)
-VPADD_IMPL(int32x2_t, int32x2_t, vpadd, s32)
-VPADD_IMPL(float32x2_t, float32x2_t, vpadd, f32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VPADD_IMPL(float16x4_t, float16x4_t, vpadd, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-#undef VPADD_IMPL
-} // namespace wrapper
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_WRAPPER_ADD_H */
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/and.h b/arm_compute/core/NEON/wrapper/intrinsics/and.h
deleted file mode 100644
index 1973c5593d..0000000000
--- a/arm_compute/core/NEON/wrapper/intrinsics/and.h
+++ /dev/null
@@ -1,60 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_WRAPPER_AND_H
-#define ARM_COMPUTE_WRAPPER_AND_H
-
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace wrapper
-{
-#define VAND_IMPL(stype, vtype, prefix, postfix)      \
-    inline vtype vand(const vtype &a, const vtype &b) \
-    {                                                 \
-        return prefix##_##postfix(a, b);              \
-    }
-
-VAND_IMPL(uint8_t, uint8x8_t, vand, u8)
-VAND_IMPL(int8_t, int8x8_t, vand, s8)
-VAND_IMPL(uint16_t, uint16x4_t, vand, u16)
-VAND_IMPL(int16_t, int16x4_t, vand, s16)
-VAND_IMPL(uint32_t, uint32x2_t, vand, u32)
-VAND_IMPL(int32_t, int32x2_t, vand, s32)
-VAND_IMPL(uint64_t, uint64x1_t, vand, u64)
-VAND_IMPL(int64_t, int64x1_t, vand, s64)
-
-VAND_IMPL(uint8_t, uint8x16_t, vandq, u8)
-VAND_IMPL(int8_t, int8x16_t, vandq, s8)
-VAND_IMPL(uint16_t, uint16x8_t, vandq, u16)
-VAND_IMPL(int16_t, int16x8_t, vandq, s16)
-VAND_IMPL(uint32_t, uint32x4_t, vandq, u32)
-VAND_IMPL(int32_t, int32x4_t, vandq, s32)
-VAND_IMPL(uint64_t, uint64x2_t, vandq, u64)
-VAND_IMPL(int64_t, int64x2_t, vandq, s64)
-
-#undef VAND_IMPL
-} // namespace wrapper
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_WRAPPER_AND_H */
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/bsl.h b/arm_compute/core/NEON/wrapper/intrinsics/bsl.h
deleted file mode 100644
index 3c26a9c786..0000000000
--- a/arm_compute/core/NEON/wrapper/intrinsics/bsl.h
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_WRAPPER_BSL_H
-#define ARM_COMPUTE_WRAPPER_BSL_H
-
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace wrapper
-{
-#define VBSL_IMPL(stype, vtype, ctype, prefix, postfix)               \
-    inline vtype vbsl(const ctype &a, const vtype &b, const vtype &c) \
-    {                                                                 \
-        return prefix##_##postfix(a, b, c);                           \
-    }
-
-VBSL_IMPL(uint8_t, uint8x8_t, uint8x8_t, vbsl, u8)
-VBSL_IMPL(int8_t, int8x8_t, uint8x8_t, vbsl, s8)
-VBSL_IMPL(uint16_t, uint16x4_t, uint16x4_t, vbsl, u16)
-VBSL_IMPL(int16_t, int16x4_t, uint16x4_t, vbsl, s16)
-VBSL_IMPL(uint32_t, uint32x2_t, uint32x2_t, vbsl, u32)
-VBSL_IMPL(int32_t, int32x2_t, uint32x2_t, vbsl, s32)
-VBSL_IMPL(float32x2_t, float32x2_t, uint32x2_t, vbsl, f32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VBSL_IMPL(float16x4_t, float16x4_t, uint16x4_t, vbsl, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-VBSL_IMPL(uint8_t, uint8x16_t, uint8x16_t, vbslq, u8)
-VBSL_IMPL(int8_t, int8x16_t, uint8x16_t, vbslq, s8)
-VBSL_IMPL(uint16_t, uint16x8_t, uint16x8_t, vbslq, u16)
-VBSL_IMPL(int16_t, int16x8_t, uint16x8_t, vbslq, s16)
-VBSL_IMPL(uint32_t, uint32x4_t, uint32x4_t, vbslq, u32)
-VBSL_IMPL(int32_t, int32x4_t, uint32x4_t, vbslq, s32)
-VBSL_IMPL(float32x4_t, float32x4_t, uint32x4_t, vbslq, f32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VBSL_IMPL(float16x8_t, float16x8_t, uint16x8_t, vbslq, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-#undef VBSL_IMPL
-} // namespace wrapper
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_WRAPPER_BSL_H */
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/ceq.h b/arm_compute/core/NEON/wrapper/intrinsics/ceq.h
deleted file mode 100644
index f8a8f91f73..0000000000
--- a/arm_compute/core/NEON/wrapper/intrinsics/ceq.h
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_WRAPPER_CEQ_H
-#define ARM_COMPUTE_WRAPPER_CEQ_H
-
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace wrapper
-{
-#define VCEQ_IMPL(votype, vtype, prefix, postfix)      \
-    inline votype vceq(const vtype &a, const vtype &b) \
-    {                                                  \
-        return prefix##_##postfix(a, b);               \
-    }
-
-VCEQ_IMPL(uint8x8_t, uint8x8_t, vceq, u8)
-VCEQ_IMPL(uint8x8_t, int8x8_t, vceq, s8)
-VCEQ_IMPL(uint16x4_t, uint16x4_t, vceq, u16)
-VCEQ_IMPL(uint16x4_t, int16x4_t, vceq, s16)
-VCEQ_IMPL(uint32x2_t, uint32x2_t, vceq, u32)
-VCEQ_IMPL(uint32x2_t, int32x2_t, vceq, s32)
-VCEQ_IMPL(uint32x2_t, float32x2_t, vceq, f32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VCEQ_IMPL(uint16x4_t, float16x4_t, vceq, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-VCEQ_IMPL(uint8x16_t, uint8x16_t, vceqq, u8)
-VCEQ_IMPL(uint8x16_t, int8x16_t, vceqq, s8)
-VCEQ_IMPL(uint16x8_t, uint16x8_t, vceqq, u16)
-VCEQ_IMPL(uint16x8_t, int16x8_t, vceqq, s16)
-VCEQ_IMPL(uint32x4_t, uint32x4_t, vceqq, u32)
-VCEQ_IMPL(uint32x4_t, int32x4_t, vceqq, s32)
-VCEQ_IMPL(uint32x4_t, float32x4_t, vceqq, f32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VCEQ_IMPL(uint16x8_t, float16x8_t, vceqq, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-#undef VCEQ_IMPL
-} // namespace wrapper
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_WRAPPER_CEQ_H */
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/cge.h b/arm_compute/core/NEON/wrapper/intrinsics/cge.h
deleted file mode 100644
index bf231b8b46..0000000000
--- a/arm_compute/core/NEON/wrapper/intrinsics/cge.h
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_WRAPPER_CGE_H
-#define ARM_COMPUTE_WRAPPER_CGE_H
-
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace wrapper
-{
-#define VCGE_IMPL(stype, vtype, rtype, prefix, postfix) \
-    inline rtype vcge(const vtype &a, const vtype &b)   \
-    {                                                   \
-        return prefix##_##postfix(a, b);                \
-    }
-
-VCGE_IMPL(uint8_t, uint8x8_t, uint8x8_t, vcge, u8)
-VCGE_IMPL(int8_t, int8x8_t, uint8x8_t, vcge, s8)
-VCGE_IMPL(uint16_t, uint16x4_t, uint16x4_t, vcge, u16)
-VCGE_IMPL(int16_t, int16x4_t, uint16x4_t, vcge, s16)
-VCGE_IMPL(uint32_t, uint32x2_t, uint32x2_t, vcge, u32)
-VCGE_IMPL(int32_t, int32x2_t, uint32x2_t, vcge, s32)
-VCGE_IMPL(float32x2_t, float32x2_t, uint32x2_t, vcge, f32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VCGE_IMPL(float16x4_t, float16x4_t, uint16x4_t, vcge, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-VCGE_IMPL(uint8_t, uint8x16_t, uint8x16_t, vcgeq, u8)
-VCGE_IMPL(int8_t, int8x16_t, uint8x16_t, vcgeq, s8)
-VCGE_IMPL(uint16_t, uint16x8_t, uint16x8_t, vcgeq, u16)
-VCGE_IMPL(int16_t, int16x8_t, uint16x8_t, vcgeq, s16)
-VCGE_IMPL(uint32_t, uint32x4_t, uint32x4_t, vcgeq, u32)
-VCGE_IMPL(int32_t, int32x4_t, uint32x4_t, vcgeq, s32)
-VCGE_IMPL(float32x4_t, float32x4_t, uint32x4_t, vcgeq, f32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VCGE_IMPL(float16x8_t, float16x8_t, uint16x8_t, vcgeq, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-#undef VCGE_IMPL
-} // namespace wrapper
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_WRAPPER_CGE_H */
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/cgt.h b/arm_compute/core/NEON/wrapper/intrinsics/cgt.h
deleted file mode 100644
index 5202a5b21d..0000000000
--- a/arm_compute/core/NEON/wrapper/intrinsics/cgt.h
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_WRAPPER_CGT_H
-#define ARM_COMPUTE_WRAPPER_CGT_H
-
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace wrapper
-{
-#define VCGT_IMPL(rtype, vtype, prefix, postfix)      \
-    inline rtype vcgt(const vtype &a, const vtype &b) \
-    {                                                 \
-        return prefix##_##postfix(a, b);              \
-    }
-
-VCGT_IMPL(uint8x8_t, uint8x8_t, vcgt, u8)
-VCGT_IMPL(uint8x8_t, int8x8_t, vcgt, s8)
-VCGT_IMPL(uint16x4_t, uint16x4_t, vcgt, u16)
-VCGT_IMPL(uint16x4_t, int16x4_t, vcgt, s16)
-VCGT_IMPL(uint32x2_t, uint32x2_t, vcgt, u32)
-VCGT_IMPL(uint32x2_t, int32x2_t, vcgt, s32)
-VCGT_IMPL(uint32x2_t, float32x2_t, vcgt, f32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VCGT_IMPL(uint16x4_t, float16x4_t, vcgt, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-VCGT_IMPL(uint8x16_t, uint8x16_t, vcgtq, u8)
-VCGT_IMPL(uint8x16_t, int8x16_t, vcgtq, s8)
-VCGT_IMPL(uint16x8_t, uint16x8_t, vcgtq, u16)
-VCGT_IMPL(uint16x8_t, int16x8_t, vcgtq, s16)
-VCGT_IMPL(uint32x4_t, uint32x4_t, vcgtq, u32)
-VCGT_IMPL(uint32x4_t, int32x4_t, vcgtq, s32)
-VCGT_IMPL(uint32x4_t, float32x4_t, vcgtq, f32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VCGT_IMPL(uint16x8_t, float16x8_t, vcgtq, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-#undef VCGT_IMPL
-} // namespace wrapper
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_WRAPPER_CGT_H */
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/clt.h b/arm_compute/core/NEON/wrapper/intrinsics/clt.h
deleted file mode 100644
index 4701ab7026..0000000000
--- a/arm_compute/core/NEON/wrapper/intrinsics/clt.h
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_WRAPPER_CLT_H
-#define ARM_COMPUTE_WRAPPER_CLT_H
-
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace wrapper
-{
-#define VCLT_IMPL(votype, vtype, prefix, postfix)      \
-    inline votype vclt(const vtype &a, const vtype &b) \
-    {                                                  \
-        return prefix##_##postfix(a, b);               \
-    }
-
-VCLT_IMPL(uint8x8_t, uint8x8_t, vclt, u8)
-VCLT_IMPL(uint8x8_t, int8x8_t, vclt, s8)
-VCLT_IMPL(uint16x4_t, uint16x4_t, vclt, u16)
-VCLT_IMPL(uint16x4_t, int16x4_t, vclt, s16)
-VCLT_IMPL(uint32x2_t, uint32x2_t, vclt, u32)
-VCLT_IMPL(uint32x2_t, int32x2_t, vclt, s32)
-VCLT_IMPL(uint32x2_t, float32x2_t, vclt, f32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VCLT_IMPL(uint16x4_t, float16x4_t, vclt, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-VCLT_IMPL(uint8x16_t, uint8x16_t, vcltq, u8)
-VCLT_IMPL(uint8x16_t, int8x16_t, vcltq, s8)
-VCLT_IMPL(uint16x8_t, uint16x8_t, vcltq, u16)
-VCLT_IMPL(uint16x8_t, int16x8_t, vcltq, s16)
-VCLT_IMPL(uint32x4_t, uint32x4_t, vcltq, u32)
-VCLT_IMPL(uint32x4_t, int32x4_t, vcltq, s32)
-VCLT_IMPL(uint32x4_t, float32x4_t, vcltq, f32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VCLT_IMPL(uint16x8_t, float16x8_t, vcltq, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-#undef VCLT_IMPL
-} // namespace wrapper
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_WRAPPER_CLT_H */
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/combine.h b/arm_compute/core/NEON/wrapper/intrinsics/combine.h
deleted file mode 100644
index 9099e28fc4..0000000000
--- a/arm_compute/core/NEON/wrapper/intrinsics/combine.h
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_WRAPPER_COMBINE_H
-#define ARM_COMPUTE_WRAPPER_COMBINE_H
-
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace wrapper
-{
-#define VCOMBINE_IMPL(rtype, vtype, prefix, postfix)      \
-    inline rtype vcombine(const vtype &a, const vtype &b) \
-    {                                                     \
-        return prefix##_##postfix(a, b);                  \
-    }
-
-VCOMBINE_IMPL(uint8x16_t, uint8x8_t, vcombine, u8)
-VCOMBINE_IMPL(int8x16_t, int8x8_t, vcombine, s8)
-VCOMBINE_IMPL(uint16x8_t, uint16x4_t, vcombine, u16)
-VCOMBINE_IMPL(int16x8_t, int16x4_t, vcombine, s16)
-VCOMBINE_IMPL(uint32x4_t, uint32x2_t, vcombine, u32)
-VCOMBINE_IMPL(int32x4_t, int32x2_t, vcombine, s32)
-VCOMBINE_IMPL(float32x4_t, float32x2_t, vcombine, f32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VCOMBINE_IMPL(float16x8_t, float16x4_t, vcombine, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-#undef VCOMBINE_IMPL
-} // namespace wrapper
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_WRAPPER_COMBINE_H */
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/cvt.h b/arm_compute/core/NEON/wrapper/intrinsics/cvt.h
deleted file mode 100644
index 5ea9a5dedd..0000000000
--- a/arm_compute/core/NEON/wrapper/intrinsics/cvt.h
+++ /dev/null
@@ -1,80 +0,0 @@
-/*
- * Copyright (c) 2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_WRAPPER_CVT_H
-#define ARM_COMPUTE_WRAPPER_CVT_H
-
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace wrapper
-{
-#define VCVT_TO_F32_IMPL(ptype, vtype, prefix, postfix1, postfix2)                   \
-    template <typename T>                                                            \
-    inline typename std::enable_if<std::is_same<T, float>::value, float32x4_t>::type \
-    vcvt(const vtype &a)                                                             \
-    {                                                                                \
-        return prefix##_##postfix1##_##postfix2(a);                                  \
-    }
-
-VCVT_TO_F32_IMPL(float32x4_t, uint32x4_t, vcvtq, f32, u32)
-VCVT_TO_F32_IMPL(float32x4_t, int32x4_t, vcvtq, f32, s32)
-#undef VCVT_TO_F32_IMPL
-
-template <typename T>
-inline typename std::enable_if<std::is_same<T, uint8_t>::value, uint32x4_t>::type
-vcvt(const float32x4_t &a)
-{
-    return vcvtq_u32_f32(a);
-}
-
-template <typename T>
-inline typename std::enable_if<std::is_same<T, int8_t>::value, int32x4_t>::type
-vcvt(const float32x4_t &a)
-{
-    return vcvtq_s32_f32(a);
-}
-
-#if defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16)
-/** Convert 2x128-bit floating point vectors into 1x128-bit bfloat16 vector
- *
- * @param[in]     inptr  Pointer to the input memory to load values from
- * @param[in,out] outptr Pointer to the output memory to store values to
- */
-inline void vcvt_bf16_f32(const float *inptr, uint16_t *outptr)
-{
-    __asm __volatile(
-        "ldp    q0, q1, [%[inptr]]\n"
-        ".inst  0xea16800\n"  // BFCVTN v0, v0
-        ".inst  0x4ea16820\n" // BFCVTN2 v0, v1
-        "str    q0, [%[outptr]]\n"
-        : [inptr] "+r"(inptr)
-        : [outptr] "r"(outptr)
-        : "v0", "v1", "memory");
-}
-#endif /* defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16) */
-
-} // namespace wrapper
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_WRAPPER_CVT_H */
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/div.h b/arm_compute/core/NEON/wrapper/intrinsics/div.h
deleted file mode 100644
index d49a9113b0..0000000000
--- a/arm_compute/core/NEON/wrapper/intrinsics/div.h
+++ /dev/null
@@ -1,73 +0,0 @@
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_WRAPPER_DIV_H
-#define ARM_COMPUTE_WRAPPER_DIV_H
-
-#include "arm_compute/core/NEON/NEMath.h"
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace wrapper
-{
-#ifdef __aarch64__
-
-#define VDIV_IMPL(stype, vtype, prefix, postfix)      \
-    inline vtype vdiv(const vtype &a, const vtype &b) \
-    {                                                 \
-        return prefix##_##postfix(a, b);              \
-    }
-VDIV_IMPL(float32x2_t, float32x2_t, vdiv, f32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VDIV_IMPL(float16x4_t, float16x4_t, vdiv, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-VDIV_IMPL(float32x4_t, float32x4_t, vdivq, f32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VDIV_IMPL(float16x8_t, float16x8_t, vdivq, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-#else // __aarch64__
-
-#define VDIV_IMPL(stype, vtype, mul_prefix, inv_prefix, postfix)     \
-    inline vtype vdiv(const vtype &a, const vtype &b)                \
-    {                                                                \
-        return mul_prefix##_##postfix(a, inv_prefix##_##postfix(b)); \
-    }
-VDIV_IMPL(float32x2_t, float32x2_t, vmul, vinv, f32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VDIV_IMPL(float16x4_t, float16x4_t, vmul, vinv, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-VDIV_IMPL(float32x4_t, float32x4_t, vmulq, vinvq, f32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VDIV_IMPL(float16x8_t, float16x8_t, vmulq, vinvq, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-#endif // __aarch64__
-
-#undef VDIV_IMPL
-} // namespace wrapper
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_WRAPPER_DIV_H */
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/dup_n.h b/arm_compute/core/NEON/wrapper/intrinsics/dup_n.h
deleted file mode 100644
index ffbfde72c5..0000000000
--- a/arm_compute/core/NEON/wrapper/intrinsics/dup_n.h
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_WRAPPER_DUP_N_H
-#define ARM_COMPUTE_WRAPPER_DUP_N_H
-
-#include "arm_compute/core/NEON/wrapper/traits.h"
-
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace wrapper
-{
-#define VDUP_N_IMPL(stype, vtype, prefix, postfix, tag) \
-    inline vtype vdup_n(stype value, tag)               \
-    {                                                   \
-        return prefix##_##postfix(value);               \
-    }
-
-VDUP_N_IMPL(uint8_t, uint8x8_t, vdup_n, u8, traits::vector_64_tag)
-VDUP_N_IMPL(int8_t, int8x8_t, vdup_n, s8, traits::vector_64_tag)
-VDUP_N_IMPL(uint16_t, uint16x4_t, vdup_n, u16, traits::vector_64_tag)
-VDUP_N_IMPL(int16_t, int16x4_t, vdup_n, s16, traits::vector_64_tag)
-VDUP_N_IMPL(uint32_t, uint32x2_t, vdup_n, u32, traits::vector_64_tag)
-VDUP_N_IMPL(int32_t, int32x2_t, vdup_n, s32, traits::vector_64_tag)
-VDUP_N_IMPL(float, float32x2_t, vdup_n, f32, traits::vector_64_tag)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VDUP_N_IMPL(float16_t, float16x4_t, vdup_n, f16, traits::vector_64_tag)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-VDUP_N_IMPL(uint8_t, uint8x16_t, vdupq_n, u8, traits::vector_128_tag)
-VDUP_N_IMPL(int8_t, int8x16_t, vdupq_n, s8, traits::vector_128_tag)
-VDUP_N_IMPL(uint16_t, uint16x8_t, vdupq_n, u16, traits::vector_128_tag)
-VDUP_N_IMPL(int16_t, int16x8_t, vdupq_n, s16, traits::vector_128_tag)
-VDUP_N_IMPL(uint32_t, uint32x4_t, vdupq_n, u32, traits::vector_128_tag)
-VDUP_N_IMPL(int32_t, int32x4_t, vdupq_n, s32, traits::vector_128_tag)
-VDUP_N_IMPL(float, float32x4_t, vdupq_n, f32, traits::vector_128_tag)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VDUP_N_IMPL(float16_t, float16x8_t, vdupq_n, f16, traits::vector_128_tag)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-#undef VDUP_N_IMPL
-} // namespace wrapper
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_WRAPPER_DUP_N_H */
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/eor.h b/arm_compute/core/NEON/wrapper/intrinsics/eor.h
deleted file mode 100644
index a0e7b681ab..0000000000
--- a/arm_compute/core/NEON/wrapper/intrinsics/eor.h
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_WRAPPER_EOR_H
-#define ARM_COMPUTE_WRAPPER_EOR_H
-
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace wrapper
-{
-#define VEOR_IMPL(vtype, prefix, postfix)             \
-    inline vtype veor(const vtype &a, const vtype &b) \
-    {                                                 \
-        return prefix##_##postfix(a, b);              \
-    }
-
-VEOR_IMPL(uint8x8_t, veor, u8)
-VEOR_IMPL(int8x8_t, veor, s8)
-VEOR_IMPL(uint16x4_t, veor, u16)
-VEOR_IMPL(int16x4_t, veor, s16)
-VEOR_IMPL(uint32x2_t, veor, u32)
-VEOR_IMPL(int32x2_t, veor, s32)
-
-VEOR_IMPL(uint8x16_t, veorq, u8)
-VEOR_IMPL(int8x16_t, veorq, s8)
-VEOR_IMPL(uint16x8_t, veorq, u16)
-VEOR_IMPL(int16x8_t, veorq, s16)
-VEOR_IMPL(uint32x4_t, veorq, u32)
-VEOR_IMPL(int32x4_t, veorq, s32)
-
-#undef VEOR_IMPL
-} // namespace wrapper
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_WRAPPER_EOR_H */
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/exp.h b/arm_compute/core/NEON/wrapper/intrinsics/exp.h
deleted file mode 100644
index 4b17ebd93f..0000000000
--- a/arm_compute/core/NEON/wrapper/intrinsics/exp.h
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_WRAPPER_EXP_H
-#define ARM_COMPUTE_WRAPPER_EXP_H
-
-#include "arm_compute/core/NEON/NEMath.h"
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace wrapper
-{
-#define VEXPQ_IMPL(vtype, postfix)     \
-    inline vtype vexpq(const vtype &a) \
-    {                                  \
-        return vexpq_##postfix(a);     \
-    }
-
-#define VEXPQ_IMPL_INT(vtype, postfix)      \
-    inline vtype vexpq(const vtype &a)      \
-    {                                       \
-        ARM_COMPUTE_UNUSED(a);              \
-        ARM_COMPUTE_ERROR("Not supported"); \
-    }
-
-VEXPQ_IMPL(float32x4_t, f32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VEXPQ_IMPL(float16x8_t, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VEXPQ_IMPL_INT(int32x4_t, s32)
-#undef VEXPQ_IMPL
-
-} // namespace wrapper
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_WRAPPER_EXP_H */
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/ext.h b/arm_compute/core/NEON/wrapper/intrinsics/ext.h
deleted file mode 100644
index f2c3dcc901..0000000000
--- a/arm_compute/core/NEON/wrapper/intrinsics/ext.h
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
- * Copyright (c) 2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_WRAPPER_EXT_H
-#define ARM_COMPUTE_WRAPPER_EXT_H
-
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace wrapper
-{
-#define VEXT_IMPL(vtype, prefix, postfix, size)            \
-    inline vtype vext_##size(vtype value_a, vtype value_b) \
-    {                                                      \
-        return prefix##_##postfix(value_a, value_b, size); \
-    }
-
-VEXT_IMPL(uint8x8_t, vext, u8, 1)
-VEXT_IMPL(uint8x8_t, vext, u8, 2)
-VEXT_IMPL(int8x8_t, vext, s8, 1)
-VEXT_IMPL(int8x8_t, vext, s8, 2)
-VEXT_IMPL(uint16x4_t, vext, u16, 1)
-VEXT_IMPL(uint16x4_t, vext, u16, 2)
-VEXT_IMPL(int16x4_t, vext, s16, 1)
-VEXT_IMPL(int16x4_t, vext, s16, 2)
-
-VEXT_IMPL(uint8x16_t, vextq, u8, 1)
-VEXT_IMPL(uint8x16_t, vextq, u8, 2)
-VEXT_IMPL(int8x16_t, vextq, s8, 1)
-VEXT_IMPL(int8x16_t, vextq, s8, 2)
-VEXT_IMPL(uint16x8_t, vextq, u16, 1)
-VEXT_IMPL(uint16x8_t, vextq, u16, 2)
-VEXT_IMPL(int16x8_t, vextq, s16, 1)
-VEXT_IMPL(int16x8_t, vextq, s16, 2)
-VEXT_IMPL(int32x4_t, vextq, s32, 1)
-VEXT_IMPL(int32x4_t, vextq, s32, 2)
-
-#undef VEXT_IMPL
-} // namespace wrapper
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_WRAPPER_EXT_H */
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/gethigh.h b/arm_compute/core/NEON/wrapper/intrinsics/gethigh.h
deleted file mode 100644
index 13d29677a6..0000000000
--- a/arm_compute/core/NEON/wrapper/intrinsics/gethigh.h
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_WRAPPER_GET_HIGH_H
-#define ARM_COMPUTE_WRAPPER_GET_HIGH_H
-
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace wrapper
-{
-#define VGETHIGH_IMPL(half_vtype, vtype, postfix) \
-    inline half_vtype vgethigh(const vtype val)   \
-    {                                             \
-        return vget_high_##postfix(val);          \
-    }
-
-VGETHIGH_IMPL(uint8x8_t, uint8x16_t, u8)
-VGETHIGH_IMPL(int8x8_t, int8x16_t, s8)
-VGETHIGH_IMPL(uint16x4_t, uint16x8_t, u16)
-VGETHIGH_IMPL(int16x4_t, int16x8_t, s16)
-VGETHIGH_IMPL(uint32x2_t, uint32x4_t, u32)
-VGETHIGH_IMPL(int32x2_t, int32x4_t, s32)
-VGETHIGH_IMPL(float32x2_t, float32x4_t, f32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VGETHIGH_IMPL(float16x4_t, float16x8_t, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-#undef VGETHIGH_IMPL
-} // namespace wrapper
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_WRAPPER_GET_HIGH_H */
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/getlane.h b/arm_compute/core/NEON/wrapper/intrinsics/getlane.h
deleted file mode 100644
index 533bf63603..0000000000
--- a/arm_compute/core/NEON/wrapper/intrinsics/getlane.h
+++ /dev/null
@@ -1,223 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_WRAPPER_GET_LANE_H
-#define ARM_COMPUTE_WRAPPER_GET_LANE_H
-
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace wrapper
-{
-#define VGETLANE_IMPL_8(stype, vtype, postfix)                         \
-    inline stype vgetlane(const vtype vector, const unsigned int lane) \
-    {                                                                  \
-        switch(lane)                                                   \
-        {                                                              \
-            case 0:                                                    \
-                return vget_lane_##postfix(vector, 0);                 \
-            case 1:                                                    \
-                return vget_lane_##postfix(vector, 1);                 \
-            case 2:                                                    \
-                return vget_lane_##postfix(vector, 2);                 \
-            case 3:                                                    \
-                return vget_lane_##postfix(vector, 3);                 \
-            case 4:                                                    \
-                return vget_lane_##postfix(vector, 4);                 \
-            case 5:                                                    \
-                return vget_lane_##postfix(vector, 5);                 \
-            case 6:                                                    \
-                return vget_lane_##postfix(vector, 6);                 \
-            case 7:                                                    \
-                return vget_lane_##postfix(vector, 7);                 \
-            default:                                                   \
-                ARM_COMPUTE_ERROR("Invalid lane");                     \
-        }                                                              \
-    }
-
-#define VGETLANE_IMPL_4(stype, vtype, postfix)                         \
-    inline stype vgetlane(const vtype vector, const unsigned int lane) \
-    {                                                                  \
-        switch(lane)                                                   \
-        {                                                              \
-            case 0:                                                    \
-                return vget_lane_##postfix(vector, 0);                 \
-            case 1:                                                    \
-                return vget_lane_##postfix(vector, 1);                 \
-            case 2:                                                    \
-                return vget_lane_##postfix(vector, 2);                 \
-            case 3:                                                    \
-                return vget_lane_##postfix(vector, 3);                 \
-            default:                                                   \
-                ARM_COMPUTE_ERROR("Invalid lane");                     \
-        }                                                              \
-    }
-
-#define VGETLANE_IMPL_2(stype, vtype, postfix)                         \
-    inline stype vgetlane(const vtype vector, const unsigned int lane) \
-    {                                                                  \
-        switch(lane)                                                   \
-        {                                                              \
-            case 0:                                                    \
-                return vget_lane_##postfix(vector, 0);                 \
-            case 1:                                                    \
-                return vget_lane_##postfix(vector, 1);                 \
-            default:                                                   \
-                ARM_COMPUTE_ERROR("Invalid lane");                     \
-        }                                                              \
-    }
-
-VGETLANE_IMPL_8(uint8_t, uint8x8_t, u8)
-VGETLANE_IMPL_8(int8_t, int8x8_t, s8)
-VGETLANE_IMPL_4(uint16_t, uint16x4_t, u16)
-VGETLANE_IMPL_4(int16_t, int16x4_t, s16)
-VGETLANE_IMPL_2(uint32_t, uint32x2_t, u32)
-VGETLANE_IMPL_2(int32_t, int32x2_t, s32)
-VGETLANE_IMPL_2(float, float32x2_t, f32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VGETLANE_IMPL_4(float16_t, float16x4_t, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-#define VGETQLANE_IMPL_16(stype, vtype, postfix)                       \
-    inline stype vgetlane(const vtype vector, const unsigned int lane) \
-    {                                                                  \
-        switch(lane)                                                   \
-        {                                                              \
-            case 0:                                                    \
-                return vgetq_lane_##postfix(vector, 0);                \
-            case 1:                                                    \
-                return vgetq_lane_##postfix(vector, 1);                \
-            case 2:                                                    \
-                return vgetq_lane_##postfix(vector, 2);                \
-            case 3:                                                    \
-                return vgetq_lane_##postfix(vector, 3);                \
-            case 4:                                                    \
-                return vgetq_lane_##postfix(vector, 4);                \
-            case 5:                                                    \
-                return vgetq_lane_##postfix(vector, 5);                \
-            case 6:                                                    \
-                return vgetq_lane_##postfix(vector, 6);                \
-            case 7:                                                    \
-                return vgetq_lane_##postfix(vector, 7);                \
-            case 8:                                                    \
-                return vgetq_lane_##postfix(vector, 8);                \
-            case 9:                                                    \
-                return vgetq_lane_##postfix(vector, 9);                \
-            case 10:                                                   \
-                return vgetq_lane_##postfix(vector, 10);               \
-            case 11:                                                   \
-                return vgetq_lane_##postfix(vector, 11);               \
-            case 12:                                                   \
-                return vgetq_lane_##postfix(vector, 12);               \
-            case 13:                                                   \
-                return vgetq_lane_##postfix(vector, 13);               \
-            case 14:                                                   \
-                return vgetq_lane_##postfix(vector, 14);               \
-            case 15:                                                   \
-                return vgetq_lane_##postfix(vector, 15);               \
-            default:                                                   \
-                ARM_COMPUTE_ERROR("Invalid lane");                     \
-        }                                                              \
-    }
-
-#define VGETQLANE_IMPL_8(stype, vtype, postfix)                        \
-    inline stype vgetlane(const vtype vector, const unsigned int lane) \
-    {                                                                  \
-        switch(lane)                                                   \
-        {                                                              \
-            case 0:                                                    \
-                return vgetq_lane_##postfix(vector, 0);                \
-            case 1:                                                    \
-                return vgetq_lane_##postfix(vector, 1);                \
-            case 2:                                                    \
-                return vgetq_lane_##postfix(vector, 2);                \
-            case 3:                                                    \
-                return vgetq_lane_##postfix(vector, 3);                \
-            case 4:                                                    \
-                return vgetq_lane_##postfix(vector, 4);                \
-            case 5:                                                    \
-                return vgetq_lane_##postfix(vector, 5);                \
-            case 6:                                                    \
-                return vgetq_lane_##postfix(vector, 6);                \
-            case 7:                                                    \
-                return vgetq_lane_##postfix(vector, 7);                \
-            default:                                                   \
-                ARM_COMPUTE_ERROR("Invalid lane");                     \
-        }                                                              \
-    }
-
-#define VGETQLANE_IMPL_4(stype, vtype, postfix)                        \
-    inline stype vgetlane(const vtype vector, const unsigned int lane) \
-    {                                                                  \
-        switch(lane)                                                   \
-        {                                                              \
-            case 0:                                                    \
-                return vgetq_lane_##postfix(vector, 0);                \
-            case 1:                                                    \
-                return vgetq_lane_##postfix(vector, 1);                \
-            case 2:                                                    \
-                return vgetq_lane_##postfix(vector, 2);                \
-            case 3:                                                    \
-                return vgetq_lane_##postfix(vector, 3);                \
-            default:                                                   \
-                ARM_COMPUTE_ERROR("Invalid lane");                     \
-        }                                                              \
-    }
-
-#define VGETQLANE_IMPL_2(stype, vtype, postfix)                        \
-    inline stype vgetlane(const vtype vector, const unsigned int lane) \
-    {                                                                  \
-        switch(lane)                                                   \
-        {                                                              \
-            case 0:                                                    \
-                return vgetq_lane_##postfix(vector, 0);                \
-            case 1:                                                    \
-                return vgetq_lane_##postfix(vector, 1);                \
-            default:                                                   \
-                ARM_COMPUTE_ERROR("Invalid lane");                     \
-        }                                                              \
-    }
-
-VGETQLANE_IMPL_16(uint8_t, uint8x16_t, u8)
-VGETQLANE_IMPL_16(int8_t, int8x16_t, s8)
-VGETQLANE_IMPL_8(uint16_t, uint16x8_t, u16)
-VGETQLANE_IMPL_8(int16_t, int16x8_t, s16)
-VGETQLANE_IMPL_4(uint32_t, uint32x4_t, u32)
-VGETQLANE_IMPL_4(int32_t, int32x4_t, s32)
-VGETQLANE_IMPL_4(float, float32x4_t, f32)
-VGETQLANE_IMPL_2(int64_t, int64x2_t, s64)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VGETQLANE_IMPL_8(float16_t, float16x8_t, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-#undef VGETLANE_IMPL_8
-#undef VGETLANE_IMPL_4
-#undef VGETLANE_IMPL_2
-
-#undef VGETQLANE_IMPL_16
-#undef VGETQLANE_IMPL_8
-#undef VGETQLANE_IMPL_4
-} // namespace wrapper
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_WRAPPER_GET_LANE_H */
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/getlow.h b/arm_compute/core/NEON/wrapper/intrinsics/getlow.h
deleted file mode 100644
index dbc3d869e1..0000000000
--- a/arm_compute/core/NEON/wrapper/intrinsics/getlow.h
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_WRAPPER_GET_LOW_H
-#define ARM_COMPUTE_WRAPPER_GET_LOW_H
-
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace wrapper
-{
-#define VGETLOW_IMPL(half_vtype, vtype, postfix) \
-    inline half_vtype vgetlow(const vtype val)   \
-    {                                            \
-        return vget_low_##postfix(val);          \
-    }
-
-VGETLOW_IMPL(uint8x8_t, uint8x16_t, u8)
-VGETLOW_IMPL(int8x8_t, int8x16_t, s8)
-VGETLOW_IMPL(uint16x4_t, uint16x8_t, u16)
-VGETLOW_IMPL(int16x4_t, int16x8_t, s16)
-VGETLOW_IMPL(uint32x2_t, uint32x4_t, u32)
-VGETLOW_IMPL(int32x2_t, int32x4_t, s32)
-VGETLOW_IMPL(float32x2_t, float32x4_t, f32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VGETLOW_IMPL(float16x4_t, float16x8_t, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-#undef VGETLOW_IMPL
-} // namespace wrapper
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_WRAPPER_GET_LOW_H */
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/intrinsics.h b/arm_compute/core/NEON/wrapper/intrinsics/intrinsics.h
deleted file mode 100644
index 1150daa073..0000000000
--- a/arm_compute/core/NEON/wrapper/intrinsics/intrinsics.h
+++ /dev/null
@@ -1,73 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_WRAPPER_INTRINSICS_H
-#define ARM_COMPUTE_WRAPPER_INTRINSICS_H
-
-#include "arm_compute/core/NEON/wrapper/intrinsics/abs.h"
-#include "arm_compute/core/NEON/wrapper/intrinsics/add.h"
-#include "arm_compute/core/NEON/wrapper/intrinsics/and.h"
-#include "arm_compute/core/NEON/wrapper/intrinsics/bsl.h"
-#include "arm_compute/core/NEON/wrapper/intrinsics/ceq.h"
-#include "arm_compute/core/NEON/wrapper/intrinsics/cge.h"
-#include "arm_compute/core/NEON/wrapper/intrinsics/cgt.h"
-#include "arm_compute/core/NEON/wrapper/intrinsics/clt.h"
-#include "arm_compute/core/NEON/wrapper/intrinsics/combine.h"
-#include "arm_compute/core/NEON/wrapper/intrinsics/cvt.h"
-#include "arm_compute/core/NEON/wrapper/intrinsics/div.h"
-#include "arm_compute/core/NEON/wrapper/intrinsics/dup_n.h"
-#include "arm_compute/core/NEON/wrapper/intrinsics/eor.h"
-#include "arm_compute/core/NEON/wrapper/intrinsics/exp.h"
-#include "arm_compute/core/NEON/wrapper/intrinsics/ext.h"
-#include "arm_compute/core/NEON/wrapper/intrinsics/gethigh.h"
-#include "arm_compute/core/NEON/wrapper/intrinsics/getlane.h"
-#include "arm_compute/core/NEON/wrapper/intrinsics/getlow.h"
-#include "arm_compute/core/NEON/wrapper/intrinsics/inv.h"
-#include "arm_compute/core/NEON/wrapper/intrinsics/invsqrt.h"
-#include "arm_compute/core/NEON/wrapper/intrinsics/load.h"
-#include "arm_compute/core/NEON/wrapper/intrinsics/log.h"
-#include "arm_compute/core/NEON/wrapper/intrinsics/max.h"
-#include "arm_compute/core/NEON/wrapper/intrinsics/min.h"
-#include "arm_compute/core/NEON/wrapper/intrinsics/mla.h"
-#include "arm_compute/core/NEON/wrapper/intrinsics/movl.h"
-#include "arm_compute/core/NEON/wrapper/intrinsics/movn.h"
-#include "arm_compute/core/NEON/wrapper/intrinsics/mul.h"
-#include "arm_compute/core/NEON/wrapper/intrinsics/neg.h"
-#include "arm_compute/core/NEON/wrapper/intrinsics/not.h"
-#include "arm_compute/core/NEON/wrapper/intrinsics/orr.h"
-#include "arm_compute/core/NEON/wrapper/intrinsics/pmax.h"
-#include "arm_compute/core/NEON/wrapper/intrinsics/pmin.h"
-#include "arm_compute/core/NEON/wrapper/intrinsics/pow.h"
-#include "arm_compute/core/NEON/wrapper/intrinsics/qmov.h"
-#include "arm_compute/core/NEON/wrapper/intrinsics/qmovun.h"
-#include "arm_compute/core/NEON/wrapper/intrinsics/reinterpret.h"
-#include "arm_compute/core/NEON/wrapper/intrinsics/rev64.h"
-#include "arm_compute/core/NEON/wrapper/intrinsics/round.h"
-#include "arm_compute/core/NEON/wrapper/intrinsics/setlane.h"
-#include "arm_compute/core/NEON/wrapper/intrinsics/sin.h"
-#include "arm_compute/core/NEON/wrapper/intrinsics/store.h"
-#include "arm_compute/core/NEON/wrapper/intrinsics/sub.h"
-#include "arm_compute/core/NEON/wrapper/intrinsics/tanh.h"
-#include "arm_compute/core/NEON/wrapper/intrinsics/tbl.h"
-
-#endif /* ARM_COMPUTE_WRAPPER_INTRINSICS_H */
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/inv.h b/arm_compute/core/NEON/wrapper/intrinsics/inv.h
deleted file mode 100644
index 9da66baffa..0000000000
--- a/arm_compute/core/NEON/wrapper/intrinsics/inv.h
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_WRAPPER_INV_H
-#define ARM_COMPUTE_WRAPPER_INV_H
-
-#include "arm_compute/core/NEON/NEMath.h"
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace wrapper
-{
-#define VINV_IMPL(vtype, prefix, postfix) \
-    inline vtype vinv(const vtype &a)     \
-    {                                     \
-        return prefix##_##postfix(a);     \
-    }
-
-#define VINV_IMPL_INT(vtype, prefix, postfix) \
-    inline vtype vinv(const vtype &a)         \
-    {                                         \
-        ARM_COMPUTE_UNUSED(a);                \
-        ARM_COMPUTE_ERROR("Not supported");   \
-    }
-
-VINV_IMPL(float32x2_t, vinv, f32)
-VINV_IMPL_INT(int32x2_t, vinv, s32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VINV_IMPL(float16x4_t, vinv, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-VINV_IMPL(float32x4_t, vinvq, f32)
-VINV_IMPL_INT(int32x4_t, vinvq, s32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VINV_IMPL(float16x8_t, vinvq, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-#undef VINV_IMPL
-} // namespace wrapper
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_WRAPPER_INV_H */
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/invsqrt.h b/arm_compute/core/NEON/wrapper/intrinsics/invsqrt.h
deleted file mode 100644
index 77adcf7b8c..0000000000
--- a/arm_compute/core/NEON/wrapper/intrinsics/invsqrt.h
+++ /dev/null
@@ -1,61 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_WRAPPER_INVSQRT_H
-#define ARM_COMPUTE_WRAPPER_INVSQRT_H
-
-#include "arm_compute/core/NEON/NEMath.h"
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace wrapper
-{
-#define VINVSQRT_IMPL(stype, vtype, prefix, postfix) \
-    inline vtype vinvsqrt(const vtype &a)            \
-    {                                                \
-        return prefix##_##postfix(a);                \
-    }
-
-#define VINVSQRT_IMPL_INT(stype, vtype, prefix, postfix) \
-    inline vtype vinvsqrt(const vtype &a)                \
-    {                                                    \
-        ARM_COMPUTE_UNUSED(a);                           \
-        ARM_COMPUTE_ERROR("Not supported");              \
-    }
-
-VINVSQRT_IMPL(float, float32x2_t, vinvsqrt, f32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VINVSQRT_IMPL(float16_t, float16x4_t, vinvsqrt, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VINVSQRT_IMPL_INT(int, int32x4_t, vinvsqrt, s32)
-
-VINVSQRT_IMPL(float, float32x4_t, vinvsqrtq, f32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VINVSQRT_IMPL(float16_t, float16x8_t, vinvsqrtq, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-#undef VINVSQRT_IMPL
-} // namespace wrapper
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_WRAPPER_INVSQRT_H */
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/load.h b/arm_compute/core/NEON/wrapper/intrinsics/load.h
deleted file mode 100644
index d38350f05b..0000000000
--- a/arm_compute/core/NEON/wrapper/intrinsics/load.h
+++ /dev/null
@@ -1,73 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_WRAPPER_LOAD_H
-#define ARM_COMPUTE_WRAPPER_LOAD_H
-
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace wrapper
-{
-#define VLOAD_IMPL(stype, vtype, postfix) \
-    inline vtype vload(const stype *ptr)  \
-    {                                     \
-        return vld1_##postfix(ptr);       \
-    }
-
-VLOAD_IMPL(uint8_t, uint8x8_t, u8)
-VLOAD_IMPL(int8_t, int8x8_t, s8)
-VLOAD_IMPL(uint16_t, uint16x4_t, u16)
-VLOAD_IMPL(int16_t, int16x4_t, s16)
-VLOAD_IMPL(uint32_t, uint32x2_t, u32)
-VLOAD_IMPL(int32_t, int32x2_t, s32)
-//VLOAD_IMPL(uint64_t, uint64x1_t, u64)
-//VLOAD_IMPL(int64_t, int64x1_t, s64)
-VLOAD_IMPL(float, float32x2_t, f32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VLOAD_IMPL(float16_t, float16x4_t, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-#define VLOADQ_IMPL(stype, vtype, postfix) \
-    inline vtype vloadq(const stype *ptr)  \
-    {                                      \
-        return vld1q_##postfix(ptr);       \
-    }
-
-VLOADQ_IMPL(uint8_t, uint8x16_t, u8)
-VLOADQ_IMPL(int8_t, int8x16_t, s8)
-VLOADQ_IMPL(uint16_t, uint16x8_t, u16)
-VLOADQ_IMPL(int16_t, int16x8_t, s16)
-VLOADQ_IMPL(uint32_t, uint32x4_t, u32)
-VLOADQ_IMPL(int32_t, int32x4_t, s32)
-//VLOAD_IMPL(uint64_t, uint64x1_t, u64)
-//VLOAD_IMPL(int64_t, int64x1_t, s64)
-VLOADQ_IMPL(float, float32x4_t, f32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VLOADQ_IMPL(float16_t, float16x8_t, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-#undef VLOAD_IMPL
-} // namespace wrapper
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_WRAPPER_LOAD_H */
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/log.h b/arm_compute/core/NEON/wrapper/intrinsics/log.h
deleted file mode 100644
index 682830c122..0000000000
--- a/arm_compute/core/NEON/wrapper/intrinsics/log.h
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_WRAPPER_LOG_H
-#define ARM_COMPUTE_WRAPPER_LOG_H
-
-#include "arm_compute/core/NEON/NEMath.h"
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace wrapper
-{
-#define VLOG_IMPL(vtype, prefix, postfix) \
-    inline vtype vlog(const vtype &a)     \
-    {                                     \
-        return prefix##_##postfix(a);     \
-    }
-
-#define VLOG_IMPL_INT(vtype, prefix, postfix) \
-    inline vtype vlog(const vtype &a)         \
-    {                                         \
-        ARM_COMPUTE_UNUSED(a);                \
-        ARM_COMPUTE_ERROR("Not supported");   \
-    }
-
-VLOG_IMPL(float32x4_t, vlogq, f32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VLOG_IMPL(float16x8_t, vlogq, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VLOG_IMPL_INT(int32x4_t, vlogq, s32)
-
-#undef VLOG_IMPL
-} // namespace wrapper
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_WRAPPER_LOG_H */
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/max.h b/arm_compute/core/NEON/wrapper/intrinsics/max.h
deleted file mode 100644
index a87b7a32b5..0000000000
--- a/arm_compute/core/NEON/wrapper/intrinsics/max.h
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_WRAPPER_MAX_H
-#define ARM_COMPUTE_WRAPPER_MAX_H
-
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace wrapper
-{
-#define VMAX_IMPL(stype, vtype, prefix, postfix)      \
-    inline vtype vmax(const vtype &a, const vtype &b) \
-    {                                                 \
-        return prefix##_##postfix(a, b);              \
-    }
-
-VMAX_IMPL(uint8_t, uint8x8_t, vmax, u8)
-VMAX_IMPL(int8_t, int8x8_t, vmax, s8)
-VMAX_IMPL(uint16_t, uint16x4_t, vmax, u16)
-VMAX_IMPL(int16_t, int16x4_t, vmax, s16)
-VMAX_IMPL(uint32_t, uint32x2_t, vmax, u32)
-VMAX_IMPL(int32_t, int32x2_t, vmax, s32)
-VMAX_IMPL(float, float32x2_t, vmax, f32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VMAX_IMPL(float16_t, float16x4_t, vmax, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-VMAX_IMPL(uint8_t, uint8x16_t, vmaxq, u8)
-VMAX_IMPL(int8_t, int8x16_t, vmaxq, s8)
-VMAX_IMPL(uint16_t, uint16x8_t, vmaxq, u16)
-VMAX_IMPL(int16_t, int16x8_t, vmaxq, s16)
-VMAX_IMPL(uint32_t, uint32x4_t, vmaxq, u32)
-VMAX_IMPL(int32_t, int32x4_t, vmaxq, s32)
-VMAX_IMPL(float, float32x4_t, vmaxq, f32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VMAX_IMPL(float16_t, float16x8_t, vmaxq, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-#undef VMAX_IMPL
-} // namespace wrapper
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_WRAPPER_MAX_H */
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/min.h b/arm_compute/core/NEON/wrapper/intrinsics/min.h
deleted file mode 100644
index dc8a127e82..0000000000
--- a/arm_compute/core/NEON/wrapper/intrinsics/min.h
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_WRAPPER_MIN_H
-#define ARM_COMPUTE_WRAPPER_MIN_H
-
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace wrapper
-{
-#define VMIN_IMPL(stype, vtype, prefix, postfix)      \
-    inline vtype vmin(const vtype &a, const vtype &b) \
-    {                                                 \
-        return prefix##_##postfix(a, b);              \
-    }
-
-VMIN_IMPL(uint8_t, uint8x8_t, vmin, u8)
-VMIN_IMPL(int8_t, int8x8_t, vmin, s8)
-VMIN_IMPL(uint16_t, uint16x4_t, vmin, u16)
-VMIN_IMPL(int16_t, int16x4_t, vmin, s16)
-VMIN_IMPL(uint32_t, uint32x2_t, vmin, u32)
-VMIN_IMPL(int32_t, int32x2_t, vmin, s32)
-VMIN_IMPL(float, float32x2_t, vmin, f32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VMIN_IMPL(float16_t, float16x4_t, vmin, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-VMIN_IMPL(uint8_t, uint8x16_t, vminq, u8)
-VMIN_IMPL(int8_t, int8x16_t, vminq, s8)
-VMIN_IMPL(uint16_t, uint16x8_t, vminq, u16)
-VMIN_IMPL(int16_t, int16x8_t, vminq, s16)
-VMIN_IMPL(uint32_t, uint32x4_t, vminq, u32)
-VMIN_IMPL(int32_t, int32x4_t, vminq, s32)
-VMIN_IMPL(float, float32x4_t, vminq, f32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VMIN_IMPL(float16_t, float16x8_t, vminq, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-#undef VMIN_IMPL
-} // namespace wrapper
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_WRAPPER_MIN_H */
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/mla.h b/arm_compute/core/NEON/wrapper/intrinsics/mla.h
deleted file mode 100644
index dd2f0c0d9d..0000000000
--- a/arm_compute/core/NEON/wrapper/intrinsics/mla.h
+++ /dev/null
@@ -1,71 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_WRAPPER_MLA_H
-#define ARM_COMPUTE_WRAPPER_MLA_H
-
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace wrapper
-{
-#define VMLA_IMPL(stype, vtype, prefix, postfix)                      \
-    inline vtype vmla(const vtype &a, const vtype &b, const vtype &c) \
-    {                                                                 \
-        return prefix##_##postfix(a, b, c);                           \
-    }
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-#define VMLA_IMPL2(stype, vtype, prefix1, prefix2, postfix)           \
-    inline vtype vmla(const vtype &a, const vtype &b, const vtype &c) \
-    {                                                                 \
-        return prefix1##_##postfix(a, prefix2##_##postfix(b, c));     \
-    }
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-VMLA_IMPL(uint8x8_t, uint8x8_t, vmla, u8)
-VMLA_IMPL(int8x8_t, int8x8_t, vmla, s8)
-VMLA_IMPL(uint16x4_t, uint16x4_t, vmla, u16)
-VMLA_IMPL(int16x4_t, int16x4_t, vmla, s16)
-VMLA_IMPL(uint32x2_t, uint32x2_t, vmla, u32)
-VMLA_IMPL(int32x2_t, int32x2_t, vmla, s32)
-VMLA_IMPL(float32x2_t, float32x2_t, vmla, f32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VMLA_IMPL2(float16x4_t, float16x4_t, vadd, vmul, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-VMLA_IMPL(uint8x16_t, uint8x16_t, vmlaq, u8)
-VMLA_IMPL(int8x16_t, int8x16_t, vmlaq, s8)
-VMLA_IMPL(uint16x8_t, uint16x8_t, vmlaq, u16)
-VMLA_IMPL(int16x8_t, int16x8_t, vmlaq, s16)
-VMLA_IMPL(uint32x4_t, uint32x4_t, vmlaq, u32)
-VMLA_IMPL(int32x4_t, int32x4_t, vmlaq, s32)
-VMLA_IMPL(float32x4_t, float32x4_t, vmlaq, f32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VMLA_IMPL2(float16x8_t, float16x8_t, vaddq, vmulq, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-#undef VMLA_IMPL
-} // namespace wrapper
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_WRAPPER_MLA_H */
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/movl.h b/arm_compute/core/NEON/wrapper/intrinsics/movl.h
deleted file mode 100644
index 982a795924..0000000000
--- a/arm_compute/core/NEON/wrapper/intrinsics/movl.h
+++ /dev/null
@@ -1,49 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_WRAPPER_MOVL_H
-#define ARM_COMPUTE_WRAPPER_MOVL_H
-
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace wrapper
-{
-#define VMOVL_IMPL(ptype, vtype, prefix, postfix) \
-    inline ptype vmovl(const vtype &a)            \
-    {                                             \
-        return prefix##_##postfix(a);             \
-    }
-
-VMOVL_IMPL(uint16x8_t, uint8x8_t, vmovl, u8)
-VMOVL_IMPL(int16x8_t, int8x8_t, vmovl, s8)
-VMOVL_IMPL(uint32x4_t, uint16x4_t, vmovl, u16)
-VMOVL_IMPL(int32x4_t, int16x4_t, vmovl, s16)
-VMOVL_IMPL(uint64x2_t, uint32x2_t, vmovl, u32)
-VMOVL_IMPL(int64x2_t, int32x2_t, vmovl, s32)
-
-#undef VMOVL_IMPL
-} // namespace wrapper
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_WRAPPER_MOVL_H */
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/movn.h b/arm_compute/core/NEON/wrapper/intrinsics/movn.h
deleted file mode 100644
index 23360e2597..0000000000
--- a/arm_compute/core/NEON/wrapper/intrinsics/movn.h
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_WRAPPER_MOVN_H
-#define ARM_COMPUTE_WRAPPER_MOVN_H
-
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace wrapper
-{
-#define VMOVN_IMPL(dtype, vtype, prefix, postfix) \
-    inline dtype vmovn(const vtype &a)            \
-    {                                             \
-        return prefix##_##postfix(a);             \
-    }
-
-VMOVN_IMPL(uint32x2_t, uint64x2_t, vmovn, u64)
-VMOVN_IMPL(int32x2_t, int64x2_t, vmovn, s64)
-VMOVN_IMPL(uint16x4_t, uint32x4_t, vmovn, u32)
-VMOVN_IMPL(int16x4_t, int32x4_t, vmovn, s32)
-VMOVN_IMPL(uint8x8_t, uint16x8_t, vmovn, u16)
-VMOVN_IMPL(int8x8_t, int16x8_t, vmovn, s16)
-
-#define VQMOVN_IMPL(dtype, vtype, prefix, postfix) \
-    inline dtype vqmovn(const vtype &a)            \
-    {                                              \
-        return prefix##_##postfix(a);              \
-    }
-
-VQMOVN_IMPL(uint32x2_t, uint64x2_t, vqmovn, u64)
-VQMOVN_IMPL(int32x2_t, int64x2_t, vqmovn, s64)
-VQMOVN_IMPL(uint16x4_t, uint32x4_t, vqmovn, u32)
-VQMOVN_IMPL(int16x4_t, int32x4_t, vqmovn, s32)
-VQMOVN_IMPL(uint8x8_t, uint16x8_t, vqmovn, u16)
-VQMOVN_IMPL(int8x8_t, int16x8_t, vqmovn, s16)
-
-#undef VMOVN_IMPL
-} // namespace wrapper
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_WRAPPER_MOVN_H */
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/mul.h b/arm_compute/core/NEON/wrapper/intrinsics/mul.h
deleted file mode 100644
index bbf70abac9..0000000000
--- a/arm_compute/core/NEON/wrapper/intrinsics/mul.h
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_WRAPPER_MUL_H
-#define ARM_COMPUTE_WRAPPER_MUL_H
-
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace wrapper
-{
-#define VMUL_IMPL(stype, vtype, prefix, postfix)      \
-    inline vtype vmul(const vtype &a, const vtype &b) \
-    {                                                 \
-        return prefix##_##postfix(a, b);              \
-    }
-
-VMUL_IMPL(uint8x8_t, uint8x8_t, vmul, u8)
-VMUL_IMPL(int8x8_t, int8x8_t, vmul, s8)
-VMUL_IMPL(uint16x4_t, uint16x4_t, vmul, u16)
-VMUL_IMPL(int16x4_t, int16x4_t, vmul, s16)
-VMUL_IMPL(uint32x2_t, uint32x2_t, vmul, u32)
-VMUL_IMPL(int32x2_t, int32x2_t, vmul, s32)
-VMUL_IMPL(float32x2_t, float32x2_t, vmul, f32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VMUL_IMPL(float16_t, float16x4_t, vmul, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-VMUL_IMPL(uint8_t, uint8x16_t, vmulq, u8)
-VMUL_IMPL(int8_t, int8x16_t, vmulq, s8)
-VMUL_IMPL(uint16_t, uint16x8_t, vmulq, u16)
-VMUL_IMPL(int16_t, int16x8_t, vmulq, s16)
-VMUL_IMPL(uint32_t, uint32x4_t, vmulq, u32)
-VMUL_IMPL(int32_t, int32x4_t, vmulq, s32)
-VMUL_IMPL(float32x4_t, float32x4_t, vmulq, f32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VMUL_IMPL(float16_t, float16x8_t, vmulq, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-#undef VMUL_IMPL
-} // namespace wrapper
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_WRAPPER_MUL_H */
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/neg.h b/arm_compute/core/NEON/wrapper/intrinsics/neg.h
deleted file mode 100644
index da2f285eca..0000000000
--- a/arm_compute/core/NEON/wrapper/intrinsics/neg.h
+++ /dev/null
@@ -1,58 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_WRAPPER_NEG_H
-#define ARM_COMPUTE_WRAPPER_NEG_H
-
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace wrapper
-{
-#define VNEG_IMPL(vtype, prefix, postfix) \
-    inline vtype vneg(const vtype &a)     \
-    {                                     \
-        return prefix##_##postfix(a);     \
-    }
-
-VNEG_IMPL(int8x8_t, vneg, s8)
-VNEG_IMPL(int16x4_t, vneg, s16)
-VNEG_IMPL(int32x2_t, vneg, s32)
-VNEG_IMPL(float32x2_t, vneg, f32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VNEG_IMPL(float16x4_t, vneg, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-VNEG_IMPL(int8x16_t, vnegq, s8)
-VNEG_IMPL(int16x8_t, vnegq, s16)
-VNEG_IMPL(int32x4_t, vnegq, s32)
-VNEG_IMPL(float32x4_t, vnegq, f32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VNEG_IMPL(float16x8_t, vnegq, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-#undef VNEG_IMPL
-} // namespace wrapper
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_WRAPPER_NEG_H */
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/not.h b/arm_compute/core/NEON/wrapper/intrinsics/not.h
deleted file mode 100644
index 5b1e4056ca..0000000000
--- a/arm_compute/core/NEON/wrapper/intrinsics/not.h
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_WRAPPER_NOT_H
-#define ARM_COMPUTE_WRAPPER_NOT_H
-
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace wrapper
-{
-#define VNOT_IMPL(stype, vtype, prefix, postfix) \
-    inline vtype vnot(const vtype &a)            \
-    {                                            \
-        return prefix##_##postfix(a);            \
-    }
-
-VNOT_IMPL(uint8_t, uint8x8_t, vmvn, u8)
-VNOT_IMPL(int8_t, int8x8_t, vmvn, s8)
-VNOT_IMPL(uint16_t, uint16x4_t, vmvn, u16)
-VNOT_IMPL(int16_t, int16x4_t, vmvn, s16)
-VNOT_IMPL(uint32_t, uint32x2_t, vmvn, u32)
-VNOT_IMPL(int32_t, int32x2_t, vmvn, s32)
-VNOT_IMPL(float32x2_t, float32x2_t, vinv, f32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VNOT_IMPL(float16x4_t, float16x4_t, vinv, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-VNOT_IMPL(uint8_t, uint8x16_t, vmvnq, u8)
-VNOT_IMPL(int8_t, int8x16_t, vmvnq, s8)
-VNOT_IMPL(uint16_t, uint16x8_t, vmvnq, u16)
-VNOT_IMPL(int16_t, int16x8_t, vmvnq, s16)
-VNOT_IMPL(uint32_t, uint32x4_t, vmvnq, u32)
-VNOT_IMPL(int32_t, int32x4_t, vmvnq, s32)
-VNOT_IMPL(float32x4_t, float32x4_t, vinvq, f32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VNOT_IMPL(float16x8_t, float16x8_t, vinvq, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-#undef VNOT_IMPL
-} // namespace wrapper
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_WRAPPER_NOT_H */
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/orr.h b/arm_compute/core/NEON/wrapper/intrinsics/orr.h
deleted file mode 100644
index 0fbdd44c76..0000000000
--- a/arm_compute/core/NEON/wrapper/intrinsics/orr.h
+++ /dev/null
@@ -1,60 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_WRAPPER_ORR_H
-#define ARM_COMPUTE_WRAPPER_ORR_H
-
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace wrapper
-{
-#define VORR_IMPL(stype, vtype, prefix, postfix)      \
-    inline vtype vorr(const vtype &a, const vtype &b) \
-    {                                                 \
-        return prefix##_##postfix(a, b);              \
-    }
-
-VORR_IMPL(uint8_t, uint8x8_t, vorr, u8)
-VORR_IMPL(int8_t, int8x8_t, vorr, s8)
-VORR_IMPL(uint16_t, uint16x4_t, vorr, u16)
-VORR_IMPL(int16_t, int16x4_t, vorr, s16)
-VORR_IMPL(uint32_t, uint32x2_t, vorr, u32)
-VORR_IMPL(int32_t, int32x2_t, vorr, s32)
-VORR_IMPL(uint64_t, uint64x1_t, vorr, u64)
-VORR_IMPL(int64_t, int64x1_t, vorr, s64)
-
-VORR_IMPL(uint8_t, uint8x16_t, vorrq, u8)
-VORR_IMPL(int8_t, int8x16_t, vorrq, s8)
-VORR_IMPL(uint16_t, uint16x8_t, vorrq, u16)
-VORR_IMPL(int16_t, int16x8_t, vorrq, s16)
-VORR_IMPL(uint32_t, uint32x4_t, vorrq, u32)
-VORR_IMPL(int32_t, int32x4_t, vorrq, s32)
-VORR_IMPL(uint64_t, uint64x2_t, vorrq, u64)
-VORR_IMPL(int64_t, int64x2_t, vorrq, s64)
-
-#undef VORR_IMPL
-} // namespace wrapper
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_WRAPPER_ORR_H */
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/pmax.h b/arm_compute/core/NEON/wrapper/intrinsics/pmax.h
deleted file mode 100644
index afad27f1e4..0000000000
--- a/arm_compute/core/NEON/wrapper/intrinsics/pmax.h
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_WRAPPER_PMAX_H
-#define ARM_COMPUTE_WRAPPER_PMAX_H
-
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace wrapper
-{
-#define VPMAX_IMPL(stype, vtype, prefix, postfix)      \
-    inline vtype vpmax(const vtype &a, const vtype &b) \
-    {                                                  \
-        return prefix##_##postfix(a, b);               \
-    }
-
-VPMAX_IMPL(uint8_t, uint8x8_t, vpmax, u8)
-VPMAX_IMPL(int8_t, int8x8_t, vpmax, s8)
-VPMAX_IMPL(uint16_t, uint16x4_t, vpmax, u16)
-VPMAX_IMPL(int16_t, int16x4_t, vpmax, s16)
-VPMAX_IMPL(uint32_t, uint32x2_t, vpmax, u32)
-VPMAX_IMPL(int32_t, int32x2_t, vpmax, s32)
-VPMAX_IMPL(float, float32x2_t, vpmax, f32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VPMAX_IMPL(float16_t, float16x4_t, vpmax, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-#undef VPMAX_IMPL
-} // namespace wrapper
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_WRAPPER_PMAX_H */
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/pmin.h b/arm_compute/core/NEON/wrapper/intrinsics/pmin.h
deleted file mode 100644
index 77c5cf61ba..0000000000
--- a/arm_compute/core/NEON/wrapper/intrinsics/pmin.h
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_WRAPPER_PMIN_H
-#define ARM_COMPUTE_WRAPPER_PMIN_H
-
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace wrapper
-{
-#define VPMIN_IMPL(stype, vtype, prefix, postfix)      \
-    inline vtype vpmin(const vtype &a, const vtype &b) \
-    {                                                  \
-        return prefix##_##postfix(a, b);               \
-    }
-
-VPMIN_IMPL(uint8_t, uint8x8_t, vpmin, u8)
-VPMIN_IMPL(int8_t, int8x8_t, vpmin, s8)
-VPMIN_IMPL(uint16_t, uint16x4_t, vpmin, u16)
-VPMIN_IMPL(int16_t, int16x4_t, vpmin, s16)
-VPMIN_IMPL(uint32_t, uint32x2_t, vpmin, u32)
-VPMIN_IMPL(int32_t, int32x2_t, vpmin, s32)
-VPMIN_IMPL(float, float32x2_t, vpmin, f32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VPMIN_IMPL(float16_t, float16x4_t, vpmin, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-#undef VPMIN_IMPL
-} // namespace wrapper
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_WRAPPER_PMIN_H */
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/pow.h b/arm_compute/core/NEON/wrapper/intrinsics/pow.h
deleted file mode 100644
index 1b5d62df5e..0000000000
--- a/arm_compute/core/NEON/wrapper/intrinsics/pow.h
+++ /dev/null
@@ -1,48 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_WRAPPER_POW_H
-#define ARM_COMPUTE_WRAPPER_POW_H
-
-#include "arm_compute/core/NEON/NEMath.h"
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace wrapper
-{
-#define VPOW_IMPL(vtype, prefix, postfix)             \
-    inline vtype vpow(const vtype &a, const vtype &b) \
-    {                                                 \
-        return prefix##_##postfix(a, b);              \
-    }
-
-VPOW_IMPL(float32x4_t, vpowq, f32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VPOW_IMPL(float16x8_t, vpowq, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-#undef VPOW_IMPL
-} // namespace wrapper
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_WRAPPER_POW_H */
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/qmovun.h b/arm_compute/core/NEON/wrapper/intrinsics/qmovun.h
deleted file mode 100644
index a0347020db..0000000000
--- a/arm_compute/core/NEON/wrapper/intrinsics/qmovun.h
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
- * Copyright (c) 2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_WRAPPER_QMOVUN_H
-#define ARM_COMPUTE_WRAPPER_QMOVUN_H
-
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace wrapper
-{
-#define VQMOVUN_IMPL(dtype, vtype, prefix, postfix) \
-    inline dtype vqmovun(const vtype &a)            \
-    {                                               \
-        return prefix##_##postfix(a);               \
-    }
-
-VQMOVUN_IMPL(uint32x2_t, int64x2_t, vqmovun, s64)
-VQMOVUN_IMPL(uint16x4_t, int32x4_t, vqmovun, s32)
-VQMOVUN_IMPL(uint8x8_t, int16x8_t, vqmovun, s16)
-
-#undef VQMOVUN_IMPL
-} // namespace wrapper
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_WRAPPER_QMOVUN_H */
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/reinterpret.h b/arm_compute/core/NEON/wrapper/intrinsics/reinterpret.h
deleted file mode 100644
index 579da344a7..0000000000
--- a/arm_compute/core/NEON/wrapper/intrinsics/reinterpret.h
+++ /dev/null
@@ -1,49 +0,0 @@
-/*
- * Copyright (c) 2019-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_WRAPPER_REINTERPRET_H
-#define ARM_COMPUTE_WRAPPER_REINTERPRET_H
-
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace wrapper
-{
-#define VREINTERPRET_IMPL(ptype, vtype, prefix, postfix1, postfix2) \
-    inline ptype vreinterpret(const vtype &a)                       \
-    {                                                               \
-        return prefix##_##postfix1##_##postfix2(a);                 \
-    }                                                               \
-    \
-    inline ptype vreinterpret(const ptype &a)                       \
-    {                                                               \
-        return a;                                                   \
-    }
-
-VREINTERPRET_IMPL(int16x4_t, uint16x4_t, vreinterpret, s16, u16)
-
-VREINTERPRET_IMPL(int32x4_t, uint32x4_t, vreinterpretq, s32, u32)
-} // namespace wrapper
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_WRAPPER_REINTERPRET_H */
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/rev64.h b/arm_compute/core/NEON/wrapper/intrinsics/rev64.h
deleted file mode 100644
index 0385704f3f..0000000000
--- a/arm_compute/core/NEON/wrapper/intrinsics/rev64.h
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_WRAPPER_REV64_H
-#define ARM_COMPUTE_WRAPPER_REV64_H
-
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace wrapper
-{
-#define VREV64_IMPL(vtype, prefix, postfix) \
-    inline vtype vrev64(const vtype &a)     \
-    {                                       \
-        return prefix##_##postfix(a);       \
-    }
-
-VREV64_IMPL(uint8x8_t, vrev64, u8)
-VREV64_IMPL(int8x8_t, vrev64, s8)
-VREV64_IMPL(uint16x4_t, vrev64, u16)
-VREV64_IMPL(int16x4_t, vrev64, s16)
-VREV64_IMPL(uint32x2_t, vrev64, u32)
-VREV64_IMPL(int32x2_t, vrev64, s32)
-VREV64_IMPL(float32x2_t, vrev64, f32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VREV64_IMPL(float16x4_t, vrev64, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-VREV64_IMPL(uint8x16_t, vrev64q, u8)
-VREV64_IMPL(int8x16_t, vrev64q, s8)
-VREV64_IMPL(uint16x8_t, vrev64q, u16)
-VREV64_IMPL(int16x8_t, vrev64q, s16)
-VREV64_IMPL(uint32x4_t, vrev64q, u32)
-VREV64_IMPL(int32x4_t, vrev64q, s32)
-VREV64_IMPL(float32x4_t, vrev64q, f32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VREV64_IMPL(float16x8_t, vrev64q, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-#undef VREV64_IMPL
-} // namespace wrapper
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_WRAPPER_REV64_H */
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/round.h b/arm_compute/core/NEON/wrapper/intrinsics/round.h
deleted file mode 100644
index d6f5a88689..0000000000
--- a/arm_compute/core/NEON/wrapper/intrinsics/round.h
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * Copyright (c) 2019-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_WRAPPER_ROUND_H
-#define ARM_COMPUTE_WRAPPER_ROUND_H
-
-#include "arm_compute/core/NEON/NEMath.h"
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace wrapper
-{
-#define VROUNDQ_IMPL(vtype, postfix)     \
-    inline vtype vround(const vtype &a)  \
-    {                                    \
-        return vroundq_rte_##postfix(a); \
-    }
-
-#define VROUNDQ_IMPL_INT(vtype, postfix)    \
-    inline vtype vround(const vtype &a)     \
-    {                                       \
-        ARM_COMPUTE_UNUSED(a);              \
-        ARM_COMPUTE_ERROR("Not supported"); \
-    }
-
-VROUNDQ_IMPL(float32x4_t, f32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VROUNDQ_IMPL(float16x8_t, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VROUNDQ_IMPL_INT(int32x4_t, s32)
-#undef VROUNDQ_IMPL
-
-} // namespace wrapper
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_WRAPPER_ROUND_H */
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/setlane.h b/arm_compute/core/NEON/wrapper/intrinsics/setlane.h
deleted file mode 100644
index 6332f3025e..0000000000
--- a/arm_compute/core/NEON/wrapper/intrinsics/setlane.h
+++ /dev/null
@@ -1,208 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_WRAPPER_SET_LANE_H
-#define ARM_COMPUTE_WRAPPER_SET_LANE_H
-
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace wrapper
-{
-#define VSETLANE_IMPL_8(stype, atype, vtype, postfix)                                     \
-    inline stype vsetlane(const atype value, const vtype vector, const unsigned int lane) \
-    {                                                                                     \
-        switch(lane)                                                                      \
-        {                                                                                 \
-            case 0:                                                                       \
-                return vset_lane_##postfix(value, vector, 0);                             \
-            case 1:                                                                       \
-                return vset_lane_##postfix(value, vector, 1);                             \
-            case 2:                                                                       \
-                return vset_lane_##postfix(value, vector, 2);                             \
-            case 3:                                                                       \
-                return vset_lane_##postfix(value, vector, 3);                             \
-            case 4:                                                                       \
-                return vset_lane_##postfix(value, vector, 4);                             \
-            case 5:                                                                       \
-                return vset_lane_##postfix(value, vector, 5);                             \
-            case 6:                                                                       \
-                return vset_lane_##postfix(value, vector, 6);                             \
-            case 7:                                                                       \
-                return vset_lane_##postfix(value, vector, 7);                             \
-            default:                                                                      \
-                ARM_COMPUTE_ERROR("Invalid lane");                                        \
-        }                                                                                 \
-    }
-
-#define VSETLANE_IMPL_4(stype, atype, vtype, postfix)                                     \
-    inline stype vsetlane(const atype value, const vtype vector, const unsigned int lane) \
-    {                                                                                     \
-        switch(lane)                                                                      \
-        {                                                                                 \
-            case 0:                                                                       \
-                return vset_lane_##postfix(value, vector, 0);                             \
-            case 1:                                                                       \
-                return vset_lane_##postfix(value, vector, 1);                             \
-            case 2:                                                                       \
-                return vset_lane_##postfix(value, vector, 2);                             \
-            case 3:                                                                       \
-                return vset_lane_##postfix(value, vector, 3);                             \
-            default:                                                                      \
-                ARM_COMPUTE_ERROR("Invalid lane");                                        \
-        }                                                                                 \
-    }
-
-#define VSETLANE_IMPL_2(stype, atype, vtype, postfix)                                     \
-    inline stype vsetlane(const atype value, const vtype vector, const unsigned int lane) \
-    {                                                                                     \
-        switch(lane)                                                                      \
-        {                                                                                 \
-            case 0:                                                                       \
-                return vset_lane_##postfix(value, vector, 0);                             \
-            case 1:                                                                       \
-                return vset_lane_##postfix(value, vector, 1);                             \
-            default:                                                                      \
-                ARM_COMPUTE_ERROR("Invalid lane");                                        \
-        }                                                                                 \
-    }
-
-VSETLANE_IMPL_8(uint8x8_t, uint8_t, uint8x8_t, u8)
-VSETLANE_IMPL_8(int8x8_t, int8_t, int8x8_t, s8)
-VSETLANE_IMPL_4(uint16x4_t, uint16_t, uint16x4_t, u16)
-VSETLANE_IMPL_4(int16x4_t, int16_t, int16x4_t, s16)
-VSETLANE_IMPL_2(uint32x2_t, uint32_t, uint32x2_t, u32)
-VSETLANE_IMPL_2(int32x2_t, int32_t, int32x2_t, s32)
-VSETLANE_IMPL_2(float32x2_t, float, float32x2_t, f32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VSETLANE_IMPL_4(float16x4_t, float16_t, float16x4_t, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-#define VSETQLANE_IMPL_16(stype, atype, vtype, postfix)                                   \
-    inline stype vsetlane(const atype value, const vtype vector, const unsigned int lane) \
-    {                                                                                     \
-        switch(lane)                                                                      \
-        {                                                                                 \
-            case 0:                                                                       \
-                return vsetq_lane_##postfix(value, vector, 0);                            \
-            case 1:                                                                       \
-                return vsetq_lane_##postfix(value, vector, 1);                            \
-            case 2:                                                                       \
-                return vsetq_lane_##postfix(value, vector, 2);                            \
-            case 3:                                                                       \
-                return vsetq_lane_##postfix(value, vector, 3);                            \
-            case 4:                                                                       \
-                return vsetq_lane_##postfix(value, vector, 4);                            \
-            case 5:                                                                       \
-                return vsetq_lane_##postfix(value, vector, 5);                            \
-            case 6:                                                                       \
-                return vsetq_lane_##postfix(value, vector, 6);                            \
-            case 7:                                                                       \
-                return vsetq_lane_##postfix(value, vector, 7);                            \
-            case 8:                                                                       \
-                return vsetq_lane_##postfix(value, vector, 8);                            \
-            case 9:                                                                       \
-                return vsetq_lane_##postfix(value, vector, 9);                            \
-            case 10:                                                                      \
-                return vsetq_lane_##postfix(value, vector, 10);                           \
-            case 11:                                                                      \
-                return vsetq_lane_##postfix(value, vector, 11);                           \
-            case 12:                                                                      \
-                return vsetq_lane_##postfix(value, vector, 12);                           \
-            case 13:                                                                      \
-                return vsetq_lane_##postfix(value, vector, 13);                           \
-            case 14:                                                                      \
-                return vsetq_lane_##postfix(value, vector, 14);                           \
-            case 15:                                                                      \
-                return vsetq_lane_##postfix(value, vector, 15);                           \
-            default:                                                                      \
-                ARM_COMPUTE_ERROR("Invalid lane");                                        \
-        }                                                                                 \
-    }
-
-#define VSETQLANE_IMPL_8(stype, atype, vtype, postfix)                                    \
-    inline stype vsetlane(const atype value, const vtype vector, const unsigned int lane) \
-    {                                                                                     \
-        switch(lane)                                                                      \
-        {                                                                                 \
-            case 0:                                                                       \
-                return vsetq_lane_##postfix(value, vector, 0);                            \
-            case 1:                                                                       \
-                return vsetq_lane_##postfix(value, vector, 1);                            \
-            case 2:                                                                       \
-                return vsetq_lane_##postfix(value, vector, 2);                            \
-            case 3:                                                                       \
-                return vsetq_lane_##postfix(value, vector, 3);                            \
-            case 4:                                                                       \
-                return vsetq_lane_##postfix(value, vector, 4);                            \
-            case 5:                                                                       \
-                return vsetq_lane_##postfix(value, vector, 5);                            \
-            case 6:                                                                       \
-                return vsetq_lane_##postfix(value, vector, 6);                            \
-            case 7:                                                                       \
-                return vsetq_lane_##postfix(value, vector, 7);                            \
-            default:                                                                      \
-                ARM_COMPUTE_ERROR("Invalid lane");                                        \
-        }                                                                                 \
-    }
-
-#define VSETQLANE_IMPL_4(stype, atype, vtype, postfix)                                    \
-    inline stype vsetlane(const atype value, const vtype vector, const unsigned int lane) \
-    {                                                                                     \
-        switch(lane)                                                                      \
-        {                                                                                 \
-            case 0:                                                                       \
-                return vsetq_lane_##postfix(value, vector, 0);                            \
-            case 1:                                                                       \
-                return vsetq_lane_##postfix(value, vector, 1);                            \
-            case 2:                                                                       \
-                return vsetq_lane_##postfix(value, vector, 2);                            \
-            case 3:                                                                       \
-                return vsetq_lane_##postfix(value, vector, 3);                            \
-            default:                                                                      \
-                ARM_COMPUTE_ERROR("Invalid lane");                                        \
-        }                                                                                 \
-    }
-
-VSETQLANE_IMPL_16(uint8x16_t, uint8_t, uint8x16_t, u8)
-VSETQLANE_IMPL_16(int8x16_t, int8_t, int8x16_t, s8)
-VSETQLANE_IMPL_8(uint16x8_t, uint16_t, uint16x8_t, u16)
-VSETQLANE_IMPL_8(int16x8_t, int16_t, int16x8_t, s16)
-VSETQLANE_IMPL_4(uint32x4_t, uint32_t, uint32x4_t, u32)
-VSETQLANE_IMPL_4(int32x4_t, int32_t, int32x4_t, s32)
-VSETQLANE_IMPL_4(float32x4_t, float, float32x4_t, f32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VSETQLANE_IMPL_8(float16x8_t, float16_t, float16x8_t, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-#undef VSETLANE_IMPL_8
-#undef VSETLANE_IMPL_4
-#undef VSETLANE_IMPL_2
-
-#undef VSETQLANE_IMPL_16
-#undef VSETQLANE_IMPL_8
-#undef VSETQLANE_IMPL_4
-} // namespace wrapper
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_WRAPPER_SET_LANE_H */
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/sin.h b/arm_compute/core/NEON/wrapper/intrinsics/sin.h
deleted file mode 100644
index bca72db38a..0000000000
--- a/arm_compute/core/NEON/wrapper/intrinsics/sin.h
+++ /dev/null
@@ -1,57 +0,0 @@
-/*
- * Copyright (c) 2019-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_WRAPPER_SIN_H
-#define ARM_COMPUTE_WRAPPER_SIN_H
-
-#include "arm_compute/core/NEON/NEMath.h"
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace wrapper
-{
-#define VSIN_IMPL(vtype, prefix, postfix) \
-    inline vtype vsin(const vtype &a)     \
-    {                                     \
-        return prefix##_##postfix(a);     \
-    }
-
-#define VSIN_IMPL_INT(vtype, prefix, postfix) \
-    inline vtype vsin(const vtype &a)         \
-    {                                         \
-        ARM_COMPUTE_UNUSED(a);                \
-        ARM_COMPUTE_ERROR("Not supported");   \
-    }
-
-VSIN_IMPL(float32x4_t, vsinq, f32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VSIN_IMPL(float16x8_t, vsinq, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-VSIN_IMPL_INT(int32x4_t, vsinq, s32)
-
-#undef vsub_IMPL
-} // namespace wrapper
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_WRAPPER_SUB_H */
-\ No newline at end of file
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/store.h b/arm_compute/core/NEON/wrapper/intrinsics/store.h
deleted file mode 100644
index eb2ae6a5e1..0000000000
--- a/arm_compute/core/NEON/wrapper/intrinsics/store.h
+++ /dev/null
@@ -1,70 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_WRAPPER_STORE_H
-#define ARM_COMPUTE_WRAPPER_STORE_H
-
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace wrapper
-{
-#define VSTORE_IMPL(stype, vtype, prefix, postfix) \
-    inline void vstore(stype *ptr, vtype val)      \
-    {                                              \
-        prefix##_##postfix(ptr, val);              \
-    }
-
-VSTORE_IMPL(uint8_t, uint8x8_t, vst1, u8)
-VSTORE_IMPL(uint8_t, uint8x8x2_t, vst2, u8)
-VSTORE_IMPL(int8_t, int8x8_t, vst1, s8)
-VSTORE_IMPL(int8_t, int8x8x2_t, vst2, s8)
-VSTORE_IMPL(uint16_t, uint16x4_t, vst1, u16)
-VSTORE_IMPL(int16_t, int16x4_t, vst1, s16)
-VSTORE_IMPL(uint32_t, uint32x2_t, vst1, u32)
-VSTORE_IMPL(int32_t, int32x2_t, vst1, s32)
-//VSTORE_IMPL(uint64_t, 1, vst1, u64)
-//VSTORE_IMPL(int64_t, 1, vst1, s64)
-VSTORE_IMPL(float, float32x2_t, vst1, f32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VSTORE_IMPL(float16_t, float16x4_t, vst1, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-VSTORE_IMPL(uint8_t, uint8x16_t, vst1q, u8)
-VSTORE_IMPL(int8_t, int8x16_t, vst1q, s8)
-VSTORE_IMPL(uint16_t, uint16x8_t, vst1q, u16)
-VSTORE_IMPL(int16_t, int16x8_t, vst1q, s16)
-VSTORE_IMPL(uint32_t, uint32x4_t, vst1q, u32)
-VSTORE_IMPL(int32_t, int32x4_t, vst1q, s32)
-//VSTORE_IMPL(uint64_t, 2, vst1q, u64)
-//VSTORE_IMPL(int64_t, 2, vst1q, s64)
-VSTORE_IMPL(float, float32x4_t, vst1q, f32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VSTORE_IMPL(float16_t, float16x8_t, vst1q, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-#undef VSTORE_IMPL
-} // namespace wrapper
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_WRAPPER_STORE_H */
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/sub.h b/arm_compute/core/NEON/wrapper/intrinsics/sub.h
deleted file mode 100644
index f46b57c815..0000000000
--- a/arm_compute/core/NEON/wrapper/intrinsics/sub.h
+++ /dev/null
@@ -1,103 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_WRAPPER_SUB_H
-#define ARM_COMPUTE_WRAPPER_SUB_H
-
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace wrapper
-{
-#define VSUB_IMPL(stype, vtype, prefix, postfix)      \
-    inline vtype vsub(const vtype &a, const vtype &b) \
-    {                                                 \
-        return prefix##_##postfix(a, b);              \
-    }
-
-VSUB_IMPL(uint8x8_t, uint8x8_t, vsub, u8)
-VSUB_IMPL(int8x8_t, int8x8_t, vsub, s8)
-VSUB_IMPL(uint16x4_t, uint16x4_t, vsub, u16)
-VSUB_IMPL(int16x4_t, int16x4_t, vsub, s16)
-VSUB_IMPL(uint32x2_t, uint32x2_t, vsub, u32)
-VSUB_IMPL(int32x2_t, int32x2_t, vsub, s32)
-VSUB_IMPL(uint64x1_t, uint64x1_t, vsub, u64)
-VSUB_IMPL(int64x1_t, int64x1_t, vsub, s64)
-VSUB_IMPL(float32x2_t, float32x2_t, vsub, f32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VSUB_IMPL(float16x4_t, float16x4_t, vsub, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-VSUB_IMPL(uint8x16_t, uint8x16_t, vsubq, u8)
-VSUB_IMPL(int8x16_t, int8x16_t, vsubq, s8)
-VSUB_IMPL(uint16x8_t, uint16x8_t, vsubq, u16)
-VSUB_IMPL(int16x8_t, int16x8_t, vsubq, s16)
-VSUB_IMPL(uint32x4_t, uint32x4_t, vsubq, u32)
-VSUB_IMPL(int32x4_t, int32x4_t, vsubq, s32)
-VSUB_IMPL(uint64x2_t, uint64x2_t, vsubq, u64)
-VSUB_IMPL(int64x2_t, int64x2_t, vsubq, s64)
-VSUB_IMPL(float32x4_t, float32x4_t, vsubq, f32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VSUB_IMPL(float16x8_t, float16x8_t, vsubq, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-#undef VSUB_IMPL
-
-// VQSUB: Vector saturating sub (No notion of saturation for floating point)
-#define VQSUB_IMPL(stype, vtype, prefix, postfix)      \
-    inline vtype vqsub(const vtype &a, const vtype &b) \
-    {                                                  \
-        return prefix##_##postfix(a, b);               \
-    }
-
-VQSUB_IMPL(uint8x8_t, uint8x8_t, vqsub, u8)
-VQSUB_IMPL(int8x8_t, int8x8_t, vqsub, s8)
-VQSUB_IMPL(uint16x4_t, uint16x4_t, vqsub, u16)
-VQSUB_IMPL(int16x4_t, int16x4_t, vqsub, s16)
-VQSUB_IMPL(uint32x2_t, uint32x2_t, vqsub, u32)
-VQSUB_IMPL(int32x2_t, int32x2_t, vqsub, s32)
-VQSUB_IMPL(uint64x1_t, uint64x1_t, vqsub, u64)
-VQSUB_IMPL(int64x1_t, int64x1_t, vqsub, s64)
-VQSUB_IMPL(float32x2_t, float32x2_t, vsub, f32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VQSUB_IMPL(float16x4_t, float16x4_t, vsub, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-VQSUB_IMPL(uint8x16_t, uint8x16_t, vqsubq, u8)
-VQSUB_IMPL(int8x16_t, int8x16_t, vqsubq, s8)
-VQSUB_IMPL(uint16x8_t, uint16x8_t, vqsubq, u16)
-VQSUB_IMPL(int16x8_t, int16x8_t, vqsubq, s16)
-VQSUB_IMPL(uint32x4_t, uint32x4_t, vqsubq, u32)
-VQSUB_IMPL(int32x4_t, int32x4_t, vqsubq, s32)
-VQSUB_IMPL(uint64x2_t, uint64x2_t, vqsubq, u64)
-VQSUB_IMPL(int64x2_t, int64x2_t, vqsubq, s64)
-VQSUB_IMPL(float32x4_t, float32x4_t, vsubq, f32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VQSUB_IMPL(float16x8_t, float16x8_t, vsubq, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-#undef VQSUB_IMPL
-
-} // namespace wrapper
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_WRAPPER_SUB_H */
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/tanh.h b/arm_compute/core/NEON/wrapper/intrinsics/tanh.h
deleted file mode 100644
index 648a001ca7..0000000000
--- a/arm_compute/core/NEON/wrapper/intrinsics/tanh.h
+++ /dev/null
@@ -1,47 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_WRAPPER_TANH_H
-#define ARM_COMPUTE_WRAPPER_TANH_H
-
-#include "arm_compute/core/NEON/NEMath.h"
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace wrapper
-{
-#define VTANH_IMPL(vtype, prefix, postfix) \
-    inline vtype vtanh(const vtype &a)     \
-    {                                      \
-        return prefix##_##postfix(a);      \
-    }
-
-VTANH_IMPL(float32x4_t, vtanhq, f32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VTANH_IMPL(float16x8_t, vtanhq, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-#undef VTANH_IMPL
-} // namespace wrapper
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_WRAPPER_TANH_H */
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/tbl.h b/arm_compute/core/NEON/wrapper/intrinsics/tbl.h
deleted file mode 100644
index d3d6b72e6a..0000000000
--- a/arm_compute/core/NEON/wrapper/intrinsics/tbl.h
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
- * Copyright (c) 2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_WRAPPER_TBL_H
-#define ARM_COMPUTE_WRAPPER_TBL_H
-
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace wrapper
-{
-#define VTBL_IMPL(stype, vtype, prefix, postfix)      \
-    inline vtype vtbl(const stype &a, const vtype &b) \
-    {                                                 \
-        return prefix##_##postfix(a, b);              \
-    }
-
-VTBL_IMPL(uint8x8x2_t, uint8x8_t, vtbl2, u8)
-VTBL_IMPL(int8x8x2_t, int8x8_t, vtbl2, s8)
-
-#undef VTBL_IMPL
-} // namespace wrapper
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_WRAPPER_TBL_H */
diff --git a/arm_compute/core/NEON/wrapper/scalar/add.h b/arm_compute/core/NEON/wrapper/scalar/add.h
deleted file mode 100644
index 5a04fe20fa..0000000000
--- a/arm_compute/core/NEON/wrapper/scalar/add.h
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_WRAPPER_SCALAR_ADD_H
-#define ARM_COMPUTE_WRAPPER_SCALAR_ADD_H
-
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace wrapper
-{
-inline uint8_t add_sat(const uint8_t &a, const uint8_t &b)
-{
-    const uint8x8_t va = { a, 0, 0, 0, 0, 0, 0, 0 };
-    const uint8x8_t vb = { b, 0, 0, 0, 0, 0, 0, 0 };
-    return vget_lane_u8(vqadd_u8(va, vb), 0);
-}
-
-inline int16_t add_sat(const int16_t &a, const int16_t &b)
-{
-    const int16x4_t va = { a, 0, 0, 0 };
-    const int16x4_t vb = { b, 0, 0, 0 };
-    return vget_lane_s16(vqadd_s16(va, vb), 0);
-}
-
-inline float add_sat(const float &a, const float &b)
-{
-    // No notion of saturation exists in floating point
-    return a + b;
-}
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-inline float16_t add_sat(const float16_t &a, const float16_t &b)
-{
-    // No notion of saturation exists in floating point
-    return a + b;
-}
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-} // namespace wrapper
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_WRAPPER_SCALAR_ADD_H */
diff --git a/arm_compute/core/NEON/wrapper/scalar/scalar.h b/arm_compute/core/NEON/wrapper/scalar/scalar.h
deleted file mode 100644
index ff2d807c0e..0000000000
--- a/arm_compute/core/NEON/wrapper/scalar/scalar.h
+++ /dev/null
@@ -1,30 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_WRAPPER_SCALAR_H
-#define ARM_COMPUTE_WRAPPER_SCALAR_H
-
-#include "arm_compute/core/NEON/wrapper/scalar/add.h"
-#include "arm_compute/core/NEON/wrapper/scalar/sub.h"
-
-#endif /* ARM_COMPUTE_WRAPPER_SCALAR_H */
diff --git a/arm_compute/core/NEON/wrapper/scalar/sub.h b/arm_compute/core/NEON/wrapper/scalar/sub.h
deleted file mode 100644
index 5b4cab93d3..0000000000
--- a/arm_compute/core/NEON/wrapper/scalar/sub.h
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
- * Copyright (c) 2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_WRAPPER_SCALAR_SUB_H
-#define ARM_COMPUTE_WRAPPER_SCALAR_SUB_H
-
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace wrapper
-{
-inline uint8_t sub_sat(const uint8_t &a, const uint8_t &b)
-{
-    const uint8x8_t va = { a, 0, 0, 0, 0, 0, 0, 0 };
-    const uint8x8_t vb = { b, 0, 0, 0, 0, 0, 0, 0 };
-    return vget_lane_u8(vqsub_u8(va, vb), 0);
-}
-
-inline int16_t sub_sat(const int16_t &a, const int16_t &b)
-{
-    const int16x4_t va = { a, 0, 0, 0 };
-    const int16x4_t vb = { b, 0, 0, 0 };
-    return vget_lane_s16(vqsub_s16(va, vb), 0);
-}
-
-inline float sub_sat(const float &a, const float &b)
-{
-    // No notion of saturation exists in floating point
-    return a - b;
-}
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-inline float16_t sub_sat(const float16_t &a, const float16_t &b)
-{
-    // No notion of saturation exists in floating point
-    return a - b;
-}
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-} // namespace wrapper
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_WRAPPER_SCALAR_SUB_H */
diff --git a/arm_compute/core/NEON/wrapper/traits.h b/arm_compute/core/NEON/wrapper/traits.h
deleted file mode 100644
index ae77d2778c..0000000000
--- a/arm_compute/core/NEON/wrapper/traits.h
+++ /dev/null
@@ -1,140 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_WRAPPER_TRAITS_H
-#define ARM_COMPUTE_WRAPPER_TRAITS_H
-
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace wrapper
-{
-namespace traits
-{
-// *INDENT-OFF*
-// clang-format off
-
-/** 64-bit vector tag */
-struct vector_64_tag {};
-/** 128-bit vector tag */
-struct vector_128_tag {};
-
-/** Create the appropriate NEON vector given its type and size in terms of elements */
-template <typename T, int S> struct neon_vector;
-
-// Specializations
-#ifndef DOXYGEN_SKIP_THIS
-template <> struct neon_vector<uint8_t, 8>{ using scalar_type = uint8_t; using type = uint8x8_t; using tag_type = vector_64_tag; };
-template <> struct neon_vector<int8_t, 8>{ using scalar_type = int8_t; using type = int8x8_t; using tag_type = vector_64_tag; };
-template <> struct neon_vector<uint8_t, 16>{ using scalar_type = uint8_t; using type = uint8x16_t; using tag_type = vector_128_tag; };
-template <> struct neon_vector<int8_t, 16>{ using scalar_type = int8_t; using type = int8x16_t; using tag_type = vector_128_tag; };
-template <> struct neon_vector<uint16_t, 4>{ using scalar_type = uint16_t; using type = uint16x4_t; using tag_type = vector_64_tag; };
-template <> struct neon_vector<int16_t, 4>{ using scalar_type = int16_t; using type = int16x4_t; using tag_type = vector_64_tag; };
-template <> struct neon_vector<uint16_t, 8>{ using scalar_type = uint16_t; using type = uint16x8_t; using tag_type = vector_128_tag; };
-template <> struct neon_vector<uint16_t, 16>{ using scalar_type = uint16_t; using type = uint16x8x2_t; };
-template <> struct neon_vector<int16_t, 8>{ using scalar_type = int16_t; using type = int16x8_t; using tag_type = vector_128_tag; };
-template <> struct neon_vector<int16_t, 16>{ using scalar_type = int16_t; using type = int16x8x2_t; };
-template <> struct neon_vector<uint32_t, 2>{ using scalar_type = uint32_t; using type = uint32x2_t; using tag_type = vector_64_tag; };
-template <> struct neon_vector<int32_t, 2>{ using scalar_type = int32_t; using type = int32x2_t; using tag_type = vector_64_tag; };
-template <> struct neon_vector<uint32_t, 4>{ using scalar_type = uint32_t; using type = uint32x4_t; using tag_type = vector_128_tag; };
-template <> struct neon_vector<int32_t, 4>{ using scalar_type = int32_t; using type = int32x4_t; using tag_type = vector_128_tag; };
-template <> struct neon_vector<uint64_t, 1>{ using scalar_type = uint64_t;using type = uint64x1_t; using tag_type = vector_64_tag; };
-template <> struct neon_vector<int64_t, 1>{ using scalar_type = int64_t; using type = int64x1_t; using tag_type = vector_64_tag; };
-template <> struct neon_vector<uint64_t, 2>{ using scalar_type = uint64_t; using type = uint64x2_t; using tag_type = vector_128_tag; };
-template <> struct neon_vector<int64_t, 2>{ using scalar_type = int64_t; using type = int64x2_t; using tag_type = vector_128_tag; };
-template <> struct neon_vector<float_t, 2>{ using scalar_type = float_t; using type = float32x2_t; using tag_type = vector_64_tag; };
-template <> struct neon_vector<float_t, 4>{ using scalar_type = float_t; using type = float32x4_t; using tag_type = vector_128_tag; };
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-template <> struct neon_vector<float16_t, 4>{ using scalar_type = float16_t; using type = float16x4_t; using tag_type = vector_64_tag; };
-template <> struct neon_vector<float16_t, 8>{ using scalar_type = float16_t; using type = float16x8_t; using tag_type = vector_128_tag; };
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-#endif /* DOXYGEN_SKIP_THIS */
-
-/**  Helper type template to get the type of a neon vector */
-template <typename T, int S> using neon_vector_t = typename neon_vector<T, S>::type;
-/**  Helper type template to get the tag type of a neon vector */
-template <typename T, int S> using neon_vector_tag_t = typename neon_vector<T, S>::tag_type;
-
-/** Vector bit-width enum class */
-enum class BitWidth
-{
-    W64,  /**< 64-bit width */
-    W128, /**< 128-bit width */
-};
-
-/** Create the appropriate NEON vector given its type and size in terms of bits */
-template <typename T, BitWidth BW> struct neon_bitvector;
-// Specializations
-#ifndef DOXYGEN_SKIP_THIS
-template <> struct neon_bitvector<uint8_t, BitWidth::W64>{ using type = uint8x8_t; using tag_type = vector_64_tag; };
-template <> struct neon_bitvector<int8_t, BitWidth::W64>{ using type = int8x8_t; using tag_type = vector_64_tag; };
-template <> struct neon_bitvector<uint8_t, BitWidth::W128>{ using type = uint8x16_t; using tag_type = vector_128_tag; };
-template <> struct neon_bitvector<int8_t, BitWidth::W128>{ using type = int8x16_t; using tag_type = vector_128_tag; };
-template <> struct neon_bitvector<uint16_t, BitWidth::W64>{ using type = uint16x4_t; using tag_type = vector_64_tag; };
-template <> struct neon_bitvector<int16_t, BitWidth::W64>{ using type = int16x4_t; using tag_type = vector_64_tag; };
-template <> struct neon_bitvector<uint16_t, BitWidth::W128>{ using type = uint16x8_t; using tag_type = vector_128_tag; };
-template <> struct neon_bitvector<int16_t, BitWidth::W128>{ using type = int16x8_t; using tag_type = vector_128_tag; };
-template <> struct neon_bitvector<uint32_t, BitWidth::W64>{ using type = uint32x2_t; using tag_type = vector_64_tag; };
-template <> struct neon_bitvector<int32_t, BitWidth::W64>{ using type = int32x2_t; using tag_type = vector_64_tag; };
-template <> struct neon_bitvector<uint32_t, BitWidth::W128>{ using type = uint32x4_t; using tag_type = vector_128_tag; };
-template <> struct neon_bitvector<int32_t, BitWidth::W128>{ using type = int32x4_t; using tag_type = vector_128_tag; };
-template <> struct neon_bitvector<uint64_t, BitWidth::W64>{ using type = uint64x1_t; using tag_type = vector_64_tag; };
-template <> struct neon_bitvector<int64_t, BitWidth::W64>{ using type = int64x1_t; using tag_type = vector_64_tag; };
-template <> struct neon_bitvector<uint64_t, BitWidth::W128>{ using type = uint64x2_t; using tag_type = vector_128_tag; };
-template <> struct neon_bitvector<int64_t, BitWidth::W128>{ using type = int64x2_t; using tag_type = vector_128_tag; };
-template <> struct neon_bitvector<float_t, BitWidth::W64>{ using type = float32x2_t; using tag_type = vector_64_tag; };
-template <> struct neon_bitvector<float_t, BitWidth::W128>{ using type = float32x4_t; using tag_type = vector_128_tag; };
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-template <> struct neon_bitvector<float16_t, BitWidth::W64>{ using type = float16x4_t; using tag_type = vector_64_tag; };
-template <> struct neon_bitvector<float16_t, BitWidth::W128>{ using type = float16x8_t; using tag_type = vector_128_tag; };
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-#endif /* DOXYGEN_SKIP_THIS */
-
-/**  Helper type template to get the type of a neon vector */
-template <typename T, BitWidth BW> using neon_bitvector_t = typename neon_bitvector<T, BW>::type;
-/**  Helper type template to get the tag type of a neon vector */
-template <typename T, BitWidth BW> using neon_bitvector_tag_t = typename neon_bitvector<T, BW>::tag_type;
-
-/** Promote a type */
-template <typename T> struct promote { };
-template <> struct promote<uint8_t> { using type = uint16_t; };
-template <> struct promote<int8_t> { using type = int16_t; };
-template <> struct promote<uint16_t> { using type = uint32_t; };
-template <> struct promote<int16_t> { using type = int32_t; };
-template <> struct promote<uint32_t> { using type = uint64_t; };
-template <> struct promote<int32_t> { using type = int64_t; };
-template <> struct promote<float> { using type = float; };
-template <> struct promote<half> { using type = half; };
-
-/** Get promoted type */
-template <typename T>
-using promote_t = typename promote<T>::type;
-
-// clang-format on
-// *INDENT-ON*
-} // namespace traits
-} // namespace wrapper
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_WRAPPER_TRAITS_H */
diff --git a/arm_compute/core/NEON/wrapper/wrapper.h b/arm_compute/core/NEON/wrapper/wrapper.h
deleted file mode 100644
index 99a5909e8b..0000000000
--- a/arm_compute/core/NEON/wrapper/wrapper.h
+++ /dev/null
@@ -1,34 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_WRAPPER_H
-#define ARM_COMPUTE_WRAPPER_H
-
-// Traits
-#include "arm_compute/core/NEON/wrapper/traits.h"
-
-// Intrinsics Overloads
-#include "arm_compute/core/NEON/wrapper/intrinsics/intrinsics.h"
-#include "arm_compute/core/NEON/wrapper/scalar/scalar.h"
-
-#endif /* ARM_COMPUTE_WRAPPER_H */
diff --git a/arm_compute/core/PixelValue.h b/arm_compute/core/PixelValue.h
index 337ccbc3f7..0b4df4f2e2 100644
--- a/arm_compute/core/PixelValue.h
+++ b/arm_compute/core/PixelValue.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,6 +24,7 @@
 #ifndef ARM_COMPUTE_PIXELVALUE_H
 #define ARM_COMPUTE_PIXELVALUE_H
 
+#include "arm_compute/core/QuantizationInfo.h"
 #include "arm_compute/core/Types.h"
 
 #include <cstdint>
@@ -35,8 +36,7 @@ class PixelValue
 {
 public:
     /** Default constructor: value initialized to 0 */
-    PixelValue()
-        : value{ int64_t(0) }
+    PixelValue() noexcept : value{int64_t(0)}
     {
     }
     /** Initialize the union with a pixel value of chosen datatype
@@ -45,10 +45,9 @@ public:
      * @param[in] datatype DataType that @p v have to be stored
      * @param[in] qinfo    (Optional) QuantizationInfo to apply in case of quantized data types to @p v
      */
-    PixelValue(double v, DataType datatype, QuantizationInfo qinfo = QuantizationInfo())
-        : PixelValue()
+    PixelValue(double v, DataType datatype, QuantizationInfo qinfo = QuantizationInfo()) : PixelValue()
     {
-        switch(datatype)
+        switch (datatype)
         {
             case DataType::U8:
                 value.u8 = static_cast<uint8_t>(v);
@@ -108,8 +107,7 @@ public:
      *
      * @param[in] v S8 value.
      */
-    PixelValue(int8_t v)
-        : PixelValue()
+    PixelValue(int8_t v) : PixelValue()
     {
         value.s8 = v;
     }
@@ -117,8 +115,7 @@ public:
      *
      * @param[in] v U8 value.
      */
-    PixelValue(uint8_t v)
-        : PixelValue()
+    PixelValue(uint8_t v) : PixelValue()
     {
         value.u8 = v;
     }
@@ -126,8 +123,7 @@ public:
      *
      * @param[in] v U16 value.
      */
-    PixelValue(uint16_t v)
-        : PixelValue()
+    PixelValue(uint16_t v) : PixelValue()
     {
         value.u16 = v;
     }
@@ -135,8 +131,7 @@ public:
      *
      * @param[in] v S16 value.
      */
-    PixelValue(int16_t v)
-        : PixelValue()
+    PixelValue(int16_t v) : PixelValue()
     {
         value.s16 = v;
     }
@@ -144,8 +139,7 @@ public:
      *
      * @param[in] v U32 value.
      */
-    PixelValue(uint32_t v)
-        : PixelValue()
+    PixelValue(uint32_t v) : PixelValue()
     {
         value.u32 = v;
     }
@@ -153,8 +147,7 @@ public:
      *
      * @param[in] v S32 value.
      */
-    PixelValue(int32_t v)
-        : PixelValue()
+    PixelValue(int32_t v) : PixelValue()
     {
         value.s32 = v;
     }
@@ -163,8 +156,7 @@ public:
      *
      * @param[in] v U64 value.
      */
-    PixelValue(uint64_t v)
-        : PixelValue()
+    PixelValue(uint64_t v) : PixelValue()
     {
         value.u64 = v;
     }
@@ -172,8 +164,7 @@ public:
      *
      * @param[in] v S64 value.
      */
-    PixelValue(int64_t v)
-        : PixelValue()
+    PixelValue(int64_t v) : PixelValue()
     {
         value.s64 = v;
     }
@@ -181,8 +172,7 @@ public:
      *
      * @param[in] v F16 value.
      */
-    PixelValue(bfloat16 v)
-        : PixelValue()
+    PixelValue(bfloat16 v) : PixelValue()
     {
         value.bf16 = v;
     }
@@ -190,8 +180,7 @@ public:
      *
      * @param[in] v F16 value.
      */
-    PixelValue(half v)
-        : PixelValue()
+    PixelValue(half v) : PixelValue()
     {
         value.f16 = v;
     }
@@ -199,8 +188,7 @@ public:
      *
      * @param[in] v F32 value.
      */
-    PixelValue(float v)
-        : PixelValue()
+    PixelValue(float v) : PixelValue()
     {
         value.f32 = v;
     }
@@ -208,8 +196,7 @@ public:
      *
      * @param[in] v F64 value.
      */
-    PixelValue(double v)
-        : PixelValue()
+    PixelValue(double v) : PixelValue()
     {
         value.f64 = v;
     }
@@ -217,23 +204,23 @@ public:
      * Use the field corresponding to the image format
      */
     union
-        {
-            uint64_t u64;     /**< Single channel U64 */
-            int64_t  s64;     /**< Single channel S64 */
-            uint8_t  rgb[3];  /**< 3 channels: RGB888 */
-            uint8_t  yuv[3];  /**< 3 channels: Any YUV format */
-            uint8_t  rgbx[4]; /**< 4 channels: RGBX8888 */
-            double   f64;     /**< Single channel double */
-            float    f32;     /**< Single channel float 32 */
-            half     f16;     /**< Single channel F16 */
-            bfloat16 bf16;    /**< Single channel brain floating-point number */
-            uint8_t  u8;      /**< Single channel U8 */
-            int8_t   s8;      /**< Single channel S8 */
-            uint16_t u16;     /**< Single channel U16 */
-            int16_t  s16;     /**< Single channel S16 */
-            uint32_t u32;     /**< Single channel U32 */
-            int32_t  s32;     /**< Single channel S32 */
-        } value;
+    {
+        uint64_t u64;     /**< Single channel U64 */
+        int64_t  s64;     /**< Single channel S64 */
+        uint8_t  rgb[3];  /**< 3 channels: RGB888 */
+        uint8_t  yuv[3];  /**< 3 channels: Any YUV format */
+        uint8_t  rgbx[4]; /**< 4 channels: RGBX8888 */
+        double   f64;     /**< Single channel double */
+        float    f32;     /**< Single channel float 32 */
+        half     f16;     /**< Single channel F16 */
+        bfloat16 bf16;    /**< Single channel brain floating-point number */
+        uint8_t  u8;      /**< Single channel U8 */
+        int8_t   s8;      /**< Single channel S8 */
+        uint16_t u16;     /**< Single channel U16 */
+        int16_t  s16;     /**< Single channel S16 */
+        uint32_t u32;     /**< Single channel U32 */
+        int32_t  s32;     /**< Single channel S32 */
+    } value;
     /** Interpret the pixel value as a U8
      *
      * @param[out] v Returned value
diff --git a/arm_compute/core/PyramidInfo.h b/arm_compute/core/PyramidInfo.h
deleted file mode 100644
index e8cbe3488a..0000000000
--- a/arm_compute/core/PyramidInfo.h
+++ /dev/null
@@ -1,131 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_PYRAMIDINFO_H
-#define ARM_COMPUTE_PYRAMIDINFO_H
-
-#include "arm_compute/core/TensorShape.h"
-#include "arm_compute/core/Types.h"
-
-#include <cstddef>
-
-namespace arm_compute
-{
-/** Store the Pyramid's metadata */
-class PyramidInfo
-{
-public:
-    /** Default constructor */
-    PyramidInfo();
-    /** Default destructor */
-    virtual ~PyramidInfo() = default;
-    /** Allow instances of this class to be copy constructed */
-    PyramidInfo(const PyramidInfo &) = default;
-    /** Allow instances of this class to be copied */
-    PyramidInfo &operator=(const PyramidInfo &) = default;
-    /** Allow instances of this class to be move constructed */
-    PyramidInfo(PyramidInfo &&) = default;
-    /** Allow instances of this class to be moved */
-    PyramidInfo &operator=(PyramidInfo &&) = default;
-
-    /** Create pyramid info for 2D tensors
-     *
-     * @param[in] num_levels The number of pyramid levels. This is required to be a non-zero value
-     * @param[in] scale      Used to indicate the scale between the pyramid levels.
-     *                       This is required to be a non-zero positive value.
-     * @param[in] width      The width of the 2D tensor at 0th pyramid level
-     * @param[in] height     The height of the 2D tensor at 0th pyramid level
-     * @param[in] format     The format of all 2D tensors in the pyramid
-     *                       NV12, NV21, IYUV, UYVY and YUYV formats are not supported.
-     */
-    PyramidInfo(size_t num_levels, float scale, size_t width, size_t height, Format format);
-
-    /** Create pyramid info using TensorShape
-     *
-     * @param[in] num_levels   The number of pyramid levels. This is required to be a non-zero value
-     * @param[in] scale        Used to indicate the scale between the pyramid levels.
-     *                         This is required to be a non-zero positive value.
-     * @param[in] tensor_shape It specifies the size for each dimension of the tensor 0th pyramid level in number of elements
-     * @param[in] format       The format of all tensors in the pyramid
-     */
-    PyramidInfo(size_t num_levels, float scale, const TensorShape &tensor_shape, Format format);
-
-    /** Initialize pyramid's metadata for 2D tensors
-     *
-     * @param[in] num_levels The number of pyramid levels. This is required to be a non-zero value
-     * @param[in] scale      Used to indicate the scale between the pyramid levels.
-     *                       This is required to be a non-zero positive value.
-     * @param[in] width      The width of the 2D tensor at 0th pyramid level
-     * @param[in] height     The height of the 2D tensor at 0th pyramid level
-     * @param[in] format     The format of all 2D tensors in the pyramid
-     *                       NV12, NV21, IYUV, UYVY and YUYV formats are not supported.
-     */
-    void init(size_t num_levels, float scale, size_t width, size_t height, Format format);
-    /** Initialize pyramid's metadata using TensorShape
-     *
-     * @param[in] num_levels   The number of pyramid levels. This is required to be a non-zero value
-     * @param[in] scale        Used to indicate the scale between the pyramid levels.
-     *                         This is required to be a non-zero positive value.
-     * @param[in] tensor_shape It specifies the size for each dimension of the tensor 0th pyramid level in number of elements
-     * @param[in] format       The format of all tensors in the pyramid
-     */
-    void init(size_t num_levels, float scale, const TensorShape &tensor_shape, Format format);
-    /** Return the number of the pyramid levels
-     *
-     *  @return The number of the pyramid levels
-     */
-    size_t num_levels() const;
-    /** Return the width of the 0th level tensor
-     *
-     *  @return The width of the 0th level tensor
-     */
-    size_t width() const;
-    /** Return the height of the 0th level tensor
-     *
-     *  @return The height of the 0th level tensor
-     */
-    size_t height() const;
-    /** Return the TensorShape of the o-th level tensor
-     *
-     * @return
-     */
-    const TensorShape &tensor_shape() const;
-    /** Return the image format of all tensor in the pyramid
-     *
-     *  @return The image format
-     */
-    Format format() const;
-    /** Return the scale factor of the pyramid
-     *
-     *  @return Return the scale factor
-     */
-    float scale() const;
-
-private:
-    size_t      _num_levels;
-    TensorShape _tensor_shape;
-    Format      _format;
-    float       _scale;
-};
-}
-#endif /*ARM_COMPUTE_PYRAMIDINFO_H */
diff --git a/arm_compute/core/QuantizationInfo.h b/arm_compute/core/QuantizationInfo.h
index 52ef149e9b..aecba3712e 100644
--- a/arm_compute/core/QuantizationInfo.h
+++ b/arm_compute/core/QuantizationInfo.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,15 +21,14 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_QUANTIZATION_INFO_H
-#define ARM_COMPUTE_QUANTIZATION_INFO_H
+#ifndef ACL_ARM_COMPUTE_CORE_QUANTIZATIONINFO_H
+#define ACL_ARM_COMPUTE_CORE_QUANTIZATIONINFO_H
 
 #include "arm_compute/core/Rounding.h"
-#include "utils/misc/Utility.h"
-#include "arm_compute/core/Error.h"
+#include "arm_compute/core/utils/misc/Utility.h"
+
+#include "support/ToolchainSupport.h"
 
-#include <cstddef>
-#include <type_traits>
 #include <vector>
 
 namespace arm_compute
@@ -43,8 +42,7 @@ using qasymm16_t       = uint16_t; /**< 16 bit quantized asymmetric scalar value
 struct UniformQuantizationInfo
 {
     /** Default constructor */
-    UniformQuantizationInfo()
-        : scale(0.f), offset(0)
+    UniformQuantizationInfo() : scale(0.f), offset(0)
     {
     }
     /** Constructor
@@ -52,8 +50,7 @@ struct UniformQuantizationInfo
      * @param[in] scale  Quantization scale
      * @param[in] offset Quantization offset
      */
-    UniformQuantizationInfo(float scale, int32_t offset)
-        : scale(scale), offset(offset)
+    UniformQuantizationInfo(float scale, int32_t offset) : scale(scale), offset(offset)
     {
     }
     /** Checks if the scale and offset are both zero */
@@ -71,9 +68,7 @@ class QuantizationInfo
 {
 public:
     /** Default constructor */
-    QuantizationInfo() noexcept
-        : _scale(),
-          _offset()
+    QuantizationInfo() noexcept : _scale(), _offset()
     {
     }
     /** Construct quantization info.
@@ -82,19 +77,19 @@ public:
      *
      * @param[in] scale Scale.
      */
-    QuantizationInfo(float scale)
-        : _scale(1, scale), _offset()
+    QuantizationInfo(float scale) : _scale(1, scale), _offset()
     {
     }
     /** Construct quantization info.
      *
      * @note Used for asymmetric quantization
      *
-     * @param[in] scale  Scale.
-     * @param[in] offset Offset.
+     * @param[in] scale      Scale.
+     * @param[in] offset     Offset.
+     * @param[in] is_dynamic Whether this QuantizationInfo is dynamic, i.e. the scale and offset may change.
      */
-    QuantizationInfo(float scale, int offset)
-        : _scale(1, scale), _offset(1, offset)
+    QuantizationInfo(float scale, int offset, bool is_dynamic = false)
+        : _scale(1, scale), _offset(1, offset), _is_dynamic(is_dynamic)
     {
     }
     /** Construct quantization info.
@@ -103,19 +98,19 @@ public:
      *
      * @param[in] scale Scale.
      */
-    QuantizationInfo(std::vector<float> scale)
-        : _scale(scale), _offset()
+    QuantizationInfo(std::vector<float> scale) : _scale(scale), _offset()
     {
     }
     /** Construct quantization info.
      *
      * @note Used for asymmetric per channel quantization
      *
-     * @param[in] scale  Scale.
-     * @param[in] offset Offset.
+     * @param[in] scale      Scale.
+     * @param[in] offset     Offset.
+     * @param[in] is_dynamic Whether this QuantizationInfo is dynamic, i.e. the scale and offset may change.
      */
-    QuantizationInfo(std::vector<float> scale, std::vector<int32_t> offset)
-        : _scale(scale), _offset(offset)
+    QuantizationInfo(std::vector<float> scale, std::vector<int32_t> offset, bool is_dynamic = false)
+        : _scale(scale), _offset(offset), _is_dynamic(is_dynamic)
     {
     }
     /** Scale vector accessor
@@ -134,6 +129,14 @@ public:
     {
         return _offset;
     }
+    /** is_dynamic accessor
+     *
+     * @return If true, the scale and offset may change, so operators will need to read on every run
+     */
+    bool is_dynamic() const
+    {
+        return _is_dynamic;
+    }
     /** Indicates whether this QuantizationInfo has valid settings or not
      *
      * @return True if the this has invalid settings.
@@ -158,6 +161,8 @@ public:
 private:
     std::vector<float>   _scale;  /**< Vector containing scaling factors */
     std::vector<int32_t> _offset; /**< Vector containing zero offsets */
+    bool                 _is_dynamic =
+        false; /**< If true, the scale and offset may change, so operators will need to read on every run */
 };
 
 /** Check whether two quantization info are equal.
@@ -210,20 +215,39 @@ inline bool operator!=(const UniformQuantizationInfo &lhs, const UniformQuantiza
 template <typename QUANTIZED_TYPE = uint8_t>
 struct Qasymm8QuantizationHelper
 {
-    static_assert(std::is_same<QUANTIZED_TYPE, uint8_t>::value
-                  || std::is_same<QUANTIZED_TYPE, int8_t>::value,
+    static_assert(std::is_same<QUANTIZED_TYPE, uint8_t>::value || std::is_same<QUANTIZED_TYPE, int8_t>::value,
                   "quantized type should be either uint8_t or int8_t.");
 
     /** Quantize a value given a 8-bit asymmetric quantization scheme
      *
+     * @param[in] value Value to quantize
+     * @param[in] qinfo Quantization information to use for quantizing
+     *
+     * @return Quantized value
+     */
+    static inline QUANTIZED_TYPE quantize(float value, const UniformQuantizationInfo &qinfo)
+    {
+        ARM_COMPUTE_ERROR_ON(qinfo.scale == 0);
+        const int quantized = support::cpp11::lround(value / qinfo.scale) + qinfo.offset;
+        return static_cast<QUANTIZED_TYPE>(arm_compute::utility::clamp<decltype(quantized), QUANTIZED_TYPE>(quantized));
+    }
+
+    /** Quantize a value given a 8-bit asymmetric quantization scheme using a specific rounding policy
+     *
      * @param[in] value           Value to quantize
      * @param[in] qinfo           Quantization information to use for quantizing
-     * @param[in] rounding_policy (Optional) Rounding policy to use. Default: nearest up
+     * @param[in] rounding_policy Rounding policy to use
      *
      * @return Quantized value
      */
-    static inline QUANTIZED_TYPE quantize(float value, const UniformQuantizationInfo &qinfo, RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_UP)
+    static inline QUANTIZED_TYPE
+    quantize(float value, const UniformQuantizationInfo &qinfo, RoundingPolicy rounding_policy)
     {
+        if (rounding_policy == RoundingPolicy::TO_NEAREST_UP)
+        {
+            return quantize(value, qinfo);
+        }
+
         ARM_COMPUTE_ERROR_ON(qinfo.scale == 0);
         const int quantized = arm_compute::round(value / qinfo.scale, rounding_policy) + qinfo.offset;
         return static_cast<QUANTIZED_TYPE>(arm_compute::utility::clamp<decltype(quantized), QUANTIZED_TYPE>(quantized));
@@ -237,7 +261,8 @@ struct Qasymm8QuantizationHelper
      *
      * @return Quantized value
      */
-    static inline QUANTIZED_TYPE quantize(float value, const QuantizationInfo &qinfo, RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_UP)
+    static inline QUANTIZED_TYPE
+    quantize(float value, const QuantizationInfo &qinfo, RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_UP)
     {
         const UniformQuantizationInfo uqinfo = qinfo.uniform();
         ARM_COMPUTE_ERROR_ON(uqinfo.scale == 0);
@@ -280,7 +305,8 @@ struct Qasymm8QuantizationHelper
  * @return Quantized value
  */
 template <typename INFO_TYPE>
-inline uint8_t quantize_qasymm8(float value, const INFO_TYPE &qinfo, RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_UP)
+inline uint8_t
+quantize_qasymm8(float value, const INFO_TYPE &qinfo, RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_UP)
 {
     return Qasymm8QuantizationHelper<uint8_t>::quantize(value, qinfo, rounding_policy);
 }
@@ -294,7 +320,9 @@ inline uint8_t quantize_qasymm8(float value, const INFO_TYPE &qinfo, RoundingPol
  * @return Quantized value
  */
 template <typename INFO_TYPE>
-inline int8_t quantize_qasymm8_signed(float value, const INFO_TYPE &qinfo, RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_UP)
+inline int8_t quantize_qasymm8_signed(float            value,
+                                      const INFO_TYPE &qinfo,
+                                      RoundingPolicy   rounding_policy = RoundingPolicy::TO_NEAREST_UP)
 {
     return Qasymm8QuantizationHelper<int8_t>::quantize(value, qinfo, rounding_policy);
 }
@@ -416,6 +444,19 @@ inline float dequantize(uint16_t value, float scale, int32_t offset)
     return (static_cast<int>(value) - offset) * scale;
 }
 
+/** Dequantize a value given a 32-bit asymmetric quantization scheme
+ *
+ * @param[in] value  Value to dequantize
+ * @param[in] scale  Scale to use for dequantization
+ * @param[in] offset Zero-offset to use for dequantization
+ *
+ * @return Dequantized value
+ */
+inline float dequantize(int32_t value, float scale, int32_t offset)
+{
+    return (static_cast<int>(value) - offset) * scale;
+}
+
 /** Quantize a value given a 16-bit symmetric quantization scheme
  *
  * @param[in] value           Value to quantize
@@ -424,7 +465,9 @@ inline float dequantize(uint16_t value, float scale, int32_t offset)
  *
  * @return Quantized value
  */
-inline int16_t quantize_qsymm16(float value, const UniformQuantizationInfo &qinfo, RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_UP)
+inline int16_t quantize_qsymm16(float                          value,
+                                const UniformQuantizationInfo &qinfo,
+                                RoundingPolicy                 rounding_policy = RoundingPolicy::TO_NEAREST_UP)
 {
     int quantized = arm_compute::round(value / qinfo.scale, rounding_policy);
     quantized     = arm_compute::utility::clamp<int, int16_t>(quantized);
@@ -475,7 +518,9 @@ inline float dequantize_qsymm16(int16_t value, const QuantizationInfo &qinfo)
  *
  * @return Quantized value
  */
-inline uint16_t quantize_qasymm16(float value, const UniformQuantizationInfo &qinfo, RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_UP)
+inline uint16_t quantize_qasymm16(float                          value,
+                                  const UniformQuantizationInfo &qinfo,
+                                  RoundingPolicy                 rounding_policy = RoundingPolicy::TO_NEAREST_UP)
 {
     int quantized = arm_compute::round(value / qinfo.scale, rounding_policy) + qinfo.offset;
     quantized     = arm_compute::utility::clamp<int, uint16_t>(quantized);
@@ -518,6 +563,31 @@ inline float dequantize_qasymm16(uint16_t value, const QuantizationInfo &qinfo)
     return dequantize_qasymm16(value, qinfo.uniform());
 }
 
+/** Dequantize a value given a 32-bit asymmetric quantization scheme
+ *
+ * @param[in] value Value to dequantize
+ * @param[in] qinfo Quantization information to use for dequantizing
+ *
+ * @return Dequantized value
+ */
+inline float dequantize_s32(int32_t value, const UniformQuantizationInfo &qinfo)
+{
+    return (static_cast<int>(value) - qinfo.offset) * qinfo.scale;
+}
+
+/** Dequantize a value given a 32-bit asymmetric quantization scheme
+ *
+ * @param[in] value Value to dequantize
+ * @param[in] qinfo Quantization information to use for dequantizing
+ *
+ * @return Dequantized value
+ */
+
+inline float dequantize_s32(int32_t value, const QuantizationInfo &qinfo)
+{
+    return dequantize_s32(value, qinfo.uniform());
+}
+
 /*
  * In case of requantization of a quantized input tensor to an output tensor with another quantization
  * instead of applying dequantization and then a quantization functions, we just compute new scale and
@@ -548,7 +618,8 @@ inline float dequantize_qasymm16(uint16_t value, const QuantizationInfo &qinfo)
  * z_n = - z_i * s_i / s_o + z_o
  *
  */
-inline UniformQuantizationInfo compute_requantization_scale_offset(const UniformQuantizationInfo &uqinfo_in, const UniformQuantizationInfo &uqinfo_out)
+inline UniformQuantizationInfo compute_requantization_scale_offset(const UniformQuantizationInfo &uqinfo_in,
+                                                                   const UniformQuantizationInfo &uqinfo_out)
 {
     float   scale_to_apply  = uqinfo_out.scale;
     int32_t offset_to_apply = uqinfo_out.offset;
@@ -562,4 +633,4 @@ inline UniformQuantizationInfo compute_requantization_scale_offset(const Uniform
 }
 
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_QUANTIZATION_INFO_H */
+#endif // ACL_ARM_COMPUTE_CORE_QUANTIZATIONINFO_H
diff --git a/arm_compute/core/Rounding.h b/arm_compute/core/Rounding.h
index 68d742907b..30a5a0fe9d 100644
--- a/arm_compute/core/Rounding.h
+++ b/arm_compute/core/Rounding.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -42,5 +42,5 @@ enum class RoundingPolicy
  * @return Rounded value of the argument x.
  */
 int round(float x, RoundingPolicy rounding_policy);
-}
+} // namespace arm_compute
 #endif /*ARM_COMPUTE_ROUNDING_H */
diff --git a/arm_compute/core/Size2D.h b/arm_compute/core/Size2D.h
index 722d7450f6..672b392050 100644
--- a/arm_compute/core/Size2D.h
+++ b/arm_compute/core/Size2D.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -41,8 +41,7 @@ public:
      * @param[in] w Width of the image or rectangle
      * @param[in] h Height of the image or rectangle
      */
-    Size2D(size_t w, size_t h)
-        : width(w), height(h)
+    Size2D(size_t w, size_t h) noexcept : width(w), height(h)
     {
     }
     /** The area of the image or rectangle calculated as (width * height)
@@ -89,5 +88,5 @@ public:
     size_t width  = {}; /**< Width of the image region or rectangle */
     size_t height = {}; /**< Height of the image region or rectangle */
 };
-}
+} // namespace arm_compute
 #endif /*ARM_COMPUTE_SIZE2D_H */
diff --git a/arm_compute/core/Size3D.h b/arm_compute/core/Size3D.h
new file mode 100644
index 0000000000..e2dc6fe012
--- /dev/null
+++ b/arm_compute/core/Size3D.h
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_SIZE3D_H
+#define ARM_COMPUTE_SIZE3D_H
+
+#include <string>
+
+namespace arm_compute
+{
+/** Class for specifying the size of a 3D shape or object */
+class Size3D
+{
+public:
+    /** Default constructor */
+    Size3D() = default;
+    /** Constructor. Initializes "width", "height" and "depth" respectively with "w", "h" and "d"
+     *
+     * @param[in] w Width of the 3D shape or object
+     * @param[in] h Height of the 3D shape or object
+     * @param[in] d Depth of the 3D shape or object
+     */
+    Size3D(size_t w, size_t h, size_t d) noexcept : width(w), height(h), depth(d)
+    {
+    }
+
+    /** Convert the values stored to string
+     *
+     * @return string of (width x height x depth).
+     */
+    std::string to_string() const;
+
+    /** Semantic accessor for width as x.
+     *
+     * @return x.
+     */
+    size_t x() const
+    {
+        return width;
+    }
+
+    /** Semantic accessor for height as y.
+     *
+     * @return y.
+     */
+    size_t y() const
+    {
+        return height;
+    }
+
+    /** Semantic accessor for depth as z.
+     *
+     * @return z.
+     */
+    size_t z() const
+    {
+        return depth;
+    }
+
+    bool operator!=(const Size3D &other) const
+    {
+        return !(*this == other);
+    }
+
+    bool operator==(const Size3D &other) const
+    {
+        return (width == other.width) && (height == other.height) && (depth == other.depth);
+    }
+
+public:
+    size_t width  = {}; /**< Width of the 3D shape or object */
+    size_t height = {}; /**< Height of the 3D shape or object */
+    size_t depth  = {}; /**< Depth of the 3D shape or object */
+};
+
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_SIZE3D_H */
diff --git a/arm_compute/core/Steps.h b/arm_compute/core/Steps.h
index 6c89185a1f..6b261becc0 100644
--- a/arm_compute/core/Steps.h
+++ b/arm_compute/core/Steps.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -45,8 +45,7 @@ public:
      * @param[in] steps Values to initialize the steps.
      */
     template <typename... Ts>
-    Steps(Ts... steps)
-        : Dimensions{ steps... }
+    Steps(Ts... steps) : Dimensions{steps...}
     {
         // Initialize empty dimensions to 1
         std::fill(_id.begin() + _num_dimensions, _id.end(), 1);
@@ -62,5 +61,5 @@ public:
     /** Default destructor */
     ~Steps() = default;
 };
-}
+} // namespace arm_compute
 #endif /*ARM_COMPUTE_STEPS_H*/
diff --git a/arm_compute/core/Strides.h b/arm_compute/core/Strides.h
index a2a73377ea..627b219987 100644
--- a/arm_compute/core/Strides.h
+++ b/arm_compute/core/Strides.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -30,6 +30,7 @@
 #include <algorithm>
 #include <array>
 #include <cstddef>
+#include <cstdint>
 
 namespace arm_compute
 {
@@ -42,8 +43,7 @@ public:
      * @param[in] strides Values to initialize the strides.
      */
     template <typename... Ts>
-    constexpr Strides(Ts... strides)
-        : Dimensions{ strides... }
+    constexpr Strides(Ts... strides) : Dimensions{strides...}
     {
     }
     /** Allow instances of this class to be copy constructed */
diff --git a/arm_compute/core/SubTensorInfo.h b/arm_compute/core/SubTensorInfo.h
index bcb570ae7f..7a3ee2cfd0 100644
--- a/arm_compute/core/SubTensorInfo.h
+++ b/arm_compute/core/SubTensorInfo.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,14 +24,12 @@
 #ifndef ARM_COMPUTE_SUBTENSORINFO_H
 #define ARM_COMPUTE_SUBTENSORINFO_H
 
-#include "arm_compute/core/ITensorInfo.h"
-
 #include "arm_compute/core/Coordinates.h"
 #include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensorInfo.h"
 #include "arm_compute/core/Strides.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/TensorShape.h"
-#include "arm_compute/core/Validate.h"
 
 #include <cstddef>
 #include <memory>
@@ -74,7 +72,7 @@ public:
 
     // Inherited methods overridden:
     std::unique_ptr<ITensorInfo> clone() const override;
-    ITensorInfo &set_data_type(DataType data_type) override
+    ITensorInfo                 &set_data_type(DataType data_type) override
     {
         ARM_COMPUTE_ERROR_ON(_parent == nullptr);
         _parent->set_data_type(data_type);
@@ -99,6 +97,7 @@ public:
         return *this;
     };
     ITensorInfo &set_tensor_shape(const TensorShape &shape) override;
+    ITensorInfo &set_tensor_dims_state(const TensorDimsState &state) override;
     ITensorInfo &set_quantization_info(const QuantizationInfo &quantization_info) override
     {
         ARM_COMPUTE_ERROR_ON(_parent == nullptr);
@@ -116,7 +115,13 @@ public:
         ARM_COMPUTE_ERROR_ON(_parent == nullptr);
         return _parent->auto_padding();
     };
+
+    ITensorInfo &set_lock_paddings(bool flag) override;
+
+    bool lock_paddings() const override;
+
     bool extend_padding(const PaddingSize &padding) override;
+
     size_t dimension(size_t index) const override
     {
         return _tensor_shape[index];
@@ -137,7 +142,7 @@ public:
         return _parent->offset_element_in_bytes(_coords);
     }
     int32_t offset_element_in_bytes(const Coordinates &pos) const override;
-    size_t element_size() const override
+    size_t  element_size() const override
     {
         ARM_COMPUTE_ERROR_ON(_parent == nullptr);
         return _parent->element_size();
@@ -156,6 +161,11 @@ public:
         ARM_COMPUTE_ERROR_ON(_parent == nullptr);
         return _tensor_shape;
     }
+    const TensorDimsState &tensor_dims_state() const override
+    {
+        ARM_COMPUTE_ERROR_ON(_parent == nullptr);
+        return _dims_state;
+    }
     DataType data_type() const override
     {
         ARM_COMPUTE_ERROR_ON(_parent == nullptr);
@@ -191,16 +201,21 @@ public:
         ARM_COMPUTE_ERROR_ON(_parent == nullptr);
         return _parent->is_dynamic();
     }
+    bool are_values_constant() const override
+    {
+        ARM_COMPUTE_ERROR_ON(_parent == nullptr);
+        return _parent->are_values_constant();
+    }
     ITensorInfo &set_is_resizable(bool is_resizable) override
     {
         ARM_COMPUTE_ERROR_ON(_parent == nullptr);
         _parent->set_is_resizable(is_resizable);
         return *this;
     }
-    ITensorInfo &set_is_dynamic(bool is_dynamic) override
+    ITensorInfo &set_are_values_constant(bool are_values_constant) override
     {
         ARM_COMPUTE_ERROR_ON(_parent == nullptr);
-        _parent->set_is_dynamic(is_dynamic);
+        _parent->set_are_values_constant(are_values_constant);
         return *this;
     }
     ValidRegion valid_region() const override
@@ -211,7 +226,7 @@ public:
     {
         ARM_COMPUTE_ERROR_ON(_parent == nullptr);
         // Check if subtensor is valid if parent is configured
-        if(_parent->tensor_shape().total_size() != 0)
+        if (_parent->tensor_shape().total_size() != 0)
         {
             ARM_COMPUTE_ERROR_ON_INVALID_SUBTENSOR_VALID_REGION(_parent->valid_region(), valid_region);
         }
@@ -227,13 +242,26 @@ public:
         ARM_COMPUTE_ERROR_ON(_parent == nullptr);
         return _parent->data_layout();
     }
+    ITensorInfo::Id id() const override
+    {
+        ARM_COMPUTE_ERROR_ON(_parent == nullptr);
+        return _parent->id();
+    }
+    ITensorInfo &set_id(ITensorInfo::Id id) override
+    {
+        ARM_COMPUTE_ERROR_ON(_parent == nullptr);
+        _parent->set_id(id);
+        return *this;
+    }
 
 private:
-    ITensorInfo *_parent;
-    TensorShape  _tensor_shape;
-    Coordinates  _coords;
-    ValidRegion  _valid_region;
-    bool         _extend_parent;
+    ITensorInfo    *_parent;
+    TensorShape     _tensor_shape;
+    TensorDimsState _dims_state;
+    Coordinates     _coords;
+    ValidRegion     _valid_region;
+    bool            _extend_parent;
+    bool            _lock_paddings;
 };
 } // namespace arm_compute
 #endif /*ARM_COMPUTE_SUBTENSORINFO_H */
diff --git a/arm_compute/core/TensorInfo.h b/arm_compute/core/TensorInfo.h
index 68570d58db..b18f750427 100644
--- a/arm_compute/core/TensorInfo.h
+++ b/arm_compute/core/TensorInfo.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,23 +24,19 @@
 #ifndef ARM_COMPUTE_TENSORINFO_H
 #define ARM_COMPUTE_TENSORINFO_H
 
-#include "arm_compute/core/ITensorInfo.h"
-
-#include "ITensorInfo.h"
 #include "arm_compute/core/Coordinates.h"
 #include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensorInfo.h"
 #include "arm_compute/core/Strides.h"
 #include "arm_compute/core/TensorShape.h"
 #include "arm_compute/core/Types.h"
-#include "arm_compute/core/Utils.h"
 
+#include "ITensorInfo.h"
 #include <cstddef>
 #include <memory>
 
 namespace arm_compute
 {
-class HOGInfo;
-
 /** Store the tensor's metadata */
 class TensorInfo final : public ITensorInfo
 {
@@ -52,7 +48,7 @@ public:
     /** Allow instances of this class to be copy constructed */
     TensorInfo(const ITensorInfo &info);
     /** Allow instances of this class to be copy constructed */
-    TensorInfo(const TensorInfo &) = default;
+    TensorInfo(const TensorInfo &);
     /** Allow instances of this class to be copied */
     TensorInfo &operator=(const TensorInfo &) = default;
     /** Allow instances of this class to be move constructed */
@@ -115,15 +111,10 @@ public:
      * @param[in] data_type         Data type to use for each tensor element
      * @param[in] quantization_info The quantization settings for the tensor data.
      */
-    TensorInfo(const TensorShape &tensor_shape, size_t num_channels, DataType data_type, QuantizationInfo quantization_info);
-
-    /** Constructor
-     *
-     * @param[in] hog_info HOG's metadata used to allocate normalized HOG space
-     * @param[in] width    Width of the 2D tensor where the HOG descriptor will be computed on
-     * @param[in] height   Height of the 2D tensor where the HOG descriptor will be computed on
-     */
-    TensorInfo(const HOGInfo &hog_info, unsigned int width, unsigned int height);
+    TensorInfo(const TensorShape &tensor_shape,
+               size_t             num_channels,
+               DataType           data_type,
+               QuantizationInfo   quantization_info);
 
     /** Initialize the tensor info with just a format.
      *
@@ -147,7 +138,11 @@ public:
      * @param[in] offset_first_element_in_bytes Offset in bytes from the beginning of memory allocation to access the first element.
      * @param[in] total_size_in_bytes           Size in bytes of the memory allocation (including the offset to the first element).
      */
-    void init(const TensorShape &tensor_shape, Format format, const Strides &strides_in_bytes, size_t offset_first_element_in_bytes, size_t total_size_in_bytes);
+    void init(const TensorShape &tensor_shape,
+              Format             format,
+              const Strides     &strides_in_bytes,
+              size_t             offset_first_element_in_bytes,
+              size_t             total_size_in_bytes);
 
     /** Initialize the tensor info with just a format.
      *
@@ -175,15 +170,12 @@ public:
      * @param[in] offset_first_element_in_bytes Offset in bytes from the beginning of memory allocation to access the first element.
      * @param[in] total_size_in_bytes           Size in bytes of the memory allocation (including the offset to the first element).
      */
-    void init(const TensorShape &tensor_shape, size_t num_channels, DataType data_type, const Strides &strides_in_bytes, size_t offset_first_element_in_bytes,
-              size_t total_size_in_bytes);
-    /** Initialize the metadata structure for the given HOG's metadata
-     *
-     * @param[in] hog_info HOG's metadata used to allocate normalized HOG space
-     * @param[in] width    Width of the 2D tensor where the HOG descriptor will be computed on
-     * @param[in] height   Height of the 2D tensor where the HOG descriptor will be computed on
-     */
-    void init(const HOGInfo &hog_info, unsigned int width, unsigned int height);
+    void init(const TensorShape &tensor_shape,
+              size_t             num_channels,
+              DataType           data_type,
+              const Strides     &strides_in_bytes,
+              size_t             offset_first_element_in_bytes,
+              size_t             total_size_in_bytes);
     /** Initialize the metadata structure for the given tensor shape and single-plane format, (Padding is automatically calculated)
      *
      * @note The padding used by this method is really conservative so that the tensor can be used for most functions.
@@ -206,30 +198,22 @@ public:
      * @return Total allocation size including padding in bytes.
      */
     size_t init_auto_padding(const TensorShape &tensor_shape, size_t num_channels, DataType data_type);
-    /** Initialize the metadata structure for the given HOG's metadata
-     *
-     * @note init_auto_padding will be used for the tensor initialization.
-     *
-     * @param[in] hog_info HOG's metadata used to allocate normalized HOG space
-     * @param[in] width    Width of the 2D tensor where the HOG descriptor will be computed on
-     * @param[in] height   Height of the 2D tensor where the HOG descriptor will be computed on
-     *
-     * @return Total allocation size including padding in bytes.
-     */
-    size_t init_auto_padding(const HOGInfo &hog_info, unsigned int width, unsigned int height);
 
     // Inherited methods overridden:
     std::unique_ptr<ITensorInfo> clone() const override;
-    ITensorInfo &set_data_type(DataType data_type) override;
-    ITensorInfo &set_num_channels(int num_channels) override;
-    ITensorInfo &set_format(Format format) override;
-    ITensorInfo &set_tensor_shape(const TensorShape &shape) override;
-    ITensorInfo &set_quantization_info(const QuantizationInfo &quantization_info) override;
-    ITensorInfo &set_data_layout(const DataLayout &data_layout) override;
-    ITensorInfo &reset_padding() override;
-    bool         auto_padding() override;
-    bool extend_padding(const PaddingSize &padding) override;
-    size_t dimension(size_t index) const override
+    ITensorInfo                 &set_data_type(DataType data_type) override;
+    ITensorInfo                 &set_num_channels(int num_channels) override;
+    ITensorInfo                 &set_format(Format format) override;
+    ITensorInfo                 &set_tensor_shape(const TensorShape &shape) override;
+    ITensorInfo                 &set_tensor_dims_state(const TensorDimsState &state) override;
+    ITensorInfo                 &set_quantization_info(const QuantizationInfo &quantization_info) override;
+    ITensorInfo                 &set_data_layout(const DataLayout &data_layout) override;
+    ITensorInfo                 &reset_padding() override;
+    bool                         auto_padding() override;
+    ITensorInfo                 &set_lock_paddings(bool flag) override;
+    bool                         lock_paddings() const override;
+    bool                         extend_padding(const PaddingSize &padding) override;
+    size_t                       dimension(size_t index) const override
     {
         return _tensor_shape[index];
     }
@@ -246,7 +230,7 @@ public:
         return _offset_first_element_in_bytes;
     }
     int32_t offset_element_in_bytes(const Coordinates &pos) const override;
-    size_t element_size() const override
+    size_t  element_size() const override
     {
         return data_size_from_type(_data_type) * _num_channels;
     }
@@ -262,6 +246,10 @@ public:
     {
         return _tensor_shape;
     }
+    const TensorDimsState &tensor_dims_state() const override
+    {
+        return _dims_state;
+    }
     DataType data_type() const override
     {
         return _data_type;
@@ -288,16 +276,16 @@ public:
     }
     bool is_dynamic() const override
     {
-        return _is_dynamic;
+        return std::find(std::cbegin(_dims_state), std::cend(_dims_state), get_dynamic_state_value()) !=
+               std::cend(_dims_state);
     }
-    ITensorInfo &set_is_resizable(bool is_resizable) override
+    bool are_values_constant() const override
     {
-        _is_resizable = is_resizable;
-        return *this;
+        return _are_values_constant;
     }
-    ITensorInfo &set_is_dynamic(bool is_dynamic) override
+    ITensorInfo &set_is_resizable(bool is_resizable) override
     {
-        _is_dynamic = is_dynamic;
+        _is_resizable = is_resizable;
         return *this;
     }
     ValidRegion valid_region() const override
@@ -316,6 +304,21 @@ public:
     {
         return _data_layout;
     }
+    ITensorInfo &set_are_values_constant(bool are_values_constant) override
+    {
+        _are_values_constant = are_values_constant;
+        return *this;
+    }
+    ITensorInfo::Id id() const override
+    {
+        return _id;
+    }
+    ITensorInfo &set_id(ITensorInfo::Id id) override
+    {
+        _id = id;
+        return *this;
+    }
+    inline friend bool operator==(const TensorInfo &lhs, const TensorInfo &rhs);
 
 private:
     /** Calculates strides, offset and total size resulting from the specified padding around the XY plane.
@@ -329,14 +332,37 @@ private:
     Strides          _strides_in_bytes;
     size_t           _num_channels;
     TensorShape      _tensor_shape;
+    TensorDimsState  _dims_state;
     DataType         _data_type;
     Format           _format;
     bool             _is_resizable;
-    bool             _is_dynamic;
     ValidRegion      _valid_region;
     PaddingSize      _padding;
     QuantizationInfo _quantization_info;
     DataLayout       _data_layout;
+    bool             _are_values_constant;
+    ITensorInfo::Id  _id;
+    bool             _lock_paddings;
 };
+
+/** Check whether two tensor infos are equal.
+ *
+ * @param[in] lhs LHS tensor info.
+ * @param[in] rhs RHS tensor info.
+ *
+ * @return True if the given tensor infos are the same.
+ */
+inline bool operator==(const TensorInfo &lhs, const TensorInfo &rhs)
+{
+    return (lhs._total_size == rhs._total_size) &&
+           (lhs._offset_first_element_in_bytes == rhs._offset_first_element_in_bytes) &&
+           (lhs._strides_in_bytes == rhs._strides_in_bytes) && (lhs._num_channels == rhs._num_channels) &&
+           (lhs._tensor_shape == rhs._tensor_shape) && (lhs._dims_state == rhs._dims_state) &&
+           (lhs._data_type == rhs._data_type) && (lhs._format == rhs._format) &&
+           (lhs._is_resizable == rhs._is_resizable) && (lhs._valid_region == rhs._valid_region) &&
+           (lhs._padding == rhs._padding) && (lhs._quantization_info == rhs._quantization_info) &&
+           (lhs._data_layout == rhs._data_layout) && (lhs._are_values_constant == rhs._are_values_constant) &&
+           (lhs._id == rhs._id);
+}
 } // namespace arm_compute
 #endif /*ARM_COMPUTE_TENSORINFO_H */
diff --git a/arm_compute/core/TensorShape.h b/arm_compute/core/TensorShape.h
index 57d8f6cf63..c1707e262f 100644
--- a/arm_compute/core/TensorShape.h
+++ b/arm_compute/core/TensorShape.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -36,7 +36,7 @@
 namespace arm_compute
 {
 /** Shape of a tensor */
-class TensorShape : public Dimensions<uint32_t>
+class TensorShape : public Dimensions<size_t>
 {
 public:
     /** Constructor to initialize the tensor shape.
@@ -44,11 +44,10 @@ public:
      * @param[in] dims Values to initialize the dimensions.
      */
     template <typename... Ts>
-    TensorShape(Ts... dims)
-        : Dimensions{ dims... }
+    TensorShape(Ts... dims) : Dimensions{dims...}
     {
         // Initialize unspecified dimensions to 1
-        if(_num_dimensions > 0)
+        if (_num_dimensions > 0)
         {
             std::fill(_id.begin() + _num_dimensions, _id.end(), 1);
         }
@@ -71,14 +70,15 @@ public:
      *
      * @param[in] dimension            Dimension for which the value is set.
      * @param[in] value                Value to be set for the dimension.
-     * @param[in] apply_dim_correction Flag to state whether apply dimension correction after setting one dimension. E.g. when permuting NCHW -> NHWC, 1x1x2 would become 2x1x1, but _num_dimensions should be 3 rather than 1.
+     * @param[in] apply_dim_correction (Optional) Flag to state whether to apply dimension correction after setting one dimension. E.g. when permuting NCHW -> NHWC, 1x1x2 would become 2x1x1, but _num_dimensions should be 3 rather than 1.
+     * @param[in] increase_dim_unit    (Optional) Set to true if new unit dimensions increase the number of dimensions of the shape.
      *
      * @return *this.
      */
-    TensorShape &set(size_t dimension, size_t value, bool apply_dim_correction = true)
+    TensorShape &set(size_t dimension, size_t value, bool apply_dim_correction = true, bool increase_dim_unit = true)
     {
         // Clear entire shape if one dimension is zero
-        if(value == 0)
+        if (value == 0)
         {
             _num_dimensions = 0;
             std::fill(_id.begin(), _id.end(), 0);
@@ -90,10 +90,10 @@ public:
 
             // Set the specified dimension and increase the number of dimensions if
             // necessary
-            Dimensions::set(dimension, value);
+            Dimensions::set(dimension, value, increase_dim_unit);
 
             // Correct number dimensions to ignore trailing dimensions of size 1
-            if(apply_dim_correction)
+            if (apply_dim_correction)
             {
                 apply_dimension_correction();
             }
@@ -105,9 +105,10 @@ public:
      *
      * @note The upper dimensions of the tensor shape will be shifted down by 1
      *
-     * @param[in] n Dimension to remove
+     * @param[in] n                    Dimension to remove
+     * @param[in] apply_dim_correction (Optional) Flag stating whether to apply dimension correction (removing trailing dimensions of size 1) after removing a dimension.
      */
-    void remove_dimension(size_t n)
+    void remove_dimension(size_t n, bool apply_dim_correction = true)
     {
         ARM_COMPUTE_ERROR_ON(_num_dimensions < 1);
         ARM_COMPUTE_ERROR_ON(n >= _num_dimensions);
@@ -121,7 +122,10 @@ public:
         std::fill(_id.begin() + _num_dimensions, _id.end(), 1);
 
         // Correct number dimensions to ignore trailing dimensions of size 1
-        apply_dimension_correction();
+        if (apply_dim_correction)
+        {
+            apply_dimension_correction();
+        }
     }
 
     /** Collapse the first n dimensions.
@@ -207,26 +211,26 @@ public:
      * @return The broadcasted shape or an empty shape if the shapes are not broadcast compatible.
      */
     template <typename... Shapes>
-    static TensorShape broadcast_shape(const Shapes &... shapes)
+    static TensorShape broadcast_shape(const Shapes &...shapes)
     {
         TensorShape bc_shape;
 
-        auto broadcast = [&bc_shape](const TensorShape & other)
+        auto broadcast = [&bc_shape](const TensorShape &other)
         {
-            if(bc_shape.num_dimensions() == 0)
+            if (bc_shape.num_dimensions() == 0)
             {
                 bc_shape = other;
             }
-            else if(other.num_dimensions() != 0)
+            else if (other.num_dimensions() != 0)
             {
-                for(size_t d = 0; d < TensorShape::num_max_dimensions; ++d)
+                for (size_t d = 0; d < TensorShape::num_max_dimensions; ++d)
                 {
                     const size_t dim_min = std::min(bc_shape[d], other[d]);
                     const size_t dim_max = std::max(bc_shape[d], other[d]);
 
-                    if((dim_min != 1) && (dim_min != dim_max))
+                    if ((dim_min != 1) && (dim_min != dim_max))
                     {
-                        bc_shape = TensorShape{ 0U };
+                        bc_shape = TensorShape{0U};
                         break;
                     }
 
@@ -244,9 +248,9 @@ private:
     /** Remove trailing dimensions of size 1 from the reported number of dimensions. */
     void apply_dimension_correction()
     {
-        for(int i = static_cast<int>(_num_dimensions) - 1; i > 0; --i)
+        for (int i = static_cast<int>(_num_dimensions) - 1; i > 0; --i)
         {
-            if(_id[i] == 1)
+            if (_id[i] == 1)
             {
                 --_num_dimensions;
             }
@@ -257,5 +261,5 @@ private:
         }
     }
 };
-}
+} // namespace arm_compute
 #endif /*ARM_COMPUTE_TENSORSHAPE_H*/
diff --git a/arm_compute/core/TracePoint.h b/arm_compute/core/TracePoint.h
deleted file mode 100644
index 6951d6d5ef..0000000000
--- a/arm_compute/core/TracePoint.h
+++ /dev/null
@@ -1,134 +0,0 @@
-/*
- * Copyright (c) 2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_TRACEPOINT_H
-#define ARM_COMPUTE_TRACEPOINT_H
-
-#include <string>
-#include <type_traits>
-#include <vector>
-
-namespace arm_compute
-{
-#ifdef ARM_COMPUTE_TRACING_ENABLED
-#define CREATE_TRACEPOINT(...) TracePoint __tp(__VA_ARGS__)
-
-/** Class used to dump configuration values in functions and kernels  */
-class TracePoint final
-{
-public:
-    /** Layer types */
-    enum class Layer
-    {
-        CORE,
-        RUNTIME
-    };
-    /** struct describing the arguments for a tracepoint */
-    struct Args final
-    {
-        std::vector<std::string> args{};
-    };
-    /** Constructor
-     *
-     * @param[in] source     type of layer for the tracepoint
-     * @param[in] class_name the name of the class creating the tracepoint
-     * @param[in] object     a pointer to the actual object owning the tracepoint
-     * @param[in] args       a struct describing all the arguments used in the call to the configure() method
-     *
-     */
-    TracePoint(Layer source, const std::string &class_name, void *object, Args &&args);
-    /** Destructor */
-    ~TracePoint();
-
-private:
-    static int g_depth; /**< current depth */
-    int        _depth;  /**< tracepoint depth */
-};
-
-/** Operator to write an argument to a @ref TracePoint
- *
- * @param[in] tp  Tracepoint to be used for writing
- * @param[in] arg Argument to be written in the tracepoint
- *
- * @return A referece to the updated tracepoint
- */
-template <typename T>
-TracePoint::Args &&operator<<(typename std::enable_if < !std::is_pointer<T>::value, TracePoint::Args >::type &&tp, const T &arg);
-template <typename T>
-TracePoint::Args &&operator<<(TracePoint::Args &&tp, const T *arg);
-
-#define CONST_REF_CLASS(type)                                             \
-    template <>                                                           \
-    TracePoint::Args &&operator<<(TracePoint::Args &&tp, const type &arg) \
-    {                                                                     \
-        ARM_COMPUTE_UNUSED(tp);                                           \
-        tp.args.push_back(#type "(" + to_string(arg) + ")");              \
-        return std::move(tp);                                             \
-    }
-
-#define CONST_PTR_ADDRESS(type)                                           \
-    template <>                                                           \
-    TracePoint::Args &&operator<<(TracePoint::Args &&tp, const type *arg) \
-    {                                                                     \
-        ARM_COMPUTE_UNUSED(tp);                                           \
-        tp.args.push_back(#type "*(" + to_ptr_string(arg) + ")");         \
-        return std::move(tp);                                             \
-    }
-#define CONST_PTR_CLASS(type)                                             \
-    template <>                                                           \
-    TracePoint::Args &&operator<<(TracePoint::Args &&tp, const type *arg) \
-    {                                                                     \
-        ARM_COMPUTE_UNUSED(tp);                                           \
-        if(arg)                                                           \
-            tp.args.push_back(#type "(" + to_string(*arg) + ")");         \
-        else                                                              \
-            tp.args.push_back(#type "( nullptr )");                       \
-        return std::move(tp);                                             \
-    }
-
-#define CONST_REF_SIMPLE(type)                                               \
-    template <>                                                              \
-    TracePoint::Args &&operator<<(TracePoint::Args &&tp, const type &arg)    \
-    {                                                                        \
-        ARM_COMPUTE_UNUSED(tp);                                              \
-        tp.args.push_back(#type "(" + support::cpp11::to_string(arg) + ")"); \
-        return std::move(tp);                                                \
-    }
-
-#define TRACE_TO_STRING(type)              \
-    std::string to_string(const type &arg) \
-    {                                      \
-        ARM_COMPUTE_UNUSED(arg);           \
-        return "";                         \
-    }
-#else /* ARM_COMPUTE_TRACING_ENABLED */
-#define CREATE_TRACEPOINT(...)
-#define CONST_REF_CLASS(type)
-#define CONST_PTR_ADDRESS(type)
-#define CONST_PTR_CLASS(type)
-#define CONST_REF_SIMPLE(type)
-#define TRACE_TO_STRING(type)
-#endif /* ARM_COMPUTE_TRACING_ENABLED */
-} //namespace arm_compute
-
-#endif /* ARM_COMPUTE_TRACEPOINT_H */
diff --git a/arm_compute/core/Types.h b/arm_compute/core/Types.h
index 4e73edba4b..f2f60c150e 100644
--- a/arm_compute/core/Types.h
+++ b/arm_compute/core/Types.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,17 +21,52 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_TYPES_H
-#define ARM_COMPUTE_TYPES_H
-
+#ifndef ACL_ARM_COMPUTE_CORE_TYPES_H
+#define ACL_ARM_COMPUTE_CORE_TYPES_H
+
+/** The following symbols have been moved to arm_compute/core/CoreTypes.h:
+ * half
+ * PermutationVector
+ * Format
+ * DataType
+ * DataLayout
+ * DataLayoutDimension
+ * PadStrideInfo
+ * WeightFormat
+ * Channel
+ * DimensionRoundingType
+ */
+#include "arm_compute/core/CoreTypes.h"
+/** The following symbols have been moved to arm_compute/function_info/ActivationLayerInfo.h:
+ * ActivationFunction
+ * ActivationLayerInfo
+ */
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+/** The following symbols have been moved to arm_compute/function_info/ConvolutionInfo.h:
+ * ConvolutionInfo
+ */
+#include "arm_compute/function_info/ConvolutionInfo.h"
+/** The following symbols have been moved to arm_compute/function_info/FullyConnectedLayerInfo.h:
+ * FullyConnectedLayerInfo
+ */
+#include "arm_compute/function_info/FullyConnectedLayerInfo.h"
+/** The following symbols have been moved to arm_compute/function_info/GEMMInfo.h:
+ * GEMMLowpOutputStageType
+ * GEMMLowpOutputStageInfo
+ * GEMMInfo
+ */
+#include "arm_compute/function_info/GEMMInfo.h"
+/** The following symbols have been moved to arm_compute/function_info/MatMulInfo.h (included further below):
+ * MatMulInfo
+ */
 #include "arm_compute/core/Coordinates.h"
-#include "arm_compute/core/QuantizationInfo.h"
 #include "arm_compute/core/Size2D.h"
-#include "arm_compute/core/Strides.h"
+#include "arm_compute/core/Size3D.h"
 #include "arm_compute/core/TensorShape.h"
 #include "arm_compute/core/utils/misc/Macros.h"
+#include "arm_compute/function_info/MatMulInfo.h"
+
 #include "support/Bfloat16.h"
-#include "support/Half.h"
 
 #include <cmath>
 #include <cstddef>
@@ -42,62 +77,9 @@
 
 namespace arm_compute
 {
-/** 16-bit floating point type */
-using half = half_float::half;
-
-/** Permutation vector */
-using PermutationVector = Strides;
 /** Bidirectional strides */
 using BiStrides = Coordinates;
 
-/** Image colour formats */
-enum class Format
-{
-    UNKNOWN,  /**< Unknown image format */
-    U8,       /**< 1 channel, 1 U8 per channel */
-    S16,      /**< 1 channel, 1 S16 per channel */
-    U16,      /**< 1 channel, 1 U16 per channel */
-    S32,      /**< 1 channel, 1 S32 per channel */
-    U32,      /**< 1 channel, 1 U32 per channel */
-    BFLOAT16, /**< 16-bit brain floating-point number */
-    F16,      /**< 1 channel, 1 F16 per channel */
-    F32,      /**< 1 channel, 1 F32 per channel */
-    UV88,     /**< 2 channel, 1 U8 per channel */
-    RGB888,   /**< 3 channels, 1 U8 per channel */
-    RGBA8888, /**< 4 channels, 1 U8 per channel */
-    YUV444,   /**< A 3 plane of 8 bit 4:4:4 sampled Y, U, V planes */
-    YUYV422,  /**< A single plane of 32-bit macro pixel of Y0, U0, Y1, V0 bytes */
-    NV12,     /**< A 2 plane YUV format of Luma (Y) and interleaved UV data at 4:2:0 sampling */
-    NV21,     /**< A 2 plane YUV format of Luma (Y) and interleaved VU data at 4:2:0 sampling */
-    IYUV,     /**< A 3 plane of 8-bit 4:2:0 sampled Y, U, V planes */
-    UYVY422   /**< A single plane of 32-bit macro pixel of U0, Y0, V0, Y1 byte */
-};
-
-/** Available data types */
-enum class DataType
-{
-    UNKNOWN,            /**< Unknown data type */
-    U8,                 /**< unsigned 8-bit number */
-    S8,                 /**< signed 8-bit number */
-    QSYMM8,             /**< quantized, symmetric fixed-point 8-bit number */
-    QASYMM8,            /**< quantized, asymmetric fixed-point 8-bit number unsigned */
-    QASYMM8_SIGNED,     /**< quantized, asymmetric fixed-point 8-bit number signed */
-    QSYMM8_PER_CHANNEL, /**< quantized, symmetric per channel fixed-point 8-bit number */
-    U16,                /**< unsigned 16-bit number */
-    S16,                /**< signed 16-bit number */
-    QSYMM16,            /**< quantized, symmetric fixed-point 16-bit number */
-    QASYMM16,           /**< quantized, asymmetric fixed-point 16-bit number */
-    U32,                /**< unsigned 32-bit number */
-    S32,                /**< signed 32-bit number */
-    U64,                /**< unsigned 64-bit number */
-    S64,                /**< signed 64-bit number */
-    BFLOAT16,           /**< 16-bit brain floating-point number */
-    F16,                /**< 16-bit floating-point number */
-    F32,                /**< 32-bit floating-point number */
-    F64,                /**< 64-bit floating-point number */
-    SIZET               /**< size_t */
-};
-
 /** Available Sampling Policies */
 enum class SamplingPolicy
 {
@@ -105,42 +87,15 @@ enum class SamplingPolicy
     TOP_LEFT /**< Samples are taken at pixel top left corner */
 };
 
-/** Constant value of the border pixels when using BorderMode::CONSTANT */
-constexpr uint8_t CONSTANT_BORDER_VALUE = 199;
-
-/** Constant value used to indicate a half-scale pyramid */
-constexpr float SCALE_PYRAMID_HALF = 0.5f;
-
-/** Constant value used to indicate a ORB scaled pyramid */
-constexpr float SCALE_PYRAMID_ORB = 8.408964152537146130583778358414e-01;
-
-/** [DataLayout enum definition] **/
-
-/** Supported tensor data layouts */
-enum class DataLayout
-{
-    UNKNOWN, /**< Unknown data layout */
-    NCHW,    /**< Num samples, channels, height, width */
-    NHWC     /**< Num samples, height, width, channels */
-};
-/** [DataLayout enum definition] **/
-
-/** Supported tensor data layout dimensions */
-enum class DataLayoutDimension
-{
-    CHANNEL, /**< channel */
-    HEIGHT,  /**< height */
-    WIDTH,   /**< width */
-    BATCHES  /**< batches */
-};
-
 /** Available ConvolutionMethod*/
 enum class ConvolutionMethod
 {
-    GEMM,     /**< Convolution using GEMM */
-    DIRECT,   /**< Direct convolution */
-    WINOGRAD, /**< Convolution using Winograd */
-    FFT       /**< Convolution using FFT */
+    GEMM,        /**< Convolution using GEMM */
+    GEMM_CONV2D, /**< Direct 2D GEMM convolution */
+    DIRECT,      /**< Direct convolution */
+    INDIRECT,    /**< Indirect convolution */
+    WINOGRAD,    /**< Convolution using Winograd */
+    FFT          /**< Convolution using FFT */
 };
 
 /** Available DepthwiseConvolutionFunction*/
@@ -153,8 +108,9 @@ enum class DepthwiseConvolutionFunction
 /** Available DeconvolutionMethod*/
 enum class DeconvolutionMethod
 {
-    GEMM,   /**< Deconvolution using GEMM */
-    DIRECT, /**< Direct deconvolution */
+    GEMM,          /**< Deconvolution using GEMM */
+    DIRECT,        /**< Direct deconvolution */
+    UPSCALE_CONV2D /**< Deconvolution with Upscaling */
 };
 
 /** Available FuseBatchNormalizationType*/
@@ -187,8 +143,7 @@ enum class ComparisonOperation
 struct ValidRegion
 {
     /** Default constructor */
-    ValidRegion()
-        : anchor{}, shape{}
+    ValidRegion() : anchor{}, shape{}
     {
     }
 
@@ -209,8 +164,7 @@ struct ValidRegion
      * @param[in] a_shape   Shape of the valid region.
      *
      */
-    ValidRegion(const Coordinates &an_anchor, const TensorShape &a_shape)
-        : anchor{ an_anchor }, shape{ a_shape }
+    ValidRegion(const Coordinates &an_anchor, const TensorShape &a_shape) : anchor{an_anchor}, shape{a_shape}
     {
         anchor.set_num_dimensions(std::max(anchor.num_dimensions(), shape.num_dimensions()));
     }
@@ -223,7 +177,7 @@ struct ValidRegion
      *
      */
     ValidRegion(const Coordinates &an_anchor, const TensorShape &a_shape, size_t num_dimensions)
-        : anchor{ an_anchor }, shape{ a_shape }
+        : anchor{an_anchor}, shape{a_shape}
     {
         ARM_COMPUTE_ERROR_ON(num_dimensions < std::max(anchor.num_dimensions(), shape.num_dimensions()));
         anchor.set_num_dimensions(num_dimensions);
@@ -256,9 +210,22 @@ struct ValidRegion
         return *this;
     }
 
+    /** Check whether two valid regions are equal.
+     *
+     * @param[in] lhs LHS valid region
+     * @param[in] rhs RHS valid region
+     *
+     * @return True if both the anchors and the shapes of the two regions compare equal.
+     */
+    inline friend bool operator==(const ValidRegion &lhs, const ValidRegion &rhs);
+
     Coordinates anchor; /**< Anchor for the start of the valid region. */
     TensorShape shape;  /**< Shape of the valid region. */
 };
+inline bool operator==(const ValidRegion &lhs, const ValidRegion &rhs)
+{
+    return (lhs.anchor == rhs.anchor) && (lhs.shape == rhs.shape);
+}
 
 /** Methods available to handle borders */
 enum class BorderMode
@@ -272,26 +239,24 @@ enum class BorderMode
 struct BorderSize
 {
     /** Empty border, i.e. no border */
-    constexpr BorderSize()
-        : top{ 0 }, right{ 0 }, bottom{ 0 }, left{ 0 }
+    constexpr BorderSize() noexcept : top{0}, right{0}, bottom{0}, left{0}
     {
     }
 
     /** Border with equal size around the 2D plane */
-    explicit constexpr BorderSize(unsigned int size)
-        : top{ size }, right{ size }, bottom{ size }, left{ size }
+    explicit constexpr BorderSize(unsigned int size) noexcept : top{size}, right{size}, bottom{size}, left{size}
     {
     }
 
     /** Border with same size for top/bottom and left/right */
     constexpr BorderSize(unsigned int top_bottom, unsigned int left_right)
-        : top{ top_bottom }, right{ left_right }, bottom{ top_bottom }, left{ left_right }
+        : top{top_bottom}, right{left_right}, bottom{top_bottom}, left{left_right}
     {
     }
 
     /** Border with different sizes */
     constexpr BorderSize(unsigned int top, unsigned int right, unsigned int bottom, unsigned int left)
-        : top{ top }, right{ right }, bottom{ bottom }, left{ left }
+        : top{top}, right{right}, bottom{bottom}, left{left}
     {
     }
 
@@ -337,6 +302,28 @@ struct BorderSize
         return size;
     }
 
+    /** Check equality with another BorderSize, comparing top, right, bottom and left
+     *
+     * @param[in] rhs Border size to compare against
+     *
+     * @return True if all four sides are equal
+     */
+    bool operator==(const BorderSize &rhs) const
+    {
+        return (top == rhs.top) && (right == rhs.right) && (bottom == rhs.bottom) && (left == rhs.left);
+    }
+
+    /** Check non-equality with another BorderSize (negation of operator==)
+     *
+     * @param[in] rhs Border size to compare against
+     *
+     * @return True if the two border sizes are not equal
+     */
+    bool operator!=(const BorderSize &rhs) const
+    {
+        return !(*this == rhs);
+    }
+
     /** Limit this border size.
      *
      * @param[in] limit Border size to limit this border size to.
@@ -358,7 +345,11 @@ struct BorderSize
 /** Container for 2D padding size */
 using PaddingSize = BorderSize;
 
-/** Policy to handle overflow */
+/** Policy to handle integer overflow
+ *  @note: This is ignored by floating point operations where the overflow behavior adheres to the IEEE-754 standard
+ *         which states that in case of overflow ±infinity is returned for the round-to-nearest modes (and follows the
+ *         rounding rules for the directed rounding modes) by default.
+ */
 enum class ConvertPolicy
 {
     WRAP,    /**< Wrap around */
@@ -370,7 +361,7 @@ enum class InterpolationPolicy
 {
     NEAREST_NEIGHBOR, /**< Output values are defined to match the source pixel whose center is nearest to the sample position */
     BILINEAR,         /**< Output values are defined by bilinear interpolation between the pixels */
-    AREA,             /**< Output values are determined by averaging the source pixels whose areas fall under the area of the destination pixel, projected onto the source image */
+    AREA, /**< Output values are determined by averaging the source pixels whose areas fall under the area of the destination pixel, projected onto the source image */
 };
 
 /** Bilinear Interpolation method used by LKTracker */
@@ -380,53 +371,6 @@ enum class BilinearInterpolation
     BILINEAR_SCHARR   /**< Scharr method */
 };
 
-/** Threshold mode */
-enum class ThresholdType
-{
-    BINARY, /**< Threshold with one value */
-    RANGE   /**< Threshold with two values*/
-};
-
-/** Termination criteria */
-enum class Termination
-{
-    TERM_CRITERIA_EPSILON,    /**< Terminate when within epsilon of a threshold */
-    TERM_CRITERIA_ITERATIONS, /**< Terminate after a maximum number of iterations */
-    TERM_CRITERIA_BOTH        /**< Terminate on whichever of the other conditions occurs first */
-};
-
-/** Magnitude calculation type. */
-enum class MagnitudeType
-{
-    L1NORM, /**< L1 normalization type */
-    L2NORM  /**< L2 normalization type */
-};
-
-/** Phase calculation type.
- *
- * @note When PhaseType == SIGNED, each angle is mapped to the range 0 to 255 inclusive otherwise angles between 0 and 180
- */
-enum class PhaseType
-{
-    SIGNED,  /**< Angle range: [0, 360] */
-    UNSIGNED /**< Angle range: [0, 180] */
-};
-
-/** Keypoint type */
-struct KeyPoint
-{
-    int32_t x{ 0 };               /**< X coordinates */
-    int32_t y{ 0 };               /**< Y coordinates */
-    float   strength{ 0.f };      /**< Strength of the point */
-    float   scale{ 0.f };         /**< Scale initialized to 0 by the corner detector */
-    float   orientation{ 0.f };   /**< Orientation initialized to 0 by the corner detector */
-    int32_t tracking_status{ 0 }; /**< Status initialized to 1 by the corner detector, set to 0 when the point is lost */
-    float   error{ 0.f };         /**< Tracking error initialized to 0 by the corner detector */
-};
-
-/** Internal key point */
-using InternalKeypoint = std::tuple<float, float, float>; /* x,y,strength */
-
 /** Rectangle type */
 struct Rectangle
 {
@@ -460,40 +404,6 @@ using PaddingList = std::vector<PaddingInfo>;
 /** Information to produce a tiled version of a Tensor */
 using Multiples = std::vector<uint32_t>;
 
-/** Available channels */
-enum class Channel
-{
-    UNKNOWN, /** Unknown channel format */
-    C0,      /**< First channel (used by formats with unknown channel types). */
-    C1,      /**< Second channel (used by formats with unknown channel types). */
-    C2,      /**< Third channel (used by formats with unknown channel types). */
-    C3,      /**< Fourth channel (used by formats with unknown channel types). */
-    R,       /**< Red channel. */
-    G,       /**< Green channel. */
-    B,       /**< Blue channel. */
-    A,       /**< Alpha channel. */
-    Y,       /**< Luma channel. */
-    U,       /**< Cb/U channel. */
-    V        /**< Cr/V/Value channel. */
-};
-
-/** Available matrix patterns */
-enum class MatrixPattern
-{
-    BOX,   /**< Box pattern matrix. */
-    CROSS, /**< Cross pattern matrix. */
-    DISK,  /**< Disk pattern matrix. */
-    OTHER  /**< Any other matrix pattern. */
-};
-
-/** Available non linear functions. */
-enum class NonLinearFilterFunction : unsigned
-{
-    MEDIAN = 0, /**< Non linear median filter. */
-    MIN    = 1, /**< Non linear erode. */
-    MAX    = 2, /**< Non linear dilate. */
-};
-
 /** Available reduction operations */
 enum class ReductionOperation
 {
@@ -523,13 +433,23 @@ enum class ArithmeticOperation
 /** Available element wise unary operations */
 enum class ElementWiseUnary
 {
-    RSQRT, /**< Reverse square root */
-    EXP,   /**< Exponential */
-    NEG,   /**< Negate */
-    LOG,   /**< Natural Logarithm */
-    ABS,   /**< Absolute value */
-    SIN,   /**< Sine */
-    ROUND, /**< Round */
+    RSQRT,       /**< Reciprocal square root */
+    EXP,         /**< Exponential */
+    NEG,         /**< Negate */
+    LOG,         /**< Natural Logarithm */
+    ABS,         /**< Absolute value */
+    SIN,         /**< Sine */
+    ROUND,       /**< Round */
+    LOGICAL_NOT, /**< Logical Not */
+};
+
+/** Available bitwise operations */
+enum class BitwiseOperation
+{
+    AND, /**< Bitwise AND operation */
+    NOT, /**< Bitwise NOT operation */
+    OR,  /**< Bitwise OR operation */
+    XOR, /**< Bitwise XOR operation */
+};
 
 /** The normalization type used for the normalization layer */
@@ -540,14 +460,6 @@ enum class NormType
     CROSS_MAP  /**< Normalization applied cross maps */
 };
 
-/** Normalization type for Histogram of Oriented Gradients (HOG) */
-enum class HOGNormType
-{
-    L2_NORM    = 1, /**< L2-norm */
-    L2HYS_NORM = 2, /**< L2-norm followed by clipping */
-    L1_NORM    = 3  /**< L1 norm */
-};
-
 /** Detection window used for the object detection. The detection window keeps the following information:
  *
  *  -# Geometry of the rectangular window (x/y of top-left corner and width/height)
@@ -556,21 +468,12 @@ enum class HOGNormType
  */
 struct DetectionWindow
 {
-    uint16_t x{ 0 };         /**< Top-left x coordinate */
-    uint16_t y{ 0 };         /**< Top-left y coordinate */
-    uint16_t width{ 0 };     /**< Width of the detection window */
-    uint16_t height{ 0 };    /**< Height of the detection window */
-    uint16_t idx_class{ 0 }; /**< Index of the class */
-    float    score{ 0.f };   /**< Confidence value for the detection window */
-};
-
-/** Dimension rounding type when down-scaling on CNNs
- * @note Used in pooling and convolution layer
- */
-enum class DimensionRoundingType
-{
-    FLOOR, /**< Floor rounding */
-    CEIL   /**< Ceil rounding */
+    uint16_t x{0};         /**< Top-left x coordinate */
+    uint16_t y{0};         /**< Top-left y coordinate */
+    uint16_t width{0};     /**< Width of the detection window */
+    uint16_t height{0};    /**< Height of the detection window */
+    uint16_t idx_class{0}; /**< Index of the class */
+    float    score{0.f};   /**< Confidence value for the detection window */
 };
 
 /** Available pooling types */
@@ -607,12 +510,28 @@ public:
      * @param[in] im_width                 (Optional) Boxes whose centers (on the x axis) is beyond im_width will be filtered. Defaults to 1
      * @param[in] im_height                (Optional) Boxes whose centers (on the y axis) is beyond im_height will be filtered. Defaults to 1
      */
-    BoxNMSLimitInfo(float score_thresh = 0.05f, float nms = 0.3f,
-                    int detections = 100, bool soft_nms_enabled = false,
-                    NMSType soft_nms_method = NMSType::LINEAR,
-                    float soft_nms_sigma = 0.5f, float soft_nms_min_score_thres = 0.001f, bool suppress_size = false, float min_size = 1.0f, float im_width = 1.0f, float im_height = 1.0f)
-        : _score_thresh(score_thresh), _nms(nms), _detections_per_im(detections), _soft_nms_enabled(soft_nms_enabled), _soft_nms_method(soft_nms_method), _soft_nms_sigma(soft_nms_sigma),
-          _soft_nms_min_score_thres(soft_nms_min_score_thres), _suppress_size(suppress_size), _min_size(min_size), _im_width(im_width), _im_height(im_height)
+    BoxNMSLimitInfo(float   score_thresh             = 0.05f,
+                    float   nms                      = 0.3f,
+                    int     detections               = 100,
+                    bool    soft_nms_enabled         = false,
+                    NMSType soft_nms_method          = NMSType::LINEAR,
+                    float   soft_nms_sigma           = 0.5f,
+                    float   soft_nms_min_score_thres = 0.001f,
+                    bool    suppress_size            = false,
+                    float   min_size                 = 1.0f,
+                    float   im_width                 = 1.0f,
+                    float   im_height                = 1.0f)
+        : _score_thresh(score_thresh),
+          _nms(nms),
+          _detections_per_im(detections),
+          _soft_nms_enabled(soft_nms_enabled),
+          _soft_nms_method(soft_nms_method),
+          _soft_nms_sigma(soft_nms_sigma),
+          _soft_nms_min_score_thres(soft_nms_min_score_thres),
+          _suppress_size(suppress_size),
+          _min_size(min_size),
+          _im_width(im_width),
+          _im_height(im_height)
     {
     }
     /** Get the score threshold */
@@ -686,120 +605,42 @@ private:
 };
 
 /** Padding and stride information class */
-class PadStrideInfo
+/** Padding information for 2D operations like Conv2d */
+struct Padding2D
 {
-public:
-    /** Constructor
-     *
-     * @param[in] stride_x (Optional) Stride, in elements, across x. Defaults to 1.
-     * @param[in] stride_y (Optional) Stride, in elements, across y. Defaults to 1.
-     * @param[in] pad_x    (Optional) Padding, in elements, across x. Defaults to 0.
-     * @param[in] pad_y    (Optional) Padding, in elements, across y. Defaults to 0.
-     * @param[in] round    (Optional) Dimensions rounding. Defaults to @ref FLOOR.
-     */
-    PadStrideInfo(unsigned int stride_x = 1, unsigned int stride_y = 1,
-                  unsigned int pad_x = 0, unsigned int pad_y = 0,
-                  DimensionRoundingType round = DimensionRoundingType::FLOOR)
-        : _stride(std::make_pair(stride_x, stride_y)),
-          _pad_left(pad_x),
-          _pad_top(pad_y),
-          _pad_right(pad_x),
-          _pad_bottom(pad_y),
-          _round_type(round)
+    Padding2D() = default;
+    Padding2D(size_t left, size_t right, size_t top, size_t bottom) : left(left), right(right), top(top), bottom(bottom)
     {
     }
-    /** Constructor
-     *
-     * @param[in] stride_x   Stride, in elements, across x.
-     * @param[in] stride_y   Stride, in elements, across y.
-     * @param[in] pad_left   Padding across x on the left, in elements.
-     * @param[in] pad_top    Padding across y on the top, in elements.
-     * @param[in] pad_right  Padding across x on the right, in elements.
-     * @param[in] pad_bottom Padding across y on the bottom, in elements.
-     * @param[in] round      Dimensions rounding.
-     */
-    PadStrideInfo(unsigned int stride_x, unsigned int stride_y,
-                  unsigned int pad_left, unsigned int pad_right,
-                  unsigned int pad_top, unsigned int pad_bottom,
-                  DimensionRoundingType round)
-        : _stride(std::make_pair(stride_x, stride_y)),
-          _pad_left(pad_left),
-          _pad_top(pad_top),
-          _pad_right(pad_right),
-          _pad_bottom(pad_bottom),
-          _round_type(round)
-    {
-    }
-    /** Get the stride.
-     *
-     * @return a pair: stride x, stride y.
-     */
-    std::pair<unsigned int, unsigned int> stride() const
-    {
-        return _stride;
-    }
-    /** Check whether the padding is symmetric.
-     *
-     * @return True if the padding is symmetric.
-     */
-    bool padding_is_symmetric() const
-    {
-        return (_pad_left == _pad_right) && (_pad_top == _pad_bottom);
-    }
-    /** Get the padding.
-     *
-     * @note This should only be used when the padding is symmetric.
-     *
-     * @return a pair: padding left/right, padding top/bottom
-     */
-    std::pair<unsigned int, unsigned int> pad() const
-    {
-        //this accessor should be used only when padding is symmetric
-        ARM_COMPUTE_ERROR_ON(!padding_is_symmetric());
-        return std::make_pair(_pad_left, _pad_top);
-    }
+    size_t left   = {0}; /**<  Padding across the width dimension on the left, in elements. */
+    size_t right  = {0}; /**<  Padding across the width dimension on the right, in elements. */
+    size_t top    = {0}; /**<  Padding across the height dimension on the top, in elements. */
+    size_t bottom = {0}; /**<  Padding across the height dimension on the bottom, in elements. */
+};
 
-    /** Get the left padding */
-    unsigned int pad_left() const
-    {
-        return _pad_left;
-    }
-    /** Get the right padding */
-    unsigned int pad_right() const
-    {
-        return _pad_right;
-    }
-    /** Get the top padding */
-    unsigned int pad_top() const
-    {
-        return _pad_top;
-    }
-    /** Get the bottom padding */
-    unsigned int pad_bottom() const
+/** Padding information for 3D operations like Conv3d */
+struct Padding3D
+{
+    Padding3D() noexcept
     {
-        return _pad_bottom;
     }
 
-    /** Get the rounding type */
-    DimensionRoundingType round() const
+    Padding3D(size_t pad_x, size_t pad_y, size_t pad_z)
+        : left(pad_x), right(pad_x), top(pad_y), bottom(pad_y), front(pad_z), back(pad_z)
     {
-        return _round_type;
     }
 
-    /** Check whether this has any padding */
-    bool has_padding() const
+    Padding3D(size_t left, size_t right, size_t top, size_t bottom, size_t front, size_t back)
+        : left(left), right(right), top(top), bottom(bottom), front(front), back(back)
     {
-        return (_pad_left != 0 || _pad_top != 0 || _pad_right != 0 || _pad_bottom != 0);
     }
 
-private:
-    std::pair<unsigned int, unsigned int> _stride;
-    unsigned int _pad_left;
-    unsigned int _pad_top;
-    unsigned int _pad_right;
-    unsigned int _pad_bottom;
-
-    DimensionRoundingType _round_type;
+    size_t left   = {0}; /**<  Padding across the width dimension on the left, in elements. */
+    size_t right  = {0}; /**<  Padding across the width dimension on the right, in elements. */
+    size_t top    = {0}; /**<  Padding across the height dimension on the top, in elements. */
+    size_t bottom = {0}; /**<  Padding across the height dimension on the bottom, in elements. */
+    size_t front  = {0}; /**<  Padding across the depth dimension on the front, in elements. */
+    size_t back   = {0}; /**<  Padding across the depth dimension on the back, in elements. */
 };
 
 /** PriorBox layer info */
@@ -831,9 +672,15 @@ public:
      * @param[in] img_size      (Optional) Image size.
      * @param[in] steps         (Optional) Step values.
      */
-    PriorBoxLayerInfo(const std::vector<float> &min_sizes, const std::vector<float> &variances, float offset, bool flip = true, bool clip = false,
-                      const std::vector<float> &max_sizes = {}, const std::vector<float> &aspect_ratios = {},
-    const Coordinates2D &img_size = Coordinates2D{ 0, 0 }, const std::array<float, 2> &steps = { { 0.f, 0.f } })
+    PriorBoxLayerInfo(const std::vector<float>   &min_sizes,
+                      const std::vector<float>   &variances,
+                      float                       offset,
+                      bool                        flip          = true,
+                      bool                        clip          = false,
+                      const std::vector<float>   &max_sizes     = {},
+                      const std::vector<float>   &aspect_ratios = {},
+                      const Coordinates2D        &img_size      = Coordinates2D{0, 0},
+                      const std::array<float, 2> &steps         = {{0.f, 0.f}})
         : _min_sizes(min_sizes),
           _variances(variances),
           _offset(offset),
@@ -845,22 +692,22 @@ public:
           _steps(steps)
     {
         _aspect_ratios.push_back(1.);
-        for(unsigned int i = 0; i < aspect_ratios.size(); ++i)
+        for (unsigned int i = 0; i < aspect_ratios.size(); ++i)
         {
             float ar            = aspect_ratios[i];
             bool  already_exist = false;
-            for(auto ar_new : _aspect_ratios)
+            for (auto ar_new : _aspect_ratios)
             {
-                if(fabs(ar - ar_new) < 1e-6)
+                if (fabs(ar - ar_new) < 1e-6)
                 {
                     already_exist = true;
                     break;
                 }
             }
-            if(!already_exist)
+            if (!already_exist)
             {
                 _aspect_ratios.push_back(ar);
-                if(flip)
+                if (flip)
                 {
                     _aspect_ratios.push_back(1.f / ar);
                 }
@@ -914,14 +761,14 @@ public:
     }
 
 private:
-    std::vector<float> _min_sizes;
-    std::vector<float> _variances;
-    float              _offset;
-    bool               _flip;
-    bool               _clip;
-    std::vector<float> _max_sizes;
-    std::vector<float> _aspect_ratios;
-    Coordinates2D      _img_size;
+    std::vector<float>   _min_sizes;
+    std::vector<float>   _variances;
+    float                _offset;
+    bool                 _flip;
+    bool                 _clip;
+    std::vector<float>   _max_sizes;
+    std::vector<float>   _aspect_ratios;
+    Coordinates2D        _img_size;
     std::array<float, 2> _steps;
 };
 
@@ -972,8 +819,16 @@ public:
      * @param[in] variance_encoded_in_target (Optional) If true, variance is encoded in target. Otherwise we need to adjust the predicted offset accordingly.Default set to false.
      * @param[in] eta                        (Optional) Eta.
      */
-    DetectionOutputLayerInfo(int num_classes, bool share_location, DetectionOutputLayerCodeType code_type, int keep_top_k, float nms_threshold, int top_k = -1, int background_label_id = -1,
-                             float confidence_threshold = std::numeric_limits<float>::lowest(), bool variance_encoded_in_target = false, float eta = 1)
+    DetectionOutputLayerInfo(int                          num_classes,
+                             bool                         share_location,
+                             DetectionOutputLayerCodeType code_type,
+                             int                          keep_top_k,
+                             float                        nms_threshold,
+                             int                          top_k                = -1,
+                             int                          background_label_id  = -1,
+                             float                        confidence_threshold = std::numeric_limits<float>::lowest(),
+                             bool                         variance_encoded_in_target = false,
+                             float                        eta                        = 1)
         : _num_classes(num_classes),
           _share_location(share_location),
           _code_type(code_type),
@@ -1087,8 +942,15 @@ public:
      * @param[in] detection_per_class       (Optional) Number of detection per class. Used in the Regular Non-Max-Suppression. Defaults to 100.
      * @param[in] dequantize_scores         (Optional) If the scores need to be dequantized. Defaults to true.
      */
-    DetectionPostProcessLayerInfo(unsigned int max_detections, unsigned int max_classes_per_detection, float nms_score_threshold, float iou_threshold, unsigned int num_classes,
-                                  std::array<float, 4> scales_values, bool use_regular_nms = false, unsigned int detection_per_class = 100, bool dequantize_scores = true)
+    DetectionPostProcessLayerInfo(unsigned int         max_detections,
+                                  unsigned int         max_classes_per_detection,
+                                  float                nms_score_threshold,
+                                  float                iou_threshold,
+                                  unsigned int         num_classes,
+                                  std::array<float, 4> scales_values,
+                                  bool                 use_regular_nms     = false,
+                                  unsigned int         detection_per_class = 100,
+                                  bool                 dequantize_scores   = true)
         : _max_detections(max_detections),
           _max_classes_per_detection(max_classes_per_detection),
           _nms_score_threshold(nms_score_threshold),
@@ -1166,15 +1028,15 @@ public:
     }
 
 private:
-    unsigned int _max_detections;
-    unsigned int _max_classes_per_detection;
-    float        _nms_score_threshold;
-    float        _iou_threshold;
-    unsigned int _num_classes;
+    unsigned int         _max_detections;
+    unsigned int         _max_classes_per_detection;
+    float                _nms_score_threshold;
+    float                _iou_threshold;
+    unsigned int         _num_classes;
     std::array<float, 4> _scales_values;
-    bool         _use_regular_nms;
-    unsigned int _detection_per_class;
-    bool         _dequantize_scores;
+    bool                 _use_regular_nms;
+    unsigned int         _detection_per_class;
+    bool                 _dequantize_scores;
 };
 
 /** Pooling Layer Information struct*/
@@ -1188,7 +1050,9 @@ struct PoolingLayerInfo
           pad_stride_info(PadStrideInfo()),
           exclude_padding(false),
           is_global_pooling(false),
-          fp_mixed_precision(false)
+          fp_mixed_precision(false),
+          use_inf_as_limit(true),
+          use_kernel_indices(false)
     {
     }
     /** Constructor
@@ -1201,20 +1065,26 @@ struct PoolingLayerInfo
      *                               True will exclude padding while false will not (Used in AVG/L2 pooling to determine the pooling area).
      *                               Defaults to false;
      * @param[in] fp_mixed_precision (Optional) Use wider accumulators (32 bit instead of 16 for FP16) to improve accuracy.
+     * @param[in] use_inf_as_limit   (Optional) Use inf to represent the limits of the data type's range, instead of using the "lowest" property of the data type. Defaults to true.
+     * @param[in] use_kernel_indices (Optional) Use kernel indices instead of source indices while computing the indices tensor. Defaults to false.
      */
     explicit PoolingLayerInfo(PoolingType   pool_type,
                               unsigned int  pool_size,
                               DataLayout    data_layout,
                               PadStrideInfo pad_stride_info    = PadStrideInfo(),
                               bool          exclude_padding    = false,
-                              bool          fp_mixed_precision = false)
+                              bool          fp_mixed_precision = false,
+                              bool          use_inf_as_limit   = true,
+                              bool          use_kernel_indices = false)
         : pool_type(pool_type),
           pool_size(Size2D(pool_size, pool_size)),
           data_layout(data_layout),
           pad_stride_info(pad_stride_info),
           exclude_padding(exclude_padding),
           is_global_pooling(false),
-          fp_mixed_precision(fp_mixed_precision)
+          fp_mixed_precision(fp_mixed_precision),
+          use_inf_as_limit(use_inf_as_limit),
+          use_kernel_indices(use_kernel_indices)
     {
     }
 
@@ -1228,20 +1098,26 @@ struct PoolingLayerInfo
      *                               True will exclude padding while false will not (Used in AVG/L2 pooling to determine the pooling area).
      *                               Defaults to false;
      * @param[in] fp_mixed_precision (Optional) Use wider accumulators (32 bit instead of 16 for FP16) to improve accuracy.
+     * @param[in] use_inf_as_limit   (Optional) Use inf to represent the limits of the data type's range, instead of using the "lowest" property of the data type. Defaults to true.
+     * @param[in] use_kernel_indices (Optional) Use kernel indices instead of source indices while computing the indices tensor. Defaults to false.
      */
     explicit PoolingLayerInfo(PoolingType   pool_type,
                               Size2D        pool_size,
                               DataLayout    data_layout,
                               PadStrideInfo pad_stride_info    = PadStrideInfo(),
                               bool          exclude_padding    = false,
-                              bool          fp_mixed_precision = false)
+                              bool          fp_mixed_precision = false,
+                              bool          use_inf_as_limit   = true,
+                              bool          use_kernel_indices = false)
         : pool_type(pool_type),
           pool_size(pool_size),
           data_layout(data_layout),
           pad_stride_info(pad_stride_info),
           exclude_padding(exclude_padding),
           is_global_pooling(false),
-          fp_mixed_precision(fp_mixed_precision)
+          fp_mixed_precision(fp_mixed_precision),
+          use_inf_as_limit(use_inf_as_limit),
+          use_kernel_indices(use_kernel_indices)
     {
     }
 
@@ -1259,7 +1135,9 @@ struct PoolingLayerInfo
           pad_stride_info(PadStrideInfo(1, 1, 0, 0)),
           exclude_padding(false),
           is_global_pooling(true),
-          fp_mixed_precision(false)
+          fp_mixed_precision(false),
+          use_inf_as_limit(true),
+          use_kernel_indices(false)
     {
     }
 
@@ -1270,6 +1148,111 @@ struct PoolingLayerInfo
     bool          exclude_padding;
     bool          is_global_pooling;
     bool          fp_mixed_precision;
+    bool          use_inf_as_limit;
+    bool          use_kernel_indices;
+};
+
+/** Pooling 3D Layer Information struct */
+struct Pooling3dLayerInfo
+{
+    /** Default Constructor */
+    Pooling3dLayerInfo() noexcept
+        : pool_type(PoolingType::MAX),
+          pool_size(Size3D()),
+          stride(Size3D()),
+          padding(Padding3D()),
+          exclude_padding(false),
+          is_global_pooling(false),
+          fp_mixed_precision(false),
+          round_type(DimensionRoundingType::FLOOR)
+    {
+    }
+    /** Constructor
+     *
+     * @param[in] pool_type          Pooling type @ref PoolingType.
+     * @param[in] pool_size          Pooling size, in elements, across x, y and z.
+     * @param[in] stride             (Optional) stride information @ref Size3D
+     * @param[in] padding            (Optional) padding information @ref Padding3D
+     * @param[in] exclude_padding    (Optional) Strategy when accounting padding in calculations.
+     *                               True will exclude padding while false will not (Used in AVG/L2 pooling to determine the pooling area).
+     *                               Defaults to false;
+     * @param[in] fp_mixed_precision (Optional) Use wider accumulators (32 bit instead of 16 for FP16) to improve accuracy.
+     * @param[in] round_type         (Optional) Dimensions rounding. Defaults to @ref DimensionRoundingType::FLOOR
+     */
+    explicit Pooling3dLayerInfo(PoolingType           pool_type,
+                                unsigned int          pool_size,
+                                Size3D                stride             = Size3D(1U, 1U, 1U),
+                                Padding3D             padding            = Padding3D(),
+                                bool                  exclude_padding    = false,
+                                bool                  fp_mixed_precision = false,
+                                DimensionRoundingType round_type         = DimensionRoundingType::FLOOR)
+        : pool_type(pool_type),
+          pool_size(Size3D(pool_size, pool_size, pool_size)),
+          stride(stride),
+          padding(padding),
+          exclude_padding(exclude_padding),
+          is_global_pooling(false),
+          fp_mixed_precision(fp_mixed_precision),
+          round_type(round_type)
+    {
+    }
+
+    /** Constructor
+     *
+     * @param[in] pool_type          Pooling type @ref PoolingType.
+     * @param[in] pool_size          Pooling size, in elements, across x, y and z.
+     * @param[in] stride             (Optional) stride information @ref Size3D
+     * @param[in] padding            (Optional) padding information @ref Padding3D
+     * @param[in] exclude_padding    (Optional) Strategy when accounting padding in calculations.
+     *                               True will exclude padding while false will not (Used in AVG/L2 pooling to determine the pooling area).
+     *                               Defaults to false;
+     * @param[in] fp_mixed_precision (Optional) Use wider accumulators (32 bit instead of 16 for FP16) to improve accuracy.
+     * @param[in] round_type         (Optional) Dimensions rounding. Defaults to @ref DimensionRoundingType::FLOOR
+     */
+    explicit Pooling3dLayerInfo(PoolingType           pool_type,
+                                Size3D                pool_size,
+                                Size3D                stride             = Size3D(1U, 1U, 1U),
+                                Padding3D             padding            = Padding3D(),
+                                bool                  exclude_padding    = false,
+                                bool                  fp_mixed_precision = false,
+                                DimensionRoundingType round_type         = DimensionRoundingType::FLOOR)
+        : pool_type(pool_type),
+          pool_size(pool_size),
+          stride(stride),
+          padding(padding),
+          exclude_padding(exclude_padding),
+          is_global_pooling(false),
+          fp_mixed_precision(fp_mixed_precision),
+          round_type(round_type)
+    {
+    }
+
+    /** Constructor
+     *
+     * @note This constructor is used for global pooling
+     *
+     * @param[in] pool_type Pooling type @ref PoolingType.
+     */
+    explicit Pooling3dLayerInfo(PoolingType pool_type)
+        : pool_type(pool_type),
+          pool_size(Size3D()),
+          stride(Size3D(1U, 1U, 1U)),
+          padding(Padding3D(0, 0, 0)),
+          exclude_padding(false),
+          is_global_pooling(true),
+          fp_mixed_precision(false),
+          round_type(DimensionRoundingType::FLOOR)
+    {
+    }
+
+    PoolingType           pool_type;
+    Size3D                pool_size;
+    Size3D                stride;
+    Padding3D             padding;
+    bool                  exclude_padding;
+    bool                  is_global_pooling;
+    bool                  fp_mixed_precision;
+    DimensionRoundingType round_type;
 };
 
 /** ROI Pooling Layer Information class */
@@ -1283,8 +1266,14 @@ public:
      * @param[in] spatial_scale  Spatial scale to be applied to the ROI coordinates and dimensions.
      * @param[in] sampling_ratio Number of samples to include in each pooling region (if set to zero, a ceil(roi_dims/pooling_dims))
      */
-    ROIPoolingLayerInfo(unsigned int pooled_width, unsigned int pooled_height, float spatial_scale, unsigned int sampling_ratio = 0)
-        : _pooled_width(pooled_width), _pooled_height(pooled_height), _spatial_scale(spatial_scale), _sampling_ratio(sampling_ratio)
+    ROIPoolingLayerInfo(unsigned int pooled_width,
+                        unsigned int pooled_height,
+                        float        spatial_scale,
+                        unsigned int sampling_ratio = 0)
+        : _pooled_width(pooled_width),
+          _pooled_height(pooled_height),
+          _spatial_scale(spatial_scale),
+          _sampling_ratio(sampling_ratio)
     {
     }
     /** Get the pooled width of the layer */
@@ -1331,10 +1320,24 @@ public:
      * @param[in] min_size       (Optional)Size used to validate the anchors produced. Defaults to 16.
      * @param[in] values_per_roi (Optional)Values used to represent a ROI(Region of interest). Defaults to 4.
      */
-    GenerateProposalsInfo(float im_width, float im_height, float im_scale, float spatial_scale = 1.0, int pre_nms_topN = 6000, int post_nms_topN = 300, float nms_thres = 0.7, float min_size = 16.0,
+    GenerateProposalsInfo(float  im_width,
+                          float  im_height,
+                          float  im_scale,
+                          float  spatial_scale  = 1.0,
+                          int    pre_nms_topN   = 6000,
+                          int    post_nms_topN  = 300,
+                          float  nms_thres      = 0.7,
+                          float  min_size       = 16.0,
                           size_t values_per_roi = 4)
-        : _im_height(im_height), _im_width(im_width), _im_scale(im_scale), _spatial_scale(spatial_scale), _pre_nms_topN(pre_nms_topN), _post_nms_topN(post_nms_topN), _nms_thres(nms_thres),
-          _min_size(min_size), _values_per_roi(values_per_roi)
+        : _im_height(im_height),
+          _im_width(im_width),
+          _im_scale(im_scale),
+          _spatial_scale(spatial_scale),
+          _pre_nms_topN(pre_nms_topN),
+          _post_nms_topN(post_nms_topN),
+          _nms_thres(nms_thres),
+          _min_size(min_size),
+          _values_per_roi(values_per_roi)
     {
     }
 
@@ -1460,11 +1463,20 @@ public:
      * @param[in] correct_transform_coords (Optional)Correct bounding box transform coordinates. Defaults to false
      * @param[in] bbox_xform_clip          (Optional)Minimum bounding box width and height after bounding box transformation in log-space. Defaults to log(1000/16)
      */
-    BoundingBoxTransformInfo(float img_width, float img_height, float scale, bool apply_scale = false, const std::array<float, 4> weights = { { 1.f, 1.f, 1.f, 1.f } }, bool correct_transform_coords =
-    false,
-    float bbox_xform_clip =
-        4.135166556742356f)
-        : _img_width(img_width), _img_height(img_height), _scale(scale), _apply_scale(apply_scale), _correct_transform_coords(correct_transform_coords), _weights(weights), _bbox_xform_clip(bbox_xform_clip)
+    BoundingBoxTransformInfo(float                      img_width,
+                             float                      img_height,
+                             float                      scale,
+                             bool                       apply_scale              = false,
+                             const std::array<float, 4> weights                  = {{1.f, 1.f, 1.f, 1.f}},
+                             bool                       correct_transform_coords = false,
+                             float                      bbox_xform_clip          = 4.135166556742356f)
+        : _img_width(img_width),
+          _img_height(img_height),
+          _scale(scale),
+          _apply_scale(apply_scale),
+          _correct_transform_coords(correct_transform_coords),
+          _weights(weights),
+          _bbox_xform_clip(bbox_xform_clip)
     {
     }
 
@@ -1504,110 +1516,13 @@ public:
     }
 
 private:
-    float _img_width;
-    float _img_height;
-    float _scale;
-    bool  _apply_scale;
-    bool  _correct_transform_coords;
+    float                _img_width;
+    float                _img_height;
+    float                _scale;
+    bool                 _apply_scale;
+    bool                 _correct_transform_coords;
     std::array<float, 4> _weights;
-    float _bbox_xform_clip;
-};
-
-/** Activation Layer Information class */
-class ActivationLayerInfo
-{
-public:
-    /** Available activation functions */
-    enum class ActivationFunction
-    {
-        LOGISTIC,        /**< Logistic ( \f$ f(x) = \frac{1}{1 + e^{-x}} \f$ ) */
-        TANH,            /**< Hyperbolic tangent ( \f$ f(x) = a \cdot tanh(b \cdot x) \f$ ) */
-        RELU,            /**< Rectifier ( \f$ f(x) = max(0,x) \f$ ) */
-        BOUNDED_RELU,    /**< Upper Bounded Rectifier ( \f$ f(x) = min(a, max(0,x)) \f$ ) */
-        LU_BOUNDED_RELU, /**< Lower and Upper Bounded Rectifier ( \f$ f(x) = min(a, max(b,x)) \f$ ) */
-        LEAKY_RELU,      /**< Leaky Rectifier ( \f$ f(x) = \begin{cases}  \alpha x & \quad \text{if } x \text{ < 0}\\  x & \quad \text{if } x \geq \text{ 0 } \end{cases} \f$ ) */
-        SOFT_RELU,       /**< Soft Rectifier ( \f$ f(x)= log(1+e^x) \f$ ) */
-        ELU,             /**< Exponential Linear Unit ( \f$ f(x) = \begin{cases}  \alpha (exp(x) - 1) & \quad \text{if } x \text{ < 0}\\  x & \quad \text{if } x \geq \text{ 0 } \end{cases} \f$ ) */
-        ABS,             /**< Absolute ( \f$ f(x)= |x| \f$ ) */
-        SQUARE,          /**< Square ( \f$ f(x)= x^2 \f$ )*/
-        SQRT,            /**< Square root ( \f$ f(x) = \sqrt{x} \f$ )*/
-        LINEAR,          /**< Linear ( \f$ f(x)= ax + b \f$ ) */
-        IDENTITY,        /**< Identity ( \f$ f(x)= x \f$ ) */
-        HARD_SWISH       /**< Hard-swish ( \f$ f(x) = (x * relu6(x+3))/6 \f$ ) */
-    };
-
-    ActivationLayerInfo() = default;
-    /** Default Constructor
-     *
-     * @param[in] f The activation function to use.
-     * @param[in] a (Optional) The alpha parameter used by some activation functions
-     *              (@ref ActivationFunction::BOUNDED_RELU, @ref ActivationFunction::LU_BOUNDED_RELU, @ref ActivationFunction::LINEAR, @ref ActivationFunction::TANH).
-     * @param[in] b (Optional) The beta parameter used by some activation functions (@ref ActivationFunction::LINEAR, @ref ActivationFunction::LU_BOUNDED_RELU, @ref ActivationFunction::TANH).
-     */
-    ActivationLayerInfo(ActivationFunction f, float a = 0.0f, float b = 0.0f)
-        : _act(f), _a(a), _b(b), _enabled(true)
-    {
-    }
-    /** Get the type of activation function */
-    ActivationFunction activation() const
-    {
-        return _act;
-    }
-    /** Get the alpha value */
-    float a() const
-    {
-        return _a;
-    }
-    /** Get the beta value */
-    float b() const
-    {
-        return _b;
-    }
-    /** Check if initialised */
-    bool enabled() const
-    {
-        return _enabled;
-    }
-
-private:
-    ActivationFunction _act     = { ActivationLayerInfo::ActivationFunction::IDENTITY };
-    float              _a       = {};
-    float              _b       = {};
-    bool               _enabled = { false };
-};
-
-/** Fully connected layer info */
-struct FullyConnectedLayerInfo
-{
-    DataLayout          weights_trained_layout{ DataLayout::NCHW }; /**<  Layout that the weights have been trained with. */
-    bool                transpose_weights{ true };                  /**<  Transpose weights if true. */
-    bool                are_weights_reshaped{ false };              /**<  Reshape the weights tensor if false. */
-    bool                retain_internal_weights{ false };           /**<  Retain internal reshaped weights. */
-    bool                fp_mixed_precision{ false };                /**<  Use wider accumulators (32 bit instead of 16 for FP16) to improve accuracy. */
-    ActivationLayerInfo activation_info{};                          /**<  Fused activation to apply after the matrix multiplication. */
-
-    /** Sets the weights trained data layout
-     *
-     * @param[in] layout Data layout that the weights were trained with
-     *
-     * @return Updated object
-     */
-    FullyConnectedLayerInfo &set_weights_trained_layout(DataLayout layout)
-    {
-        weights_trained_layout = layout;
-        return *this;
-    }
-    /** Sets the transpose weights flag
-     *
-     * @param[in] should_transpose_weights Boolean flag indicating if weights should be transposed
-     *
-     * @return Updated object
-     */
-    FullyConnectedLayerInfo &set_transpose_weights(bool should_transpose_weights)
-    {
-        transpose_weights = should_transpose_weights;
-        return *this;
-    }
+    float                _bbox_xform_clip;
 };
 
 /** Normalization Layer Information class */
@@ -1624,7 +1539,12 @@ public:
      * @param[in] is_scaled (Optional) Boolean that specifies if alpha will be scaled by the normalization size or not.
      *                      Should be false to follow [Krichevksy 2012].
      */
-    NormalizationLayerInfo(NormType type, uint32_t norm_size = 5, float alpha = 0.0001f, float beta = 0.5f, float kappa = 1.f, bool is_scaled = true)
+    NormalizationLayerInfo(NormType type,
+                           uint32_t norm_size = 5,
+                           float    alpha     = 0.0001f,
+                           float    beta      = 0.5f,
+                           float    kappa     = 1.f,
+                           bool     is_scaled = true)
         : _type(type), _norm_size(norm_size), _alpha(alpha), _beta(beta), _kappa(kappa), _is_scaled(is_scaled)
     {
     }
@@ -1690,13 +1610,74 @@ private:
     bool     _is_scaled;
 };
 
+class StridedSliceLayerInfo
+{
+public:
+    /** Default Constructor
+     *
+     * @param[in] begin_mask       (Optional) If the ith bit of begin_mask is set, starts[i] is ignored and the fullest possible range in that dimension is used instead.
+     * @param[in] end_mask         (Optional) If the ith bit of end_mask is set, ends[i] is ignored and the fullest possible range in that dimension is used instead.
+     * @param[in] shrink_axis_mask (Optional) If the ith bit of shrink_axis_mask is set, it implies that the ith specification shrinks the dimensionality by 1.
+     */
+    StridedSliceLayerInfo(int32_t begin_mask = 0, int32_t end_mask = 0, int32_t shrink_axis_mask = 0)
+        : _begin_mask(begin_mask), _end_mask(end_mask), _shrink_axis_mask(shrink_axis_mask)
+    {
+    }
+
+    /* Get the begin mask value */
+    int32_t begin_mask() const
+    {
+        return _begin_mask;
+    }
+
+    /* Get the end mask value */
+    int32_t end_mask() const
+    {
+        return _end_mask;
+    }
+
+    /* Get the shrink axis mask value */
+    int32_t shrink_axis_mask() const
+    {
+        return _shrink_axis_mask;
+    }
+
+private:
+    int32_t _begin_mask;
+    int32_t _end_mask;
+    int32_t _shrink_axis_mask;
+};
+
+// OHWIo<interleave_by>i<block_by>
+inline int interleave_by(const WeightFormat wf)
+{
+    return (static_cast<int>(wf) >> 8) & 0xFFF;
+}
+inline int block_by(const WeightFormat wf)
+{
+    return (static_cast<int>(wf) >> 20) & 0xF;
+}
+inline bool is_fixed_format(const WeightFormat &wf)
+{
+    return wf != WeightFormat::UNSPECIFIED && wf != WeightFormat::ANY;
+}
+inline bool is_fixed_format_fast_math(const WeightFormat &wf)
+{
+    return (static_cast<int>(wf) >> 4) & 0x1;
+}
+
 /** Convolution Layer Weights Information class. This class stores the necessary information to compute convolution layer when the weights are already reshaped */
 class WeightsInfo
 {
 public:
     /** Default constructor */
     WeightsInfo()
-        : _are_reshaped(false), _kernel_width(0), _kernel_height(0), _num_kernels(0), _retain_internal_weights(false)
+        : _are_reshaped(false),
+          _kernel_width(0),
+          _kernel_height(0),
+          _num_kernels(0),
+          _retain_internal_weights(false),
+          _weight_format(arm_compute::WeightFormat::UNSPECIFIED)
     {
     }
     /** Constructor
@@ -1706,9 +1687,20 @@ public:
      * @param[in] kernel_height           Kernel height.
      * @param[in] num_kernels             Number of convolution kernels.
      * @param[in] retain_internal_weights (Optional) True if internal reshaped weights must be retained. Used for reconfiguration purposes. Default is false.
+     * @param[in] weight_format           (Optional) arm_compute::WeightFormat enumeration requested by the user. Default is arm_compute::WeightFormat::UNSPECIFIED.
      */
-    WeightsInfo(bool are_reshaped, unsigned int kernel_width, unsigned int kernel_height, unsigned int num_kernels, bool retain_internal_weights = false)
-        : _are_reshaped(are_reshaped), _kernel_width(kernel_width), _kernel_height(kernel_height), _num_kernels(num_kernels), _retain_internal_weights(retain_internal_weights)
+    WeightsInfo(bool                      are_reshaped,
+                unsigned int              kernel_width,
+                unsigned int              kernel_height,
+                unsigned int              num_kernels,
+                bool                      retain_internal_weights = false,
+                arm_compute::WeightFormat weight_format           = arm_compute::WeightFormat::UNSPECIFIED)
+        : _are_reshaped(are_reshaped),
+          _kernel_width(kernel_width),
+          _kernel_height(kernel_height),
+          _num_kernels(num_kernels),
+          _retain_internal_weights(retain_internal_weights),
+          _weight_format(weight_format)
     {
     }
     /** Flag which specifies if the weights tensor has been reshaped.
@@ -1739,22 +1731,40 @@ public:
     {
         return _retain_internal_weights;
     }
+    arm_compute::WeightFormat weight_format() const
+    {
+        return _weight_format;
+    }
+    void set_weight_format(arm_compute::WeightFormat weight_format)
+    {
+        _weight_format = weight_format;
+    }
+
+    unsigned int kernel_width() const
+    {
+        return _kernel_width;
+    }
+    unsigned int kernel_height() const
+    {
+        return _kernel_height;
+    }
 
 private:
-    const bool         _are_reshaped;
-    const unsigned int _kernel_width;
-    const unsigned int _kernel_height;
-    const unsigned int _num_kernels;
-    const bool         _retain_internal_weights;
+    bool                      _are_reshaped;
+    unsigned int              _kernel_width;
+    unsigned int              _kernel_height;
+    unsigned int              _num_kernels;
+    bool                      _retain_internal_weights;
+    arm_compute::WeightFormat _weight_format;
 };
 
 /** GEMM reshape information class. This class stores the necessary information about matrix A and matrix B reshape.
  *
- * The matrix A can only be reshaped through @ref CLGEMMReshapeLHSMatrixKernel or  @ref NEGEMMInterleave4x4Kernel or  @ref GCGEMMInterleave4x4Kernel
- * Note: Optionally just for @ref CLGEMMReshapeLHSMatrixKernel is it possible to set mult_interleave4x4_height, the multiplication factor for the height of the 4x4 interleaved block
+ * The matrix A can only be reshaped through @ref opencl::kernels::ClGemmReshapeLhsMatrixKernel or @ref cpu::kernels::CpuGemmInterleave4x4Kernel
+ * Note: Optionally, just for @ref opencl::kernels::ClGemmReshapeLhsMatrixKernel, it is possible to set mult_interleave4x4_height, the multiplication factor for the height of the 4x4 interleaved block
  *
- * The matrix B can only be reshaped through @ref CLGEMMReshapeRHSMatrixKernel or  @ref NEGEMMTranspose1xWKernel or  @ref GCGEMMTranspose1xWKernel
- * Note: Optionally just for @ref CLGEMMReshapeRHSMatrixKernel is it possible to set mult_transpose1xW_width, the multiplication factor for the width of the 1xW transposed block
+ * The matrix B can only be reshaped through @ref opencl::kernels::ClGemmReshapeRhsMatrixKernel or @ref cpu::kernels::CpuGemmTranspose1xWKernel
+ * Note: Optionally, just for @ref opencl::kernels::ClGemmReshapeRhsMatrixKernel, it is possible to set mult_transpose1xW_width, the multiplication factor for the width of the 1xW transposed block
  *
  */
 class GEMMReshapeInfo final
@@ -1762,7 +1772,14 @@ class GEMMReshapeInfo final
 public:
     /** Default constructor */
     GEMMReshapeInfo()
-        : _m(1), _n(1), _k(1), _mult_transpose1xW_width(1), _mult_interleave4x4_height(1), _depth_output_gemm3d(0), _reinterpret_input_as_3d(false), _broadcast_bias(false)
+        : _m(1),
+          _n(1),
+          _k(1),
+          _mult_transpose1xW_width(1),
+          _mult_interleave4x4_height(1),
+          _depth_output_gemm3d(0),
+          _reinterpret_input_as_3d(false),
+          _broadcast_bias(false)
     {
     }
     /** Constructor
@@ -1778,9 +1795,22 @@ public:
      *                                      to perform 1x1 convolutions with the NHWC data layout)
      * @param[in] broadcast_bias            (Optional) Broadcast the shape of the bias tensor from a vector to a matrix.
      */
-    GEMMReshapeInfo(int m, int n, int k, int mult_transpose1xW_width = 1, int mult_interleave4x4_height = 1, int depth_output_gemm3d = 0, bool reinterpret_input_as_3d = false, bool broadcast_bias = false)
-        : _m(m), _n(n), _k(k), _mult_transpose1xW_width(mult_transpose1xW_width), _mult_interleave4x4_height(mult_interleave4x4_height), _depth_output_gemm3d(depth_output_gemm3d),
-          _reinterpret_input_as_3d(reinterpret_input_as_3d), _broadcast_bias(broadcast_bias)
+    GEMMReshapeInfo(int  m,
+                    int  n,
+                    int  k,
+                    int  mult_transpose1xW_width   = 1,
+                    int  mult_interleave4x4_height = 1,
+                    int  depth_output_gemm3d       = 0,
+                    bool reinterpret_input_as_3d   = false,
+                    bool broadcast_bias            = false)
+        : _m(m),
+          _n(n),
+          _k(k),
+          _mult_transpose1xW_width(mult_transpose1xW_width),
+          _mult_interleave4x4_height(mult_interleave4x4_height),
+          _depth_output_gemm3d(depth_output_gemm3d),
+          _reinterpret_input_as_3d(reinterpret_input_as_3d),
+          _broadcast_bias(broadcast_bias)
     {
     }
     /** Number of matrix A rows
@@ -1852,45 +1882,14 @@ public:
     };
 
 private:
-    const int  _m;
-    const int  _n;
-    const int  _k;
-    const int  _mult_transpose1xW_width;
-    const int  _mult_interleave4x4_height;
-    const int  _depth_output_gemm3d;
-    const bool _reinterpret_input_as_3d;
-    const bool _broadcast_bias;
-};
-
-struct DepthwiseConvolutionReshapeInfo
-{
-    unsigned int c0{ 1 };            /**< Number of channels processed by the depth-wise convolution */
-    bool         transpose{ false }; /**< True if the block MxC0 (where M is the area of the filter i.e. KwxKh) has to be transposed */
-};
-
-/** GEMMLowp output stage type */
-enum class GEMMLowpOutputStageType
-{
-    NONE,                     /**< No quantization */
-    QUANTIZE_DOWN,            /**< Quantize using an integer multiplication */
-    QUANTIZE_DOWN_FIXEDPOINT, /**< Quantize using a fixed point multiplication */
-    QUANTIZE_DOWN_FLOAT       /**< Quantize using a floating point multiplication */
-};
-
-/** GEMMLowp output stage info */
-struct GEMMLowpOutputStageInfo
-{
-    GEMMLowpOutputStageType type{ GEMMLowpOutputStageType::NONE };                        /**< GEMMLowp output stage type */
-    int32_t                 gemmlowp_offset{ 0 };                                         /**< GEMMLowp output stage offset used for quantizing to QASYMM8 */
-    int32_t                 gemmlowp_multiplier{ 0 };                                     /**< GEMMLowp output stage multiplier used for quantizing to QASYMM8 */
-    int32_t                 gemmlowp_shift{ 0 };                                          /**< GEMMLowp output stage shift used for quantizing to uint8 */
-    int32_t                 gemmlowp_min_bound{ std::numeric_limits<int32_t>::lowest() }; /**< GEMMLowp min value used to saturate down the output result before converting back to QASYMM8 */
-    int32_t                 gemmlowp_max_bound{ std::numeric_limits<int32_t>::max() };    /**< GEMMLowp max value used to saturate down the output result before converting back to QASYMM8 */
-    std::vector<int32_t>    gemmlowp_multipliers{};                                       /**< GEMMLowp output stage multiplier used for quantizing to QASYMM8 */
-    std::vector<int32_t>    gemmlowp_shifts{};                                            /**< GEMMLowp output stage multiplier used for quantizing to QASYMM8 */
-    float                   gemmlowp_real_multiplier{ 0 };                                /**< GEMMLowp output stage real multiplier used for quantizing to QASYMM8 */
-    bool                    is_quantized_per_channel{ false };                            /**< GEMMLowp quantized per-channel flag */
-    DataType                output_data_type{ DataType::UNKNOWN };                        /**< Output tensor data type to use if the output is not initialized */
+    int  _m;
+    int  _n;
+    int  _k;
+    int  _mult_transpose1xW_width;
+    int  _mult_interleave4x4_height;
+    int  _depth_output_gemm3d;
+    bool _reinterpret_input_as_3d;
+    bool _broadcast_bias;
 };
 
 /** GEMM LHS (Left Hand Side) matrix information */
@@ -1901,211 +1900,31 @@ struct GEMMLHSMatrixInfo
         : m0(m), k0(k), v0(v), transpose(trans), interleave(inter)
     {
     }
-    unsigned int m0{ 1 };            /**< Number of rows processed by the matrix multiplication */
-    unsigned int k0{ 1 };            /**< Number of partial accumulations performed by the matrix multiplication */
-    unsigned int v0{ 1 };            /**< Number of vertical blocks of size (m0xk0) stored on the same output row */
-    bool         transpose{ true };  /**< True if the (m0xk0) block has to be transposed before been stored */
-    bool         interleave{ true }; /**< True if the v0 (m0xk0) blocks have to be interleaved in the output row */
+    unsigned int m0{1};            /**< Number of rows processed by the matrix multiplication */
+    unsigned int k0{1};            /**< Number of partial accumulations performed by the matrix multiplication */
+    unsigned int v0{1};            /**< Number of vertical blocks of size (m0xk0) stored on the same output row */
+    bool         transpose{true};  /**< True if the (m0xk0) block has to be transposed before being stored */
+    bool         interleave{true}; /**< True if the v0 (m0xk0) blocks have to be interleaved in the output row */
 };
 
 /** GEMM RHS (Right Hand Side) matrix information */
 struct GEMMRHSMatrixInfo
 {
     GEMMRHSMatrixInfo() = default;
-    GEMMRHSMatrixInfo(unsigned int n, unsigned int k, unsigned int h, bool trans, bool inter)
-        : n0(n), k0(k), h0(h), transpose(trans), interleave(inter)
+    GEMMRHSMatrixInfo(unsigned int n, unsigned int k, unsigned int h, bool trans, bool inter, bool export_to_cl_img)
+        : n0(n), k0(k), h0(h), transpose(trans), interleave(inter), export_to_cl_image(export_to_cl_img)
     {
     }
-    unsigned int n0{ 1 };                     /**< Number of columns processed by the matrix multiplication */
-    unsigned int k0{ 1 };                     /**< Number of partial accumulations performed by the matrix multiplication */
-    unsigned int h0{ 1 };                     /**< Number of horizontal blocks of size (k0xn0) stored on the same output row */
-    bool         transpose{ true };           /**< True if the (k0xn0) block has to be transposed before been stored */
-    bool         interleave{ true };          /**< True if the h0 (k0xn0) blocks have to be interleaved in the output row */
-    bool         export_to_cl_image{ false }; /**< True if the reshaped rhs has to be exported to cl_image. n0 must be equal to 4 */
+    unsigned int n0{1};            /**< Number of columns processed by the matrix multiplication */
+    unsigned int k0{1};            /**< Number of partial accumulations performed by the matrix multiplication */
+    unsigned int h0{1};            /**< Number of horizontal blocks of size (k0xn0) stored on the same output row */
+    bool         transpose{true};  /**< True if the (k0xn0) block has to be transposed before being stored */
+    bool         interleave{true}; /**< True if the h0 (k0xn0) blocks have to be interleaved in the output row */
+    bool         export_to_cl_image{
+        false}; /**< True if the reshaped rhs has to be exported to cl_image. n0 must be equal to 4 */
 };
 
-/** GEMM information class. This class stores the necessary information to compute GEMM functions
- *
- * This object also contains the information about how matrix A and matrix B have been reshaped
- *
- */
-class GEMMInfo
-{
-public:
-    /** Default constructor */
-    GEMMInfo() noexcept
-        : _is_a_reshaped(false),
-          _is_b_reshaped(false),
-          _reshape_b_only_on_first_run(true),
-          _depth_output_gemm3d(0),
-          _reinterpret_input_as_3d(false),
-          _retain_internal_weights(false),
-          _gemmlowp_output_stage(),
-          _fp_mixed_precision(false),
-          _broadcast_bias(false),
-          _pretranpose_B(true),
-          _activation_info()
-    {
-    }
-    /** Constructor
-     *
-     * @param[in] is_a_reshaped               True if the matrix A has been reshaped
-     * @param[in] is_b_reshaped               True if the matrix B has been reshaped
-     * @param[in] reshape_b_only_on_first_run Reshape matrix B only for the first run
-     * @param[in] depth_output_gemm3d         (Optional) Depth (third dimension) of the output tensor to be used with the GEMM3D kernel
-     *                                        If 0 the output will not be reinterpreted as 3D. Default 0
-     * @param[in] reinterpret_input_as_3d     (Optional) Reinterpret the input as 3D tensor. (i.e. this flag should be set to true when GEMM is used
-     *                                        to perform 1x1 convolutions with the NHWC data layout)
-     * @param[in] retain_internal_weights     (Optional) Retain the weights tensor from previous run
-     * @param[in] gemmlowp_output_stage       (Optional) GEMMLowp Output stage info
-     * @param[in] fp_mixed_precision          (Optional) Use wider accumulators (32 bit instead of 16 for FP16) to improve accuracy.
-     * @param[in] broadcast_bias              (Optional) Broadcast the shape of the bias tensor from a vector to a matrix.
-     * @param[in] activation_info             (Optional) Activation to apply after the matrix multiplication
-     */
-    GEMMInfo(bool is_a_reshaped, bool is_b_reshaped, bool reshape_b_only_on_first_run, int depth_output_gemm3d = 0, bool reinterpret_input_as_3d = false, bool retain_internal_weights = false,
-             GEMMLowpOutputStageInfo gemmlowp_output_stage = GEMMLowpOutputStageInfo(), bool fp_mixed_precision = false, bool broadcast_bias = false,
-             const ActivationLayerInfo &activation_info = ActivationLayerInfo()) noexcept
-        : _is_a_reshaped(is_a_reshaped),
-          _is_b_reshaped(is_b_reshaped),
-          _reshape_b_only_on_first_run(reshape_b_only_on_first_run),
-          _depth_output_gemm3d(depth_output_gemm3d),
-          _reinterpret_input_as_3d(reinterpret_input_as_3d),
-          _retain_internal_weights(retain_internal_weights),
-          _gemmlowp_output_stage(gemmlowp_output_stage),
-          _fp_mixed_precision(fp_mixed_precision),
-          _broadcast_bias(broadcast_bias),
-          _pretranpose_B(reshape_b_only_on_first_run),
-          _activation_info(activation_info)
-    {
-    }
-    /** Flag which specifies if the matrix A has been reshaped
-     *
-     * @return True if the matrix A has been reshaped
-     */
-    bool is_a_reshaped() const
-    {
-        return _is_a_reshaped;
-    };
-    /** Flag which specifies if the matrix B has been reshaped
-     *
-     * @return True if the matrix B has been reshaped
-     */
-    bool is_b_reshaped() const
-    {
-        return _is_b_reshaped;
-    };
-    /** Flag which specifies if the reshape of matrix B should executed only for the first
-     *
-     * @note This flag could be set to TRUE when GEMM is used to accelerate convolution layer
-     *
-     * @return True if the reshaped of matrix B happens only for the first run
-     */
-    bool reshape_b_only_on_first_run() const
-    {
-        return _reshape_b_only_on_first_run;
-    };
-    /** Depth of the output when GEMM output is reinterpreted as 3D tensor
-     *
-     * @return the depth of the output tensor
-     */
-    int depth_output_gemm3d() const
-    {
-        return _depth_output_gemm3d;
-    };
-    /** Flag which specifies if the input tensor has to be reinterpreted as 3D
-     *
-     * @return True if the input tensor has to be reinterpreted as 3D tensor
-     */
-    bool reinterpret_input_as_3d() const
-    {
-        return _reinterpret_input_as_3d;
-    };
-    /** Flag which specifies if the weights tensor has to be retained from previous run
-     *
-     * @return True if the weights tensor has to be retained
-     */
-    bool retain_internal_weights() const
-    {
-        return _retain_internal_weights;
-    };
-    /** GEMMLowp output stage
-     *
-     * @return the GEMMLowp output stage info
-     */
-    GEMMLowpOutputStageInfo gemmlowp_output_stage() const
-    {
-        return _gemmlowp_output_stage;
-    };
-    /** Sets GEMMLowp output stage
-     *
-     * @param[in] output_stage Output stage to set
-     */
-    void set_gemmlowp_output_stage(GEMMLowpOutputStageInfo &output_stage)
-    {
-        _gemmlowp_output_stage = output_stage;
-    };
-    /** Flag which specifies if a wider accumulator should be used.
-     *
-     * @return True if a wider accumulator has to be used
-     */
-    bool fp_mixed_precision() const
-    {
-        return _fp_mixed_precision;
-    };
-    /** Flag which specifies whether to broadcast the shape of the bias tensor.
-     *
-     * @return True if the shape of the bias tensor is to be broadcasted.
-     */
-    bool broadcast_bias() const
-    {
-        return _broadcast_bias;
-    };
-    /** Flag which specifies whether b should be pre-transposed if supported.
-     *
-     * @return True if b should be pre-transposed else false.
-     */
-    bool pretranpose_B() const
-    {
-        return _pretranpose_B;
-    };
-    /** Set pre-transpose b flag
-     *
-     * @param[in] flag Flag to set
-     */
-    void set_pretranpose_B(bool flag)
-    {
-        _pretranpose_B = flag;
-    }
-    /** Activation layer to apply after the matrix multiplication
-     *
-     * @return ActivationLayerInfo object
-     */
-    ActivationLayerInfo activation_info() const
-    {
-        return _activation_info;
-    }
-    /** Set activation layer info
-     *
-     * @param[in] activation_info ActivationLayerInfo object to set
-     */
-    void set_activation_info(const ActivationLayerInfo &activation_info)
-    {
-        _activation_info = activation_info;
-    }
-
-private:
-    bool                    _is_a_reshaped;
-    bool                    _is_b_reshaped;
-    bool                    _reshape_b_only_on_first_run;
-    int                     _depth_output_gemm3d;
-    bool                    _reinterpret_input_as_3d;
-    bool                    _retain_internal_weights;
-    GEMMLowpOutputStageInfo _gemmlowp_output_stage;
-    bool                    _fp_mixed_precision;
-    bool                    _broadcast_bias;
-    bool                    _pretranpose_B;
-    ActivationLayerInfo     _activation_info;
-};
+class ITensorInfo;
 
 /** Winograd information */
 struct WinogradInfo
@@ -2118,16 +1937,23 @@ struct WinogradInfo
      * @param[in] conv_info      Convolution info (Pads, strides)
      * @param[in] data_layout    Data layout to use for the output tensor once the convolution has been applied
      */
-    WinogradInfo(Size2D output_tile_sz, Size2D kernel_sz, Size2D input_dims, PadStrideInfo conv_info, DataLayout data_layout)
-        : output_tile_size(output_tile_sz), kernel_size(kernel_sz), input_dimensions(input_dims), convolution_info(conv_info), output_data_layout(data_layout)
-    {
-    }
-
-    Size2D        output_tile_size{};                     /**< Width and height of the output tile */
-    Size2D        kernel_size{};                          /**< Width and height of the kernel*/
-    Size2D        input_dimensions{};                     /**< Width and height of the input tensor before the convolution is applied */
-    PadStrideInfo convolution_info{};                     /**< Convolution info (Pads, strides,...) */
-    DataLayout    output_data_layout{ DataLayout::NCHW }; /**< Data layout to use for the output tensor once the convolution has been applied (NCHW or NHWC) */
+    WinogradInfo(
+        Size2D output_tile_sz, Size2D kernel_sz, Size2D input_dims, PadStrideInfo conv_info, DataLayout data_layout)
+        : output_tile_size(output_tile_sz),
+          kernel_size(kernel_sz),
+          input_dimensions(input_dims),
+          convolution_info(conv_info),
+          output_data_layout(data_layout)
+    {
+    }
+
+    Size2D        output_tile_size{}; /**< Width and height of the output tile */
+    Size2D        kernel_size{};      /**< Width and height of the kernel*/
+    Size2D        input_dimensions{}; /**< Width and height of the input tensor before the convolution is applied */
+    PadStrideInfo convolution_info{}; /**< Convolution info (Pads, strides,...) */
+    DataLayout    output_data_layout{
+        DataLayout::
+            NCHW}; /**< Data layout to use for the output tensor once the convolution has been applied (NCHW or NHWC) */
 };
 
 /** IO formatting information class*/
@@ -2186,5 +2012,8 @@ struct IOFormatInfo
     /** Align columns */
     bool align_columns;
 };
+
+/** Class for holding information related to cropping */
+using CropInfo = Padding2D;
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_TYPES_H */
+#endif // ACL_ARM_COMPUTE_CORE_TYPES_H
diff --git a/arm_compute/core/Utils.h b/arm_compute/core/Utils.h
index eff6157b1f..a2146522f7 100644
--- a/arm_compute/core/Utils.h
+++ b/arm_compute/core/Utils.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -26,63 +26,29 @@
 
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/PixelValue.h"
-#include "arm_compute/core/Rounding.h"
 #include "arm_compute/core/Types.h"
-#include "arm_compute/core/Version.h"
 
-#include <algorithm>
-#include <cstdint>
-#include <cstdlib>
-#include <iomanip>
+#include <cmath>
 #include <numeric>
 #include <sstream>
 #include <string>
 #include <type_traits>
+#include <unordered_map>
 #include <utility>
-#include <vector>
 
-namespace arm_compute
-{
-/** Calculate the rounded up quotient of val / m.
- *
- * @param[in] val Value to divide and round up.
- * @param[in] m   Value to divide by.
- *
- * @return the result.
- */
-template <typename S, typename T>
-constexpr auto DIV_CEIL(S val, T m) -> decltype((val + m - 1) / m)
-{
-    return (val + m - 1) / m;
-}
-
-/** Computes the smallest number larger or equal to value that is a multiple of divisor.
- *
- * @param[in] value   Lower bound value
- * @param[in] divisor Value to compute multiple of.
- *
- * @return the result.
- */
-template <typename S, typename T>
-inline auto ceil_to_multiple(S value, T divisor) -> decltype(((value + divisor - 1) / divisor) * divisor)
-{
-    ARM_COMPUTE_ERROR_ON(value < 0 || divisor <= 0);
-    return DIV_CEIL(value, divisor) * divisor;
-}
+/* Convenience / backwards compatibility includes */
+#include "arm_compute/core/utils/ActivationFunctionUtils.h"
+#include "arm_compute/core/utils/DataLayoutUtils.h"
+#include "arm_compute/core/utils/DataTypeUtils.h"
+#include "arm_compute/core/utils/FormatUtils.h"
+#include "arm_compute/core/utils/InterpolationPolicyUtils.h"
+#include "arm_compute/core/utils/StringUtils.h"
 
-/** Computes the largest number smaller or equal to value that is a multiple of divisor.
- *
- * @param[in] value   Upper bound value
- * @param[in] divisor Value to compute multiple of.
- *
- * @return the result.
- */
-template <typename S, typename T>
-inline auto floor_to_multiple(S value, T divisor) -> decltype((value / divisor) * divisor)
+namespace arm_compute
 {
-    ARM_COMPUTE_ERROR_ON(value < 0 || divisor <= 0);
-    return (value / divisor) * divisor;
-}
+class ITensor;
+class ITensorInfo;
+class ActivationLayerInfo;
 
 /** Load an entire file in memory
  *
@@ -93,814 +59,6 @@ inline auto floor_to_multiple(S value, T divisor) -> decltype((value / divisor)
  */
 std::string read_file(const std::string &filename, bool binary);
 
-/** The size in bytes of the data type
- *
- * @param[in] data_type Input data type
- *
- * @return The size in bytes of the data type
- */
-inline size_t data_size_from_type(DataType data_type)
-{
-    switch(data_type)
-    {
-        case DataType::U8:
-        case DataType::S8:
-        case DataType::QSYMM8:
-        case DataType::QASYMM8:
-        case DataType::QASYMM8_SIGNED:
-        case DataType::QSYMM8_PER_CHANNEL:
-            return 1;
-        case DataType::U16:
-        case DataType::S16:
-        case DataType::QSYMM16:
-        case DataType::QASYMM16:
-        case DataType::BFLOAT16:
-        case DataType::F16:
-            return 2;
-        case DataType::F32:
-        case DataType::U32:
-        case DataType::S32:
-            return 4;
-        case DataType::F64:
-        case DataType::U64:
-        case DataType::S64:
-            return 8;
-        case DataType::SIZET:
-            return sizeof(size_t);
-        default:
-            ARM_COMPUTE_ERROR("Invalid data type");
-            return 0;
-    }
-}
-
-/** The size in bytes of the pixel format
- *
- * @param[in] format Input format
- *
- * @return The size in bytes of the pixel format
- */
-inline size_t pixel_size_from_format(Format format)
-{
-    switch(format)
-    {
-        case Format::U8:
-            return 1;
-        case Format::U16:
-        case Format::S16:
-        case Format::BFLOAT16:
-        case Format::F16:
-        case Format::UV88:
-        case Format::YUYV422:
-        case Format::UYVY422:
-            return 2;
-        case Format::RGB888:
-            return 3;
-        case Format::RGBA8888:
-            return 4;
-        case Format::U32:
-        case Format::S32:
-        case Format::F32:
-            return 4;
-        //Doesn't make sense for planar formats:
-        case Format::NV12:
-        case Format::NV21:
-        case Format::IYUV:
-        case Format::YUV444:
-        default:
-            ARM_COMPUTE_ERROR("Undefined pixel size for given format");
-            return 0;
-    }
-}
-
-/** The size in bytes of the data type
- *
- * @param[in] dt Input data type
- *
- * @return The size in bytes of the data type
- */
-inline size_t element_size_from_data_type(DataType dt)
-{
-    switch(dt)
-    {
-        case DataType::S8:
-        case DataType::U8:
-        case DataType::QSYMM8:
-        case DataType::QASYMM8:
-        case DataType::QASYMM8_SIGNED:
-        case DataType::QSYMM8_PER_CHANNEL:
-            return 1;
-        case DataType::U16:
-        case DataType::S16:
-        case DataType::QSYMM16:
-        case DataType::QASYMM16:
-        case DataType::BFLOAT16:
-        case DataType::F16:
-            return 2;
-        case DataType::U32:
-        case DataType::S32:
-        case DataType::F32:
-            return 4;
-        default:
-            ARM_COMPUTE_ERROR("Undefined element size for given data type");
-            return 0;
-    }
-}
-
-/** Return the data type used by a given single-planar pixel format
- *
- * @param[in] format Input format
- *
- * @return The size in bytes of the pixel format
- */
-inline DataType data_type_from_format(Format format)
-{
-    switch(format)
-    {
-        case Format::U8:
-        case Format::UV88:
-        case Format::RGB888:
-        case Format::RGBA8888:
-        case Format::YUYV422:
-        case Format::UYVY422:
-            return DataType::U8;
-        case Format::U16:
-            return DataType::U16;
-        case Format::S16:
-            return DataType::S16;
-        case Format::U32:
-            return DataType::U32;
-        case Format::S32:
-            return DataType::S32;
-        case Format::BFLOAT16:
-            return DataType::BFLOAT16;
-        case Format::F16:
-            return DataType::F16;
-        case Format::F32:
-            return DataType::F32;
-        //Doesn't make sense for planar formats:
-        case Format::NV12:
-        case Format::NV21:
-        case Format::IYUV:
-        case Format::YUV444:
-        default:
-            ARM_COMPUTE_ERROR("Not supported data_type for given format");
-            return DataType::UNKNOWN;
-    }
-}
-
-/** Return the plane index of a given channel given an input format.
- *
- * @param[in] format  Input format
- * @param[in] channel Input channel
- *
- * @return The plane index of the specific channel of the specific format
- */
-inline int plane_idx_from_channel(Format format, Channel channel)
-{
-    switch(format)
-    {
-        // Single planar formats have a single plane
-        case Format::U8:
-        case Format::U16:
-        case Format::S16:
-        case Format::U32:
-        case Format::S32:
-        case Format::BFLOAT16:
-        case Format::F16:
-        case Format::F32:
-        case Format::UV88:
-        case Format::RGB888:
-        case Format::RGBA8888:
-        case Format::YUYV422:
-        case Format::UYVY422:
-            return 0;
-        // Multi planar formats
-        case Format::NV12:
-        case Format::NV21:
-        {
-            // Channel U and V share the same plane of format UV88
-            switch(channel)
-            {
-                case Channel::Y:
-                    return 0;
-                case Channel::U:
-                case Channel::V:
-                    return 1;
-                default:
-                    ARM_COMPUTE_ERROR("Not supported channel");
-                    return 0;
-            }
-        }
-        case Format::IYUV:
-        case Format::YUV444:
-        {
-            switch(channel)
-            {
-                case Channel::Y:
-                    return 0;
-                case Channel::U:
-                    return 1;
-                case Channel::V:
-                    return 2;
-                default:
-                    ARM_COMPUTE_ERROR("Not supported channel");
-                    return 0;
-            }
-        }
-        default:
-            ARM_COMPUTE_ERROR("Not supported format");
-            return 0;
-    }
-}
-
-/** Return the channel index of a given channel given an input format.
- *
- * @param[in] format  Input format
- * @param[in] channel Input channel
- *
- * @return The channel index of the specific channel of the specific format
- */
-inline int channel_idx_from_format(Format format, Channel channel)
-{
-    switch(format)
-    {
-        case Format::RGB888:
-        {
-            switch(channel)
-            {
-                case Channel::R:
-                    return 0;
-                case Channel::G:
-                    return 1;
-                case Channel::B:
-                    return 2;
-                default:
-                    ARM_COMPUTE_ERROR("Not supported channel");
-                    return 0;
-            }
-        }
-        case Format::RGBA8888:
-        {
-            switch(channel)
-            {
-                case Channel::R:
-                    return 0;
-                case Channel::G:
-                    return 1;
-                case Channel::B:
-                    return 2;
-                case Channel::A:
-                    return 3;
-                default:
-                    ARM_COMPUTE_ERROR("Not supported channel");
-                    return 0;
-            }
-        }
-        case Format::YUYV422:
-        {
-            switch(channel)
-            {
-                case Channel::Y:
-                    return 0;
-                case Channel::U:
-                    return 1;
-                case Channel::V:
-                    return 3;
-                default:
-                    ARM_COMPUTE_ERROR("Not supported channel");
-                    return 0;
-            }
-        }
-        case Format::UYVY422:
-        {
-            switch(channel)
-            {
-                case Channel::Y:
-                    return 1;
-                case Channel::U:
-                    return 0;
-                case Channel::V:
-                    return 2;
-                default:
-                    ARM_COMPUTE_ERROR("Not supported channel");
-                    return 0;
-            }
-        }
-        case Format::NV12:
-        {
-            switch(channel)
-            {
-                case Channel::Y:
-                    return 0;
-                case Channel::U:
-                    return 0;
-                case Channel::V:
-                    return 1;
-                default:
-                    ARM_COMPUTE_ERROR("Not supported channel");
-                    return 0;
-            }
-        }
-        case Format::NV21:
-        {
-            switch(channel)
-            {
-                case Channel::Y:
-                    return 0;
-                case Channel::U:
-                    return 1;
-                case Channel::V:
-                    return 0;
-                default:
-                    ARM_COMPUTE_ERROR("Not supported channel");
-                    return 0;
-            }
-        }
-        case Format::YUV444:
-        case Format::IYUV:
-        {
-            switch(channel)
-            {
-                case Channel::Y:
-                    return 0;
-                case Channel::U:
-                    return 0;
-                case Channel::V:
-                    return 0;
-                default:
-                    ARM_COMPUTE_ERROR("Not supported channel");
-                    return 0;
-            }
-        }
-        default:
-            ARM_COMPUTE_ERROR("Not supported format");
-            return 0;
-    }
-}
-
-/** Return the number of planes for a given format
- *
- * @param[in] format Input format
- *
- * @return The number of planes for a given image format.
- */
-inline size_t num_planes_from_format(Format format)
-{
-    switch(format)
-    {
-        case Format::U8:
-        case Format::S16:
-        case Format::U16:
-        case Format::S32:
-        case Format::U32:
-        case Format::BFLOAT16:
-        case Format::F16:
-        case Format::F32:
-        case Format::RGB888:
-        case Format::RGBA8888:
-        case Format::YUYV422:
-        case Format::UYVY422:
-            return 1;
-        case Format::NV12:
-        case Format::NV21:
-            return 2;
-        case Format::IYUV:
-        case Format::YUV444:
-            return 3;
-        default:
-            ARM_COMPUTE_ERROR("Not supported format");
-            return 0;
-    }
-}
-
-/** Return the number of channels for a given single-planar pixel format
- *
- * @param[in] format Input format
- *
- * @return The number of channels for a given image format.
- */
-inline size_t num_channels_from_format(Format format)
-{
-    switch(format)
-    {
-        case Format::U8:
-        case Format::U16:
-        case Format::S16:
-        case Format::U32:
-        case Format::S32:
-        case Format::BFLOAT16:
-        case Format::F16:
-        case Format::F32:
-            return 1;
-        // Because the U and V channels are subsampled
-        // these formats appear like having only 2 channels:
-        case Format::YUYV422:
-        case Format::UYVY422:
-            return 2;
-        case Format::UV88:
-            return 2;
-        case Format::RGB888:
-            return 3;
-        case Format::RGBA8888:
-            return 4;
-        //Doesn't make sense for planar formats:
-        case Format::NV12:
-        case Format::NV21:
-        case Format::IYUV:
-        case Format::YUV444:
-        default:
-            return 0;
-    }
-}
-
-/** Return the promoted data type of a given data type.
- *
- * @note If promoted data type is not supported an error will be thrown
- *
- * @param[in] dt Data type to get the promoted type of.
- *
- * @return Promoted data type
- */
-inline DataType get_promoted_data_type(DataType dt)
-{
-    switch(dt)
-    {
-        case DataType::U8:
-            return DataType::U16;
-        case DataType::S8:
-            return DataType::S16;
-        case DataType::U16:
-            return DataType::U32;
-        case DataType::S16:
-            return DataType::S32;
-        case DataType::QSYMM8:
-        case DataType::QASYMM8:
-        case DataType::QASYMM8_SIGNED:
-        case DataType::QSYMM8_PER_CHANNEL:
-        case DataType::QSYMM16:
-        case DataType::QASYMM16:
-        case DataType::BFLOAT16:
-        case DataType::F16:
-        case DataType::U32:
-        case DataType::S32:
-        case DataType::F32:
-            ARM_COMPUTE_ERROR("Unsupported data type promotions!");
-        default:
-            ARM_COMPUTE_ERROR("Undefined data type!");
-    }
-    return DataType::UNKNOWN;
-}
-
-/** Compute the mininum and maximum values a data type can take
- *
- * @param[in] dt Data type to get the min/max bounds of
- *
- * @return A tuple (min,max) with the minimum and maximum values respectively wrapped in PixelValue.
- */
-inline std::tuple<PixelValue, PixelValue> get_min_max(DataType dt)
-{
-    PixelValue min{};
-    PixelValue max{};
-    switch(dt)
-    {
-        case DataType::U8:
-        case DataType::QASYMM8:
-        {
-            min = PixelValue(static_cast<int32_t>(std::numeric_limits<uint8_t>::lowest()));
-            max = PixelValue(static_cast<int32_t>(std::numeric_limits<uint8_t>::max()));
-            break;
-        }
-        case DataType::S8:
-        case DataType::QSYMM8:
-        case DataType::QASYMM8_SIGNED:
-        case DataType::QSYMM8_PER_CHANNEL:
-        {
-            min = PixelValue(static_cast<int32_t>(std::numeric_limits<int8_t>::lowest()));
-            max = PixelValue(static_cast<int32_t>(std::numeric_limits<int8_t>::max()));
-            break;
-        }
-        case DataType::U16:
-        case DataType::QASYMM16:
-        {
-            min = PixelValue(static_cast<int32_t>(std::numeric_limits<uint16_t>::lowest()));
-            max = PixelValue(static_cast<int32_t>(std::numeric_limits<uint16_t>::max()));
-            break;
-        }
-        case DataType::S16:
-        case DataType::QSYMM16:
-        {
-            min = PixelValue(static_cast<int32_t>(std::numeric_limits<int16_t>::lowest()));
-            max = PixelValue(static_cast<int32_t>(std::numeric_limits<int16_t>::max()));
-            break;
-        }
-        case DataType::U32:
-        {
-            min = PixelValue(std::numeric_limits<uint32_t>::lowest());
-            max = PixelValue(std::numeric_limits<uint32_t>::max());
-            break;
-        }
-        case DataType::S32:
-        {
-            min = PixelValue(std::numeric_limits<int32_t>::lowest());
-            max = PixelValue(std::numeric_limits<int32_t>::max());
-            break;
-        }
-        case DataType::BFLOAT16:
-        {
-            min = PixelValue(bfloat16::lowest());
-            max = PixelValue(bfloat16::max());
-            break;
-        }
-        case DataType::F16:
-        {
-            min = PixelValue(std::numeric_limits<half>::lowest());
-            max = PixelValue(std::numeric_limits<half>::max());
-            break;
-        }
-        case DataType::F32:
-        {
-            min = PixelValue(std::numeric_limits<float>::lowest());
-            max = PixelValue(std::numeric_limits<float>::max());
-            break;
-        }
-        default:
-            ARM_COMPUTE_ERROR("Undefined data type!");
-    }
-    return std::make_tuple(min, max);
-}
-
-/** Return true if the given format has horizontal subsampling.
- *
- * @param[in] format Format to determine subsampling.
- *
- * @return True if the format can be subsampled horizontaly.
- */
-inline bool has_format_horizontal_subsampling(Format format)
-{
-    return (format == Format::YUYV422 || format == Format::UYVY422 || format == Format::NV12 || format == Format::NV21 || format == Format::IYUV || format == Format::UV88) ? true : false;
-}
-
-/** Return true if the given format has vertical subsampling.
- *
- * @param[in] format Format to determine subsampling.
- *
- * @return True if the format can be subsampled verticaly.
- */
-inline bool has_format_vertical_subsampling(Format format)
-{
-    return (format == Format::NV12 || format == Format::NV21 || format == Format::IYUV || format == Format::UV88) ? true : false;
-}
-
-/** Separate a 2D convolution into two 1D convolutions
- *
- * @param[in]  conv     2D convolution
- * @param[out] conv_col 1D vertical convolution
- * @param[out] conv_row 1D horizontal convolution
- * @param[in]  size     Size of the 2D convolution
- *
- * @return true if the separation was successful
- */
-inline bool separate_matrix(const int16_t *conv, int16_t *conv_col, int16_t *conv_row, uint8_t size)
-{
-    int32_t min_col     = -1;
-    int16_t min_col_val = -1;
-
-    for(int32_t i = 0; i < size; ++i)
-    {
-        if(conv[i] != 0 && (min_col < 0 || abs(min_col_val) > abs(conv[i])))
-        {
-            min_col     = i;
-            min_col_val = conv[i];
-        }
-    }
-
-    if(min_col < 0)
-    {
-        return false;
-    }
-
-    for(uint32_t j = 0; j < size; ++j)
-    {
-        conv_col[j] = conv[min_col + j * size];
-    }
-
-    for(uint32_t i = 0; i < size; i++)
-    {
-        if(static_cast<int>(i) == min_col)
-        {
-            conv_row[i] = 1;
-        }
-        else
-        {
-            int16_t coeff = conv[i] / conv[min_col];
-
-            for(uint32_t j = 1; j < size; ++j)
-            {
-                if(conv[i + j * size] != (conv_col[j] * coeff))
-                {
-                    return false;
-                }
-            }
-
-            conv_row[i] = coeff;
-        }
-    }
-
-    return true;
-}
-
-/** Calculate the scale of the given square matrix
- *
- * The scale is the absolute value of the sum of all the coefficients in the matrix.
- *
- * @note If the coefficients add up to 0 then the scale is set to 1.
- *
- * @param[in] matrix      Matrix coefficients
- * @param[in] matrix_size Number of elements per side of the square matrix. (Number of coefficients = matrix_size * matrix_size).
- *
- * @return The absolute value of the sum of the coefficients if they don't add up to 0, otherwise 1.
- */
-inline uint32_t calculate_matrix_scale(const int16_t *matrix, unsigned int matrix_size)
-{
-    const size_t size = matrix_size * matrix_size;
-
-    return std::max(1, std::abs(std::accumulate(matrix, matrix + size, 0)));
-}
-
-/** Adjust tensor shape size if width or height are odd for a given multi-planar format. No modification is done for other formats.
- *
- * @note Adding here a few links discussing the issue of odd size and sharing the same solution:
- *       <a href="https://android.googlesource.com/platform/frameworks/base/+/refs/heads/master/graphics/java/android/graphics/YuvImage.java">Android Source</a>
- *       <a href="https://groups.google.com/a/webmproject.org/forum/#!topic/webm-discuss/LaCKpqiDTXM">WebM</a>
- *       <a href="https://bugs.chromium.org/p/libyuv/issues/detail?id=198&amp;can=1&amp;q=odd%20width">libYUV</a>
- *       <a href="https://sourceforge.net/p/raw-yuvplayer/bugs/1/">YUVPlayer</a> *
- *
- * @param[in, out] shape  Tensor shape of 2D size
- * @param[in]      format Format of the tensor
- *
- * @return The adjusted tensor shape.
- */
-inline TensorShape adjust_odd_shape(const TensorShape &shape, Format format)
-{
-    TensorShape output{ shape };
-
-    // Force width to be even for formats which require subsampling of the U and V channels
-    if(has_format_horizontal_subsampling(format))
-    {
-        output.set(0, output.x() & ~1U);
-    }
-
-    // Force height to be even for formats which require subsampling of the U and V channels
-    if(has_format_vertical_subsampling(format))
-    {
-        output.set(1, output.y() & ~1U);
-    }
-
-    return output;
-}
-
-/** Calculate subsampled shape for a given format and channel
- *
- * @param[in] shape   Shape of the tensor to calculate the extracted channel.
- * @param[in] format  Format of the tensor.
- * @param[in] channel Channel to create tensor shape to be extracted.
- *
- * @return The subsampled tensor shape.
- */
-inline TensorShape calculate_subsampled_shape(const TensorShape &shape, Format format, Channel channel = Channel::UNKNOWN)
-{
-    TensorShape output{ shape };
-
-    // Subsample shape only for U or V channel
-    if(Channel::U == channel || Channel::V == channel || Channel::UNKNOWN == channel)
-    {
-        // Subsample width for the tensor shape when channel is U or V
-        if(has_format_horizontal_subsampling(format))
-        {
-            output.set(0, output.x() / 2U);
-        }
-
-        // Subsample height for the tensor shape when channel is U or V
-        if(has_format_vertical_subsampling(format))
-        {
-            output.set(1, output.y() / 2U);
-        }
-    }
-
-    return output;
-}
-
-/** Calculate accurary required by the horizontal and vertical convolution computations
- *
- * @param[in] conv_col Pointer to the vertical vector of the separated convolution filter
- * @param[in] conv_row Pointer to the horizontal vector of the convolution filter
- * @param[in] size     Number of elements per vector of the separated matrix
- *
- * @return The return type is a pair. The first element of the pair is the biggest data type needed for the first stage. The second
- * element of the pair is the biggest data type needed for the second stage.
- */
-inline std::pair<DataType, DataType> data_type_for_convolution(const int16_t *conv_col, const int16_t *conv_row, size_t size)
-{
-    DataType first_stage  = DataType::UNKNOWN;
-    DataType second_stage = DataType::UNKNOWN;
-
-    auto gez = [](const int16_t &v)
-    {
-        return v >= 0;
-    };
-
-    auto accu_neg = [](const int &first, const int &second)
-    {
-        return first + (second < 0 ? second : 0);
-    };
-
-    auto accu_pos = [](const int &first, const int &second)
-    {
-        return first + (second > 0 ? second : 0);
-    };
-
-    const bool only_positive_coefficients = std::all_of(conv_row, conv_row + size, gez) && std::all_of(conv_col, conv_col + size, gez);
-
-    if(only_positive_coefficients)
-    {
-        const int max_row_value = std::accumulate(conv_row, conv_row + size, 0) * UINT8_MAX;
-        const int max_value     = std::accumulate(conv_col, conv_col + size, 0) * max_row_value;
-
-        first_stage = (max_row_value <= UINT16_MAX) ? DataType::U16 : DataType::S32;
-
-        second_stage = (max_value <= UINT16_MAX) ? DataType::U16 : DataType::S32;
-    }
-    else
-    {
-        const int min_row_value  = std::accumulate(conv_row, conv_row + size, 0, accu_neg) * UINT8_MAX;
-        const int max_row_value  = std::accumulate(conv_row, conv_row + size, 0, accu_pos) * UINT8_MAX;
-        const int neg_coeffs_sum = std::accumulate(conv_col, conv_col + size, 0, accu_neg);
-        const int pos_coeffs_sum = std::accumulate(conv_col, conv_col + size, 0, accu_pos);
-        const int min_value      = neg_coeffs_sum * max_row_value + pos_coeffs_sum * min_row_value;
-        const int max_value      = neg_coeffs_sum * min_row_value + pos_coeffs_sum * max_row_value;
-
-        first_stage = ((INT16_MIN <= min_row_value) && (max_row_value <= INT16_MAX)) ? DataType::S16 : DataType::S32;
-
-        second_stage = ((INT16_MIN <= min_value) && (max_value <= INT16_MAX)) ? DataType::S16 : DataType::S32;
-    }
-
-    return std::make_pair(first_stage, second_stage);
-}
-
-/** Calculate the accuracy required by the squared convolution calculation.
- *
- *
- * @param[in] conv Pointer to the squared convolution matrix
- * @param[in] size The total size of the convolution matrix
- *
- * @return The return is the biggest data type needed to do the convolution
- */
-inline DataType data_type_for_convolution_matrix(const int16_t *conv, size_t size)
-{
-    auto gez = [](const int16_t v)
-    {
-        return v >= 0;
-    };
-
-    const bool only_positive_coefficients = std::all_of(conv, conv + size, gez);
-
-    if(only_positive_coefficients)
-    {
-        const int max_conv_value = std::accumulate(conv, conv + size, 0) * UINT8_MAX;
-        if(max_conv_value <= UINT16_MAX)
-        {
-            return DataType::U16;
-        }
-        else
-        {
-            return DataType::S32;
-        }
-    }
-    else
-    {
-        const int min_value = std::accumulate(conv, conv + size, 0, [](int a, int b)
-        {
-            return b < 0 ? a + b : a;
-        })
-        * UINT8_MAX;
-
-        const int max_value = std::accumulate(conv, conv + size, 0, [](int a, int b)
-        {
-            return b > 0 ? a + b : a;
-        })
-        * UINT8_MAX;
-
-        if((INT16_MIN <= min_value) && (INT16_MAX >= max_value))
-        {
-            return DataType::S16;
-        }
-        else
-        {
-            return DataType::S32;
-        }
-    }
-}
-
 /** Permutes the given dimensions according the permutation vector
  *
  * @param[in,out] dimensions Dimensions to be permuted.
@@ -911,7 +69,7 @@ template <typename T>
 inline void permute_strides(Dimensions<T> &dimensions, const PermutationVector &perm)
 {
     const auto old_dim = utility::make_array<Dimensions<T>::num_max_dimensions>(dimensions.begin(), dimensions.end());
-    for(unsigned int i = 0; i < perm.num_dimensions(); ++i)
+    for (unsigned int i = 0; i < perm.num_dimensions(); ++i)
     {
         T dimension_val = old_dim[i];
         dimensions.set(perm[i], dimension_val);
@@ -929,7 +87,11 @@ inline void permute_strides(Dimensions<T> &dimensions, const PermutationVector &
  *
  * @return PadStrideInfo for SAME padding
  */
-PadStrideInfo calculate_same_pad(TensorShape input_shape, TensorShape weights_shape, PadStrideInfo conv_info, DataLayout data_layout = DataLayout::NCHW, const Size2D &dilation = Size2D(1u, 1u),
+PadStrideInfo calculate_same_pad(TensorShape                  input_shape,
+                                 TensorShape                  weights_shape,
+                                 PadStrideInfo                conv_info,
+                                 DataLayout                   data_layout   = DataLayout::NCHW,
+                                 const Size2D                &dilation      = Size2D(1u, 1u),
                                  const DimensionRoundingType &rounding_type = DimensionRoundingType::FLOOR);
 
 /** Returns expected width and height of the deconvolution's output tensor.
@@ -942,8 +104,10 @@ PadStrideInfo calculate_same_pad(TensorShape input_shape, TensorShape weights_sh
  *
  * @return A pair with the new width in the first position and the new height in the second.
  */
-std::pair<unsigned int, unsigned int> deconvolution_output_dimensions(unsigned int in_width, unsigned int in_height,
-                                                                      unsigned int kernel_width, unsigned int kernel_height,
+std::pair<unsigned int, unsigned int> deconvolution_output_dimensions(unsigned int         in_width,
+                                                                      unsigned int         in_height,
+                                                                      unsigned int         kernel_width,
+                                                                      unsigned int         kernel_height,
                                                                       const PadStrideInfo &pad_stride_info);
 
 /** Returns expected width and height of output scaled tensor depending on dimensions rounding mode.
@@ -957,11 +121,47 @@ std::pair<unsigned int, unsigned int> deconvolution_output_dimensions(unsigned i
  *
  * @return A pair with the new width in the first position and the new height in the second.
  */
-std::pair<unsigned int, unsigned int> scaled_dimensions(int width, int height,
-                                                        int kernel_width, int kernel_height,
+std::pair<unsigned int, unsigned int> scaled_dimensions(int                  width,
+                                                        int                  height,
+                                                        int                  kernel_width,
+                                                        int                  kernel_height,
                                                         const PadStrideInfo &pad_stride_info,
                                                         const Size2D        &dilation = Size2D(1U, 1U));
 
+/** Returns calculated width and height of output scaled tensor depending on dimensions rounding mode.
+ *
+ * @param[in] width           Width of input tensor (Number of columns)
+ * @param[in] height          Height of input tensor (Number of rows)
+ * @param[in] kernel_width    Kernel width.
+ * @param[in] kernel_height   Kernel height.
+ * @param[in] pad_stride_info Pad and stride information.
+ *
+ * @return A pair with the new width in the first position and the new height in the second, returned values can be < 1
+ */
+std::pair<int, int> scaled_dimensions_signed(
+    int width, int height, int kernel_width, int kernel_height, const PadStrideInfo &pad_stride_info);
+
+/** Returns calculated width, height and depth of output scaled tensor depending on dimensions rounding mode.
+ *
+ * @param[in] width         Width of input tensor
+ * @param[in] height        Height of input tensor
+ * @param[in] depth         Depth of input tensor
+ * @param[in] kernel_width  Kernel width.
+ * @param[in] kernel_height Kernel height.
+ * @param[in] kernel_depth  Kernel depth.
+ * @param[in] pool3d_info   Pad and stride and round information for 3d pooling
+ *
+ * @return A tuple with the new width in the first position, the new height in the second, and the new depth in the third.
+ *         Returned values can be < 1
+ */
+std::tuple<int, int, int> scaled_3d_dimensions_signed(int                       width,
+                                                      int                       height,
+                                                      int                       depth,
+                                                      int                       kernel_width,
+                                                      int                       kernel_height,
+                                                      int                       kernel_depth,
+                                                      const Pooling3dLayerInfo &pool3d_info);
+
 /** Check if the given reduction operation should be handled in a serial way.
  *
  * @param[in] op   Reduction operation to perform
@@ -981,16 +181,6 @@ bool needs_serialized_reduction(ReductionOperation op, DataType dt, unsigned int
  */
 QuantizationInfo get_softmax_output_quantization_info(DataType input_type, bool is_log);
 
-/** Returns resize ratio between input and output with consideration of aligned corners
- *
- * @param[in] input_size    The input size
- * @param[in] output_size   the output size
- * @param[in] align_corners True to align corners of input and output. Defaults to false.
- *
- * @return The ratio between input and output (i.e., the input size divided by the output size)
- */
-float calculate_resize_ratio(size_t input_size, size_t output_size, bool align_corners = false);
-
 /** Returns a pair of minimum and maximum values for a quantized activation
  *
  * @param[in] act_info  The information for activation
@@ -999,15 +189,9 @@ float calculate_resize_ratio(size_t input_size, size_t output_size, bool align_c
  *
  * @return The pair with minimum and maximum values
  */
-std::pair<int32_t, int32_t> get_quantized_activation_min_max(ActivationLayerInfo act_info, DataType data_type, UniformQuantizationInfo oq_info);
-
-/** Convert a tensor format into a string.
- *
- * @param[in] format @ref Format to be translated to string.
- *
- * @return The string describing the format.
- */
-const std::string &string_from_format(Format format);
+std::pair<int32_t, int32_t> get_quantized_activation_min_max(const ActivationLayerInfo &act_info,
+                                                             DataType                   data_type,
+                                                             UniformQuantizationInfo    oq_info);
 
 /** Convert a channel identity into a string.
  *
@@ -1016,48 +200,7 @@ const std::string &string_from_format(Format format);
  * @return The string describing the channel.
  */
 const std::string &string_from_channel(Channel channel);
-/** Convert a data layout identity into a string.
- *
- * @param[in] dl @ref DataLayout to be translated to string.
- *
- * @return The string describing the data layout.
- */
-const std::string &string_from_data_layout(DataLayout dl);
-/** Convert a data type identity into a string.
- *
- * @param[in] dt @ref DataType to be translated to string.
- *
- * @return The string describing the data type.
- */
-const std::string &string_from_data_type(DataType dt);
-/** Convert a matrix pattern into a string.
- *
- * @param[in] pattern @ref MatrixPattern to be translated to string.
- *
- * @return The string describing the matrix pattern.
- */
-const std::string &string_from_matrix_pattern(MatrixPattern pattern);
-/** Translates a given activation function to a string.
- *
- * @param[in] act @ref ActivationLayerInfo::ActivationFunction to be translated to string.
- *
- * @return The string describing the activation function.
- */
-const std::string &string_from_activation_func(ActivationLayerInfo::ActivationFunction act);
-/** Translates a given non linear function to a string.
- *
- * @param[in] function @ref NonLinearFilterFunction to be translated to string.
- *
- * @return The string describing the non linear function.
- */
-const std::string &string_from_non_linear_filter_function(NonLinearFilterFunction function);
-/** Translates a given interpolation policy to a string.
- *
- * @param[in] policy @ref InterpolationPolicy to be translated to string.
- *
- * @return The string describing the interpolation policy.
- */
-const std::string &string_from_interpolation_policy(InterpolationPolicy policy);
+
 /** Translates a given border mode policy to a string.
  *
  * @param[in] border_mode @ref BorderMode to be translated to string.
@@ -1079,162 +222,67 @@ const std::string &string_from_norm_type(NormType type);
  * @return The string describing the pooling type.
  */
 const std::string &string_from_pooling_type(PoolingType type);
-/** Translates a given GEMMLowp output stage to a string.
- *
- * @param[in] output_stage @ref GEMMLowpOutputStageInfo to be translated to string.
- *
- * @return The string describing the GEMMLowp output stage
- */
-const std::string &string_from_gemmlowp_output_stage(GEMMLowpOutputStageType output_stage);
-/** Convert a PixelValue to a string, represented through the specific data type
- *
- * @param[in] value     The PixelValue to convert
- * @param[in] data_type The type to be used to convert the @p value
+/** Check if the pool region is entirely outside the input tensor
  *
- * @return String representation of the PixelValue through the given data type.
- */
-std::string string_from_pixel_value(const PixelValue &value, const DataType data_type);
-/** Lower a given string.
+ * @param[in] info @ref PoolingLayerInfo to be checked.
  *
- * @param[in] val Given string to lower.
- *
- * @return The lowered string
+ * @return True if the pool region is entirely outside the input tensor, False otherwise.
  */
-std::string lower_string(const std::string &val);
-
-/** Check if a given data type is of floating point type
+bool is_pool_region_entirely_outside_input(const PoolingLayerInfo &info);
+/** Check if the 3d pool region is entirely outside the input tensor
  *
- * @param[in] dt Input data type.
+ * @param[in] info @ref Pooling3dLayerInfo to be checked.
  *
- * @return True if data type is of floating point type, else false.
+ * @return True if the pool region is entirely outside the input tensor, False otherwise.
  */
-inline bool is_data_type_float(DataType dt)
-{
-    switch(dt)
-    {
-        case DataType::F16:
-        case DataType::F32:
-            return true;
-        default:
-            return false;
-    }
-}
-
-/** Check if a given data type is of quantized type
- *
- * @note Quantized is considered a super-set of fixed-point and asymmetric data types.
+bool is_pool_3d_region_entirely_outside_input(const Pooling3dLayerInfo &info);
+/** Check if the 3D padding is symmetric, i.e. padding on each pair of opposite sides is equal (left=right, top=bottom and front=back)
  *
- * @param[in] dt Input data type.
+ * @param[in] info @ref Padding3D input 3D padding object to check if it is symmetric
  *
- * @return True if data type is of quantized type, else false.
+ * @return True if padding is symmetric
  */
-inline bool is_data_type_quantized(DataType dt)
+inline bool is_symmetric(const Padding3D &info)
 {
-    switch(dt)
-    {
-        case DataType::QSYMM8:
-        case DataType::QASYMM8:
-        case DataType::QASYMM8_SIGNED:
-        case DataType::QSYMM8_PER_CHANNEL:
-        case DataType::QSYMM16:
-        case DataType::QASYMM16:
-            return true;
-        default:
-            return false;
-    }
+    return ((info.left == info.right) && (info.top == info.bottom) && (info.front == info.back));
 }
-
-/** Check if a given data type is of asymmetric quantized type
+/** Translates a given GEMMLowp output stage to a string.
  *
- * @param[in] dt Input data type.
+ * @param[in] output_stage @ref GEMMLowpOutputStageType to be translated to string.
  *
- * @return True if data type is of asymmetric quantized type, else false.
+ * @return The string describing the GEMMLowp output stage
  */
-inline bool is_data_type_quantized_asymmetric(DataType dt)
-{
-    switch(dt)
-    {
-        case DataType::QASYMM8:
-        case DataType::QASYMM8_SIGNED:
-        case DataType::QASYMM16:
-            return true;
-        default:
-            return false;
-    }
-}
-
-/** Check if a given data type is of asymmetric quantized signed type
+const std::string &string_from_gemmlowp_output_stage(GEMMLowpOutputStageType output_stage);
+/** Convert a PixelValue to a string, represented through the specific data type
  *
- * @param[in] dt Input data type.
+ * @param[in] value     The PixelValue to convert
+ * @param[in] data_type The type to be used to convert the @p value
  *
- * @return True if data type is of asymmetric quantized signed type, else false.
+ * @return String representation of the PixelValue through the given data type.
  */
-inline bool is_data_type_quantized_asymmetric_signed(DataType dt)
-{
-    switch(dt)
-    {
-        case DataType::QASYMM8_SIGNED:
-            return true;
-        default:
-            return false;
-    }
-}
+std::string string_from_pixel_value(const PixelValue &value, const DataType data_type);
 
-/** Check if a given data type is of symmetric quantized type
+/** Stores padding information before configuring a kernel
  *
- * @param[in] dt Input data type.
+ * @param[in] infos list of tensor infos to store the padding info for
  *
- * @return True if data type is of symmetric quantized type, else false.
+ * @return An unordered map where each tensor info pointer is paired with its original padding info
  */
-inline bool is_data_type_quantized_symmetric(DataType dt)
-{
-    switch(dt)
-    {
-        case DataType::QSYMM8:
-        case DataType::QSYMM8_PER_CHANNEL:
-        case DataType::QSYMM16:
-            return true;
-        default:
-            return false;
-    }
-}
-
-/** Check if a given data type is of per channel type
+std::unordered_map<const ITensorInfo *, PaddingSize> get_padding_info(std::initializer_list<const ITensorInfo *> infos);
+/** Stores padding information before configuring a kernel
  *
- * @param[in] dt Input data type.
+ * @param[in] tensors list of tensors to store the padding info for
  *
- * @return True if data type is of per channel type, else false.
+ * @return An unordered map where each tensor info pointer is paired with its original padding info
  */
-inline bool is_data_type_quantized_per_channel(DataType dt)
-{
-    switch(dt)
-    {
-        case DataType::QSYMM8_PER_CHANNEL:
-            return true;
-        default:
-            return false;
-    }
-}
-
-/** Create a string with the float in full precision.
+std::unordered_map<const ITensorInfo *, PaddingSize> get_padding_info(std::initializer_list<const ITensor *> tensors);
+/** Check if the previously stored padding info has changed after configuring a kernel
  *
- * @param val Floating point value
+ * @param[in] padding_map an unordered map where each tensor info pointer is paired with its original padding info
  *
- * @return String with the floating point value.
+ * @return true if any of the tensor infos has changed its paddings
  */
-inline std::string float_to_string_with_full_precision(float val)
-{
-    std::stringstream ss;
-    ss.precision(std::numeric_limits<float>::max_digits10);
-    ss << val;
-
-    if(val != static_cast<int>(val))
-    {
-        ss << "f";
-    }
-
-    return ss.str();
-}
+bool has_padding_changed(const std::unordered_map<const ITensorInfo *, PaddingSize> &padding_map);
 
 /** Returns the number of elements required to go from start to end with the wanted step
  *
@@ -1250,67 +298,6 @@ inline size_t num_of_elements_in_range(const float start, const float end, const
     return size_t(std::ceil((end - start) / step));
 }
 
-/** Returns true if the value can be represented by the given data type
- *
- * @param[in] val   value to be checked
- * @param[in] dt    data type that is checked
- * @param[in] qinfo (Optional) quantization info if the data type is QASYMM8
- *
- * @return true if the data type can hold the value.
- */
-template <typename T>
-bool check_value_range(T val, DataType dt, QuantizationInfo qinfo = QuantizationInfo())
-{
-    switch(dt)
-    {
-        case DataType::U8:
-        {
-            const auto val_u8 = static_cast<uint8_t>(val);
-            return ((val_u8 == val) && val_u8 >= std::numeric_limits<uint8_t>::lowest() && val_u8 <= std::numeric_limits<uint8_t>::max());
-        }
-        case DataType::QASYMM8:
-        {
-            double min = static_cast<double>(dequantize_qasymm8(0, qinfo));
-            double max = static_cast<double>(dequantize_qasymm8(std::numeric_limits<uint8_t>::max(), qinfo));
-            return ((double)val >= min && (double)val <= max);
-        }
-        case DataType::S8:
-        {
-            const auto val_s8 = static_cast<int8_t>(val);
-            return ((val_s8 == val) && val_s8 >= std::numeric_limits<int8_t>::lowest() && val_s8 <= std::numeric_limits<int8_t>::max());
-        }
-        case DataType::U16:
-        {
-            const auto val_u16 = static_cast<uint16_t>(val);
-            return ((val_u16 == val) && val_u16 >= std::numeric_limits<uint16_t>::lowest() && val_u16 <= std::numeric_limits<uint16_t>::max());
-        }
-        case DataType::S16:
-        {
-            const auto val_s16 = static_cast<int16_t>(val);
-            return ((val_s16 == val) && val_s16 >= std::numeric_limits<int16_t>::lowest() && val_s16 <= std::numeric_limits<int16_t>::max());
-        }
-        case DataType::U32:
-        {
-            const auto val_u32 = static_cast<uint32_t>(val);
-            return ((val_u32 == val) && val_u32 >= std::numeric_limits<uint32_t>::lowest() && val_u32 <= std::numeric_limits<uint32_t>::max());
-        }
-        case DataType::S32:
-        {
-            const auto val_s32 = static_cast<int32_t>(val);
-            return ((val_s32 == val) && val_s32 >= std::numeric_limits<int32_t>::lowest() && val_s32 <= std::numeric_limits<int32_t>::max());
-        }
-        case DataType::BFLOAT16:
-            return (val >= bfloat16::lowest() && val <= bfloat16::max());
-        case DataType::F16:
-            return (val >= std::numeric_limits<half>::lowest() && val <= std::numeric_limits<half>::max());
-        case DataType::F32:
-            return (val >= std::numeric_limits<float>::lowest() && val <= std::numeric_limits<float>::max());
-        default:
-            ARM_COMPUTE_ERROR("Data type not supported");
-            return false;
-    }
-}
-
 #ifdef ARM_COMPUTE_ASSERTS_ENABLED
 /** Print consecutive elements to an output stream.
  *
@@ -1321,26 +308,27 @@ bool check_value_range(T val, DataType dt, QuantizationInfo qinfo = Quantization
  * @param[in]  element_delim (Optional) Delimeter among the consecutive elements. Defaults to space delimeter
  */
 template <typename T>
-void print_consecutive_elements_impl(std::ostream &s, const T *ptr, unsigned int n, int stream_width = 0, const std::string &element_delim = " ")
+void print_consecutive_elements_impl(
+    std::ostream &s, const T *ptr, unsigned int n, int stream_width = 0, const std::string &element_delim = " ")
 {
     using print_type = typename std::conditional<std::is_floating_point<T>::value, T, int>::type;
     std::ios stream_status(nullptr);
     stream_status.copyfmt(s);
 
-    for(unsigned int i = 0; i < n; ++i)
+    for (unsigned int i = 0; i < n; ++i)
     {
         // Set stream width as it is not a "sticky" stream manipulator
-        if(stream_width != 0)
+        if (stream_width != 0)
         {
             s.width(stream_width);
         }
 
-        if(std::is_same<typename std::decay<T>::type, half>::value)
+        if (std::is_same<typename std::decay<T>::type, half>::value)
         {
             // We use T instead of print_type here is because the std::is_floating_point<half> returns false and then the print_type becomes int.
             s << std::right << static_cast<T>(ptr[i]) << element_delim;
         }
-        else if(std::is_same<typename std::decay<T>::type, bfloat16>::value)
+        else if (std::is_same<typename std::decay<T>::type, bfloat16>::value)
         {
             // We use T instead of print_type here is because the std::is_floating_point<bfloat16> returns false and then the print_type becomes int.
             s << std::right << float(ptr[i]) << element_delim;
@@ -1369,17 +357,17 @@ int max_consecutive_elements_display_width_impl(std::ostream &s, const T *ptr, u
     using print_type = typename std::conditional<std::is_floating_point<T>::value, T, int>::type;
 
     int max_width = -1;
-    for(unsigned int i = 0; i < n; ++i)
+    for (unsigned int i = 0; i < n; ++i)
     {
         std::stringstream ss;
         ss.copyfmt(s);
 
-        if(std::is_same<typename std::decay<T>::type, half>::value)
+        if (std::is_same<typename std::decay<T>::type, half>::value)
         {
             // We use T instead of print_type here is because the std::is_floating_point<half> returns false and then the print_type becomes int.
             ss << static_cast<T>(ptr[i]);
         }
-        else if(std::is_same<typename std::decay<T>::type, bfloat16>::value)
+        else if (std::is_same<typename std::decay<T>::type, bfloat16>::value)
         {
             // We use T instead of print_type here is because the std::is_floating_point<bfloat> returns false and then the print_type becomes int.
             ss << float(ptr[i]);
@@ -1403,7 +391,12 @@ int max_consecutive_elements_display_width_impl(std::ostream &s, const T *ptr, u
  * @param[in]  stream_width  (Optional) Width of the stream. If set to 0 the element's width is used. Defaults to 0.
  * @param[in]  element_delim (Optional) Delimeter among the consecutive elements. Defaults to space delimeter
  */
-void print_consecutive_elements(std::ostream &s, DataType dt, const uint8_t *ptr, unsigned int n, int stream_width, const std::string &element_delim = " ");
+void print_consecutive_elements(std::ostream      &s,
+                                DataType           dt,
+                                const uint8_t     *ptr,
+                                unsigned int       n,
+                                int                stream_width,
+                                const std::string &element_delim = " ");
 
 /** Identify the maximum width of n consecutive elements.
  *
@@ -1416,5 +409,5 @@ void print_consecutive_elements(std::ostream &s, DataType dt, const uint8_t *ptr
  */
 int max_consecutive_elements_display_width(std::ostream &s, DataType dt, const uint8_t *ptr, unsigned int n);
 #endif /* ARM_COMPUTE_ASSERTS_ENABLED */
-}
+} // namespace arm_compute
 #endif /*ARM_COMPUTE_UTILS_H */
diff --git a/arm_compute/core/Validate.h b/arm_compute/core/Validate.h
index bbea5e5575..5550560aff 100644
--- a/arm_compute/core/Validate.h
+++ b/arm_compute/core/Validate.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,12 +25,12 @@
 #define ARM_COMPUTE_VALIDATE_H
 
 #include "arm_compute/core/Error.h"
-#include "arm_compute/core/HOGInfo.h"
 #include "arm_compute/core/IKernel.h"
-#include "arm_compute/core/IMultiHOG.h"
-#include "arm_compute/core/IMultiImage.h"
 #include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/MultiImageInfo.h"
+#include "arm_compute/core/QuantizationInfo.h"
+#include "arm_compute/core/utils/DataLayoutUtils.h"
+#include "arm_compute/core/utils/DataTypeUtils.h"
+#include "arm_compute/core/utils/FormatUtils.h"
 #include "arm_compute/core/Window.h"
 
 #include <algorithm>
@@ -50,9 +50,9 @@ namespace detail
 template <typename T>
 inline bool have_different_dimensions(const Dimensions<T> &dim1, const Dimensions<T> &dim2, unsigned int upper_dim)
 {
-    for(unsigned int i = upper_dim; i < arm_compute::Dimensions<T>::num_max_dimensions; ++i)
+    for (unsigned int i = upper_dim; i < arm_compute::Dimensions<T>::num_max_dimensions; ++i)
     {
-        if(dim1[i] != dim2[i])
+        if (dim1[i] != dim2[i])
         {
             return true;
         }
@@ -80,7 +80,7 @@ public:
      * @param[in] line     Source code line. Used for error reporting.
      */
     compare_dimension(const Dimensions<T> &dim, const char *function, const char *file, int line)
-        : _dim{ dim }, _function{ function }, _file{ file }, _line{ line }
+        : _dim{dim}, _function{function}, _file{file}, _line{line}
     {
     }
 
@@ -111,7 +111,7 @@ inline arm_compute::Status for_each_error(F &&)
 }
 
 template <typename F, typename T, typename... Ts>
-inline arm_compute::Status for_each_error(F &&func, T &&arg, Ts &&... args)
+inline arm_compute::Status for_each_error(F &&func, T &&arg, Ts &&...args)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(func(arg));
     ARM_COMPUTE_RETURN_ON_ERROR(for_each_error(func, args...));
@@ -148,13 +148,11 @@ struct get_tensor_info_t<ITensorInfo *>
  * @return Status
  */
 template <typename... Ts>
-inline arm_compute::Status error_on_nullptr(const char *function, const char *file, const int line, Ts &&... pointers)
+inline arm_compute::Status error_on_nullptr(const char *function, const char *file, const int line, Ts &&...pointers)
 {
-    const std::array<const void *, sizeof...(Ts)> pointers_array{ { std::forward<Ts>(pointers)... } };
-    bool has_nullptr = std::any_of(pointers_array.begin(), pointers_array.end(), [&](const void *ptr)
-    {
-        return (ptr == nullptr);
-    });
+    const std::array<const void *, sizeof...(Ts)> pointers_array{{std::forward<Ts>(pointers)...}};
+    bool                                          has_nullptr =
+        std::any_of(pointers_array.begin(), pointers_array.end(), [&](const void *ptr) { return (ptr == nullptr); });
     ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(has_nullptr, function, file, line, "Nullptr object!");
     return arm_compute::Status{};
 }
@@ -178,8 +176,8 @@ inline arm_compute::Status error_on_nullptr(const char *function, const char *fi
  *
  * @return Status
  */
-arm_compute::Status error_on_mismatching_windows(const char *function, const char *file, const int line,
-                                                 const Window &full, const Window &win);
+arm_compute::Status error_on_mismatching_windows(
+    const char *function, const char *file, const int line, const Window &full, const Window &win);
 #define ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(f, w) \
     ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_mismatching_windows(__func__, __FILE__, __LINE__, f, w))
 #define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_WINDOWS(f, w) \
@@ -200,8 +198,8 @@ arm_compute::Status error_on_mismatching_windows(const char *function, const cha
  *
  * @return Status
  */
-arm_compute::Status error_on_invalid_subwindow(const char *function, const char *file, const int line,
-                                               const Window &full, const Window &sub);
+arm_compute::Status error_on_invalid_subwindow(
+    const char *function, const char *file, const int line, const Window &full, const Window &sub);
 #define ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(f, s) \
     ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_invalid_subwindow(__func__, __FILE__, __LINE__, f, s))
 #define ARM_COMPUTE_RETURN_ERROR_ON_INVALID_SUBWINDOW(f, s) \
@@ -220,12 +218,14 @@ arm_compute::Status error_on_invalid_subwindow(const char *function, const char
  *
  * @return Status
  */
-arm_compute::Status error_on_window_not_collapsable_at_dimension(const char *function, const char *file, const int line,
-                                                                 const Window &full, const Window &window, const int dim);
+arm_compute::Status error_on_window_not_collapsable_at_dimension(
+    const char *function, const char *file, const int line, const Window &full, const Window &window, const int dim);
 #define ARM_COMPUTE_ERROR_ON_WINDOW_NOT_COLLAPSABLE_AT_DIMENSION(f, w, d) \
-    ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_window_not_collapsable_at_dimension(__func__, __FILE__, __LINE__, f, w, d))
+    ARM_COMPUTE_ERROR_THROW_ON(                                           \
+        ::arm_compute::error_on_window_not_collapsable_at_dimension(__func__, __FILE__, __LINE__, f, w, d))
 #define ARM_COMPUTE_RETURN_ERROR_ON_WINDOW_NOT_COLLAPSABLE_AT_DIMENSION(f, w, d) \
-    ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_window_not_collapsable_at_dimension(__func__, __FILE__, __LINE__, f, w, d))
+    ARM_COMPUTE_RETURN_ON_ERROR(                                                 \
+        ::arm_compute::error_on_window_not_collapsable_at_dimension(__func__, __FILE__, __LINE__, f, w, d))
 
 /** Return an error if the passed coordinates have too many dimensions.
  *
@@ -239,8 +239,8 @@ arm_compute::Status error_on_window_not_collapsable_at_dimension(const char *fun
  *
  * @return Status
  */
-arm_compute::Status error_on_coordinates_dimensions_gte(const char *function, const char *file, const int line,
-                                                        const Coordinates &pos, unsigned int max_dim);
+arm_compute::Status error_on_coordinates_dimensions_gte(
+    const char *function, const char *file, const int line, const Coordinates &pos, unsigned int max_dim);
 #define ARM_COMPUTE_ERROR_ON_COORDINATES_DIMENSIONS_GTE(p, md) \
     ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_coordinates_dimensions_gte(__func__, __FILE__, __LINE__, p, md))
 #define ARM_COMPUTE_RETURN_ERROR_ON_COORDINATES_DIMENSIONS_GTE(p, md) \
@@ -258,8 +258,8 @@ arm_compute::Status error_on_coordinates_dimensions_gte(const char *function, co
  *
  * @return Status
  */
-arm_compute::Status error_on_window_dimensions_gte(const char *function, const char *file, const int line,
-                                                   const Window &win, unsigned int max_dim);
+arm_compute::Status error_on_window_dimensions_gte(
+    const char *function, const char *file, const int line, const Window &win, unsigned int max_dim);
 #define ARM_COMPUTE_ERROR_ON_WINDOW_DIMENSIONS_GTE(w, md) \
     ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_window_dimensions_gte(__func__, __FILE__, __LINE__, w, md))
 #define ARM_COMPUTE_RETURN_ERROR_ON_WINDOW_DIMENSIONS_GTE(w, md) \
@@ -277,16 +277,82 @@ arm_compute::Status error_on_window_dimensions_gte(const char *function, const c
  * @return Status
  */
 template <typename T, typename... Ts>
-arm_compute::Status error_on_mismatching_dimensions(const char *function, const char *file, int line,
-                                                    const Dimensions<T> &dim1, const Dimensions<T> &dim2, Ts &&... dims)
+arm_compute::Status error_on_mismatching_dimensions(const char          *function,
+                                                    const char          *file,
+                                                    int                  line,
+                                                    const Dimensions<T> &dim1,
+                                                    const Dimensions<T> &dim2,
+                                                    Ts &&...dims)
 {
-    ARM_COMPUTE_RETURN_ON_ERROR(detail::for_each_error(detail::compare_dimension<T>(dim1, function, file, line), dim2, std::forward<Ts>(dims)...));
+    ARM_COMPUTE_RETURN_ON_ERROR(detail::for_each_error(detail::compare_dimension<T>(dim1, function, file, line), dim2,
+                                                       std::forward<Ts>(dims)...));
     return arm_compute::Status{};
 }
 #define ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(...) \
-    ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_mismatching_dimensions(__func__, __FILE__, __LINE__, __VA_ARGS__))
+    ARM_COMPUTE_ERROR_THROW_ON(                          \
+        ::arm_compute::error_on_mismatching_dimensions(__func__, __FILE__, __LINE__, __VA_ARGS__))
 #define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(...) \
-    ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_mismatching_dimensions(__func__, __FILE__, __LINE__, __VA_ARGS__))
+    ARM_COMPUTE_RETURN_ON_ERROR(                                \
+        ::arm_compute::error_on_mismatching_dimensions(__func__, __FILE__, __LINE__, __VA_ARGS__))
+
+/** Return true if the given format has horizontal subsampling.
+ *
+ * @param[in] format Format to determine subsampling.
+ *
+ * @return True if the format can be subsampled horizontally.
+ */
+inline bool has_format_horizontal_subsampling(Format format)
+{
+    return (format == Format::YUYV422 || format == Format::UYVY422 || format == Format::NV12 ||
+            format == Format::NV21 || format == Format::IYUV || format == Format::UV88)
+               ? true
+               : false;
+}
+
+/** Return true if the given format has vertical subsampling.
+ *
+ * @param[in] format Format to determine subsampling.
+ *
+ * @return True if the format can be subsampled vertically.
+ */
+inline bool has_format_vertical_subsampling(Format format)
+{
+    return (format == Format::NV12 || format == Format::NV21 || format == Format::IYUV || format == Format::UV88)
+               ? true
+               : false;
+}
+
+/** Adjust tensor shape size if width or height are odd for a given multi-planar format. No modification is done for other formats.
+ *
+ * @note Adding here a few links discussing the issue of odd size and sharing the same solution:
+ *       <a href="https://android.googlesource.com/platform/frameworks/base/+/refs/heads/master/graphics/java/android/graphics/YuvImage.java">Android Source</a>
+ *       <a href="https://groups.google.com/a/webmproject.org/forum/#!topic/webm-discuss/LaCKpqiDTXM">WebM</a>
+ *       <a href="https://bugs.chromium.org/p/libyuv/issues/detail?id=198&amp;can=1&amp;q=odd%20width">libYUV</a>
+ *       <a href="https://sourceforge.net/p/raw-yuvplayer/bugs/1/">YUVPlayer</a>
+ *
+ * @param[in] shape  Tensor shape of 2D size
+ * @param[in] format Format of the tensor
+ *
+ * @return The adjusted tensor shape.
+ */
+inline TensorShape adjust_odd_shape(const TensorShape &shape, Format format)
+{
+    TensorShape output{shape};
+
+    // Force width to be even for formats which require subsampling of the U and V channels
+    if (has_format_horizontal_subsampling(format))
+    {
+        output.set(0, (output.x() + 1) & ~1U);
+    }
+
+    // Force height to be even for formats which require subsampling of the U and V channels
+    if (has_format_vertical_subsampling(format))
+    {
+        output.set(1, (output.y() + 1) & ~1U);
+    }
+
+    return output;
+}
 
 /** Return an error if the passed tensor objects are not even.
  *
@@ -300,18 +366,20 @@ arm_compute::Status error_on_mismatching_dimensions(const char *function, const
  * @return Status
  */
 template <typename... Ts>
-arm_compute::Status error_on_tensors_not_even(const char *function, const char *file, int line,
-                                              const Format &format, const ITensor *tensor1, Ts... tensors)
+arm_compute::Status error_on_tensors_not_even(
+    const char *function, const char *file, int line, const Format &format, const ITensor *tensor1, Ts... tensors)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor1 == nullptr, function, file, line);
     ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_nullptr(function, file, line, std::forward<Ts>(tensors)...));
-    const std::array < const ITensor *, 1 + sizeof...(Ts) > tensors_info_array{ { tensor1, std::forward<Ts>(tensors)... } };
-    ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(std::any_of(tensors_info_array.cbegin(), tensors_info_array.cend(), [&](const ITensor * tensor)
-    {
-        const TensorShape correct_shape = adjust_odd_shape(tensor->info()->tensor_shape(), format);
-        return detail::have_different_dimensions(tensor->info()->tensor_shape(), correct_shape, 2);
-    }),
-    function, file, line, "Tensor shape has odd dimensions");
+    const std::array<const ITensor *, 1 + sizeof...(Ts)> tensors_info_array{{tensor1, std::forward<Ts>(tensors)...}};
+    ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(
+        std::any_of(tensors_info_array.cbegin(), tensors_info_array.cend(),
+                    [&](const ITensor *tensor)
+                    {
+                        const TensorShape correct_shape = adjust_odd_shape(tensor->info()->tensor_shape(), format);
+                        return detail::have_different_dimensions(tensor->info()->tensor_shape(), correct_shape, 2);
+                    }),
+        function, file, line, "Tensor shape has odd dimensions");
     return arm_compute::Status{};
 }
 
@@ -320,6 +388,38 @@ arm_compute::Status error_on_tensors_not_even(const char *function, const char *
 #define ARM_COMPUTE_RETURN_ERROR_ON_TENSORS_NOT_EVEN(...) \
     ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_tensors_not_even(__func__, __FILE__, __LINE__, __VA_ARGS__))
 
+/** Calculate subsampled shape for a given format and channel
+ *
+ * @param[in] shape   Shape of the tensor to calculate the extracted channel.
+ * @param[in] format  Format of the tensor.
+ * @param[in] channel Channel to create tensor shape to be extracted.
+ *
+ * @return The subsampled tensor shape.
+ */
+inline TensorShape
+calculate_subsampled_shape(const TensorShape &shape, Format format, Channel channel = Channel::UNKNOWN)
+{
+    TensorShape output{shape};
+
+    // Subsample shape only for the U or V channel, or when the channel is UNKNOWN (the default)
+    if (Channel::U == channel || Channel::V == channel || Channel::UNKNOWN == channel)
+    {
+        // Subsample width for the tensor shape when channel is U or V
+        if (has_format_horizontal_subsampling(format))
+        {
+            output.set(0, output.x() / 2U);
+        }
+
+        // Subsample height for the tensor shape when channel is U or V
+        if (has_format_vertical_subsampling(format))
+        {
+            output.set(1, output.y() / 2U);
+        }
+    }
+
+    return output;
+}
+
 /** Return an error if the passed tensor objects are not sub-sampled.
  *
  * @param[in] function Function in which the error occurred.
@@ -333,25 +433,32 @@ arm_compute::Status error_on_tensors_not_even(const char *function, const char *
  * @return Status
  */
 template <typename... Ts>
-arm_compute::Status error_on_tensors_not_subsampled(const char *function, const char *file, int line,
-                                                    const Format &format, const TensorShape &shape, const ITensor *tensor1, Ts... tensors)
+arm_compute::Status error_on_tensors_not_subsampled(const char        *function,
+                                                    const char        *file,
+                                                    int                line,
+                                                    const Format      &format,
+                                                    const TensorShape &shape,
+                                                    const ITensor     *tensor1,
+                                                    Ts... tensors)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor1 == nullptr, function, file, line);
     ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_nullptr(function, file, line, std::forward<Ts>(tensors)...));
-    const TensorShape sub2_shape = calculate_subsampled_shape(shape, format);
-    const std::array < const ITensor *, 1 + sizeof...(Ts) > tensors_info_array{ { tensor1, std::forward<Ts>(tensors)... } };
-    ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(std::any_of(tensors_info_array.cbegin(), tensors_info_array.cend(), [&](const ITensor * tensor)
-    {
-        return detail::have_different_dimensions(tensor->info()->tensor_shape(), sub2_shape, 2);
-    }),
-    function, file, line, "Tensor shape has mismatch dimensions for sub-sampling");
+    const TensorShape                                    sub2_shape = calculate_subsampled_shape(shape, format);
+    const std::array<const ITensor *, 1 + sizeof...(Ts)> tensors_info_array{{tensor1, std::forward<Ts>(tensors)...}};
+    ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(
+        std::any_of(tensors_info_array.cbegin(), tensors_info_array.cend(),
+                    [&](const ITensor *tensor)
+                    { return detail::have_different_dimensions(tensor->info()->tensor_shape(), sub2_shape, 2); }),
+        function, file, line, "Tensor shape has mismatch dimensions for sub-sampling");
     return arm_compute::Status{};
 }
 
 #define ARM_COMPUTE_ERROR_ON_TENSORS_NOT_SUBSAMPLED(...) \
-    ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_tensors_not_subsampled(__func__, __FILE__, __LINE__, __VA_ARGS__))
+    ARM_COMPUTE_ERROR_THROW_ON(                          \
+        ::arm_compute::error_on_tensors_not_subsampled(__func__, __FILE__, __LINE__, __VA_ARGS__))
 #define ARM_COMPUTE_RETURN_ERROR_ON_TENSORS_NOT_SUBSAMPLED(...) \
-    ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_tensors_not_subsampled(__func__, __FILE__, __LINE__, __VA_ARGS__))
+    ARM_COMPUTE_RETURN_ON_ERROR(                                \
+        ::arm_compute::error_on_tensors_not_subsampled(__func__, __FILE__, __LINE__, __VA_ARGS__))
 
 /** Return an error if the passed two tensor infos have different shapes from the given dimension
  *
@@ -365,10 +472,15 @@ arm_compute::Status error_on_tensors_not_subsampled(const char *function, const
  * @return Status
  */
 template <typename... Ts>
-inline arm_compute::Status error_on_mismatching_shapes(const char *function, const char *file, const int line,
-                                                       const ITensorInfo *tensor_info_1, const ITensorInfo *tensor_info_2, Ts... tensor_infos)
+inline arm_compute::Status error_on_mismatching_shapes(const char        *function,
+                                                       const char        *file,
+                                                       const int          line,
+                                                       const ITensorInfo *tensor_info_1,
+                                                       const ITensorInfo *tensor_info_2,
+                                                       Ts... tensor_infos)
 {
-    return error_on_mismatching_shapes(function, file, line, 0U, tensor_info_1, tensor_info_2, std::forward<Ts>(tensor_infos)...);
+    return error_on_mismatching_shapes(function, file, line, 0U, tensor_info_1, tensor_info_2,
+                                       std::forward<Ts>(tensor_infos)...);
 }
 /** Return an error if the passed two tensors have different shapes from the given dimension
  *
@@ -382,8 +494,12 @@ inline arm_compute::Status error_on_mismatching_shapes(const char *function, con
  * @return Status
  */
 template <typename... Ts>
-inline arm_compute::Status error_on_mismatching_shapes(const char *function, const char *file, const int line,
-                                                       const ITensor *tensor_1, const ITensor *tensor_2, Ts... tensors)
+inline arm_compute::Status error_on_mismatching_shapes(const char    *function,
+                                                       const char    *file,
+                                                       const int      line,
+                                                       const ITensor *tensor_1,
+                                                       const ITensor *tensor_2,
+                                                       Ts... tensors)
 {
     return error_on_mismatching_shapes(function, file, line, 0U, tensor_1, tensor_2, std::forward<Ts>(tensors)...);
 }
@@ -400,19 +516,28 @@ inline arm_compute::Status error_on_mismatching_shapes(const char *function, con
  * @return Status
  */
 template <typename... Ts>
-inline arm_compute::Status error_on_mismatching_shapes(const char *function, const char *file, const int line,
-                                                       unsigned int upper_dim, const ITensorInfo *tensor_info_1, const ITensorInfo *tensor_info_2, Ts... tensor_infos)
+inline arm_compute::Status error_on_mismatching_shapes(const char        *function,
+                                                       const char        *file,
+                                                       const int          line,
+                                                       unsigned int       upper_dim,
+                                                       const ITensorInfo *tensor_info_1,
+                                                       const ITensorInfo *tensor_info_2,
+                                                       Ts... tensor_infos)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor_info_1 == nullptr, function, file, line);
     ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor_info_2 == nullptr, function, file, line);
-    ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_nullptr(function, file, line, std::forward<Ts>(tensor_infos)...));
+    ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_nullptr(function, file, line, tensor_infos...));
 
-    const std::array < const ITensorInfo *, 2 + sizeof...(Ts) > tensors_info_array{ { tensor_info_1, tensor_info_2, std::forward<Ts>(tensor_infos)... } };
-    ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(std::any_of(std::next(tensors_info_array.cbegin()), tensors_info_array.cend(), [&](const ITensorInfo * tensor_info)
-    {
-        return detail::have_different_dimensions((*tensors_info_array.cbegin())->tensor_shape(), tensor_info->tensor_shape(), upper_dim);
-    }),
-    function, file, line, "Tensors have different shapes");
+    const std::array<const ITensorInfo *, 2 + sizeof...(Ts)> tensors_info_array{
+        {tensor_info_1, tensor_info_2, tensor_infos...}};
+    ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(std::any_of(std::next(tensors_info_array.cbegin()), tensors_info_array.cend(),
+                                                    [&](const ITensorInfo *tensor_info)
+                                                    {
+                                                        return detail::have_different_dimensions(
+                                                            (*tensors_info_array.cbegin())->tensor_shape(),
+                                                            tensor_info->tensor_shape(), upper_dim);
+                                                    }),
+                                        function, file, line, "Tensors have different shapes");
     return arm_compute::Status{};
 }
 /** Return an error if the passed two tensors have different shapes from the given dimension
@@ -428,14 +553,20 @@ inline arm_compute::Status error_on_mismatching_shapes(const char *function, con
  * @return Status
  */
 template <typename... Ts>
-inline arm_compute::Status error_on_mismatching_shapes(const char *function, const char *file, const int line,
-                                                       unsigned int upper_dim, const ITensor *tensor_1, const ITensor *tensor_2, Ts... tensors)
+inline arm_compute::Status error_on_mismatching_shapes(const char    *function,
+                                                       const char    *file,
+                                                       const int      line,
+                                                       unsigned int   upper_dim,
+                                                       const ITensor *tensor_1,
+                                                       const ITensor *tensor_2,
+                                                       Ts... tensors)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor_1 == nullptr, function, file, line);
     ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor_2 == nullptr, function, file, line);
-    ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_nullptr(function, file, line, std::forward<Ts>(tensors)...));
-    ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_mismatching_shapes(function, file, line, upper_dim, tensor_1->info(), tensor_2->info(),
-                                                                           detail::get_tensor_info_t<ITensorInfo *>()(tensors)...));
+    ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_nullptr(function, file, line, tensors...));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        ::arm_compute::error_on_mismatching_shapes(function, file, line, upper_dim, tensor_1->info(), tensor_2->info(),
+                                                   detail::get_tensor_info_t<ITensorInfo *>()(tensors)...));
     return arm_compute::Status{};
 }
 #define ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(...) \
@@ -454,19 +585,18 @@ inline arm_compute::Status error_on_mismatching_shapes(const char *function, con
  * @return Status
  */
 template <typename... Ts>
-inline arm_compute::Status error_on_mismatching_data_layouts(const char *function, const char *file, const int line,
-                                                             const ITensorInfo *tensor_info, Ts... tensor_infos)
+inline arm_compute::Status error_on_mismatching_data_layouts(
+    const char *function, const char *file, const int line, const ITensorInfo *tensor_info, Ts... tensor_infos)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor_info == nullptr, function, file, line);
-    ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_nullptr(function, file, line, std::forward<Ts>(tensor_infos)...));
+    ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_nullptr(function, file, line, tensor_infos...));
 
-    DataLayout &&tensor_data_layout = tensor_info->data_layout();
-    const std::array<const ITensorInfo *, sizeof...(Ts)> tensors_infos_array{ { std::forward<Ts>(tensor_infos)... } };
-    ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(std::any_of(tensors_infos_array.begin(), tensors_infos_array.end(), [&](const ITensorInfo * tensor_info_obj)
-    {
-        return tensor_info_obj->data_layout() != tensor_data_layout;
-    }),
-    function, file, line, "Tensors have different data layouts");
+    DataLayout                                         &&tensor_data_layout = tensor_info->data_layout();
+    const std::array<const ITensorInfo *, sizeof...(Ts)> tensors_infos_array{{tensor_infos...}};
+    ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(std::any_of(tensors_infos_array.begin(), tensors_infos_array.end(),
+                                                    [&](const ITensorInfo *tensor_info_obj)
+                                                    { return tensor_info_obj->data_layout() != tensor_data_layout; }),
+                                        function, file, line, "Tensors have different data layouts");
     return arm_compute::Status{};
 }
 /** Return an error if the passed tensors have different data layouts
@@ -480,19 +610,21 @@ inline arm_compute::Status error_on_mismatching_data_layouts(const char *functio
  * @return Status
  */
 template <typename... Ts>
-inline arm_compute::Status error_on_mismatching_data_layouts(const char *function, const char *file, const int line,
-                                                             const ITensor *tensor, Ts... tensors)
+inline arm_compute::Status error_on_mismatching_data_layouts(
+    const char *function, const char *file, const int line, const ITensor *tensor, Ts... tensors)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor == nullptr, function, file, line);
     ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_nullptr(function, file, line, std::forward<Ts>(tensors)...));
-    ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_mismatching_data_layouts(function, file, line, tensor->info(),
-                                                                                 detail::get_tensor_info_t<ITensorInfo *>()(tensors)...));
+    ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_mismatching_data_layouts(
+        function, file, line, tensor->info(), detail::get_tensor_info_t<ITensorInfo *>()(tensors)...));
     return arm_compute::Status{};
 }
 #define ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_LAYOUT(...) \
-    ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_mismatching_data_layouts(__func__, __FILE__, __LINE__, __VA_ARGS__))
+    ARM_COMPUTE_ERROR_THROW_ON(                           \
+        ::arm_compute::error_on_mismatching_data_layouts(__func__, __FILE__, __LINE__, __VA_ARGS__))
 #define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(...) \
-    ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_mismatching_data_layouts(__func__, __FILE__, __LINE__, __VA_ARGS__))
+    ARM_COMPUTE_RETURN_ON_ERROR(                                 \
+        ::arm_compute::error_on_mismatching_data_layouts(__func__, __FILE__, __LINE__, __VA_ARGS__))
 
 /** Return an error if the passed two tensor infos have different data types
  *
@@ -505,19 +637,18 @@ inline arm_compute::Status error_on_mismatching_data_layouts(const char *functio
  * @return Status
  */
 template <typename... Ts>
-inline arm_compute::Status error_on_mismatching_data_types(const char *function, const char *file, const int line,
-                                                           const ITensorInfo *tensor_info, Ts... tensor_infos)
+inline arm_compute::Status error_on_mismatching_data_types(
+    const char *function, const char *file, const int line, const ITensorInfo *tensor_info, Ts... tensor_infos)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor_info == nullptr, function, file, line);
-    ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_nullptr(function, file, line, std::forward<Ts>(tensor_infos)...));
+    ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_nullptr(function, file, line, tensor_infos...));
 
-    DataType &&tensor_data_type = tensor_info->data_type();
-    const std::array<const ITensorInfo *, sizeof...(Ts)> tensors_infos_array{ { std::forward<Ts>(tensor_infos)... } };
-    ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(std::any_of(tensors_infos_array.begin(), tensors_infos_array.end(), [&](const ITensorInfo * tensor_info_obj)
-    {
-        return tensor_info_obj->data_type() != tensor_data_type;
-    }),
-    function, file, line, "Tensors have different data types");
+    DataType                                           &&tensor_data_type = tensor_info->data_type();
+    const std::array<const ITensorInfo *, sizeof...(Ts)> tensors_infos_array{{tensor_infos...}};
+    ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(std::any_of(tensors_infos_array.begin(), tensors_infos_array.end(),
+                                                    [&](const ITensorInfo *tensor_info_obj)
+                                                    { return tensor_info_obj->data_type() != tensor_data_type; }),
+                                        function, file, line, "Tensors have different data types");
     return arm_compute::Status{};
 }
 /** Return an error if the passed two tensors have different data types
@@ -531,19 +662,21 @@ inline arm_compute::Status error_on_mismatching_data_types(const char *function,
  * @return Status
  */
 template <typename... Ts>
-inline arm_compute::Status error_on_mismatching_data_types(const char *function, const char *file, const int line,
-                                                           const ITensor *tensor, Ts... tensors)
+inline arm_compute::Status error_on_mismatching_data_types(
+    const char *function, const char *file, const int line, const ITensor *tensor, Ts... tensors)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor == nullptr, function, file, line);
-    ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_nullptr(function, file, line, std::forward<Ts>(tensors)...));
-    ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_mismatching_data_types(function, file, line, tensor->info(),
-                                                                               detail::get_tensor_info_t<ITensorInfo *>()(tensors)...));
+    ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_nullptr(function, file, line, tensors...));
+    ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_mismatching_data_types(
+        function, file, line, tensor->info(), detail::get_tensor_info_t<ITensorInfo *>()(tensors)...));
     return arm_compute::Status{};
 }
 #define ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(...) \
-    ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_mismatching_data_types(__func__, __FILE__, __LINE__, __VA_ARGS__))
+    ARM_COMPUTE_ERROR_THROW_ON(                          \
+        ::arm_compute::error_on_mismatching_data_types(__func__, __FILE__, __LINE__, __VA_ARGS__))
 #define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(...) \
-    ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_mismatching_data_types(__func__, __FILE__, __LINE__, __VA_ARGS__))
+    ARM_COMPUTE_RETURN_ON_ERROR(                                \
+        ::arm_compute::error_on_mismatching_data_types(__func__, __FILE__, __LINE__, __VA_ARGS__))
 
 /** Return an error if the passed tensor infos have different asymmetric quantized data types or different quantization info
  *
@@ -559,28 +692,32 @@ inline arm_compute::Status error_on_mismatching_data_types(const char *function,
  * @return Status
  */
 template <typename... Ts>
-inline arm_compute::Status error_on_mismatching_quantization_info(const char *function, const char *file, const int line,
-                                                                  const ITensorInfo *tensor_info_1, const ITensorInfo *tensor_info_2, Ts... tensor_infos)
+inline arm_compute::Status error_on_mismatching_quantization_info(const char        *function,
+                                                                  const char        *file,
+                                                                  const int          line,
+                                                                  const ITensorInfo *tensor_info_1,
+                                                                  const ITensorInfo *tensor_info_2,
+                                                                  Ts... tensor_infos)
 {
     DataType             &&first_data_type         = tensor_info_1->data_type();
     const QuantizationInfo first_quantization_info = tensor_info_1->quantization_info();
 
-    if(!is_data_type_quantized(first_data_type))
+    if (!is_data_type_quantized(first_data_type))
     {
         return arm_compute::Status{};
     }
 
-    const std::array < const ITensorInfo *, 1 + sizeof...(Ts) > tensor_infos_array{ { tensor_info_2, std::forward<Ts>(tensor_infos)... } };
-    ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(std::any_of(tensor_infos_array.begin(), tensor_infos_array.end(), [&](const ITensorInfo * tensor_info)
-    {
-        return tensor_info->data_type() != first_data_type;
-    }),
-    function, file, line, "Tensors have different asymmetric quantized data types");
-    ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(std::any_of(tensor_infos_array.begin(), tensor_infos_array.end(), [&](const ITensorInfo * tensor_info)
-    {
-        return tensor_info->quantization_info() != first_quantization_info;
-    }),
-    function, file, line, "Tensors have different quantization information");
+    const std::array<const ITensorInfo *, 1 + sizeof...(Ts)> tensor_infos_array{
+        {tensor_info_2, std::forward<Ts>(tensor_infos)...}};
+    ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(std::any_of(tensor_infos_array.begin(), tensor_infos_array.end(),
+                                                    [&](const ITensorInfo *tensor_info)
+                                                    { return tensor_info->data_type() != first_data_type; }),
+                                        function, file, line, "Tensors have different asymmetric quantized data types");
+    ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(
+        std::any_of(tensor_infos_array.begin(), tensor_infos_array.end(),
+                    [&](const ITensorInfo *tensor_info)
+                    { return tensor_info->quantization_info() != first_quantization_info; }),
+        function, file, line, "Tensors have different quantization information");
 
     return arm_compute::Status{};
 }
@@ -598,17 +735,24 @@ inline arm_compute::Status error_on_mismatching_quantization_info(const char *fu
  * @return Status
  */
 template <typename... Ts>
-inline arm_compute::Status error_on_mismatching_quantization_info(const char *function, const char *file, const int line,
-                                                                  const ITensor *tensor_1, const ITensor *tensor_2, Ts... tensors)
+inline arm_compute::Status error_on_mismatching_quantization_info(const char    *function,
+                                                                  const char    *file,
+                                                                  const int      line,
+                                                                  const ITensor *tensor_1,
+                                                                  const ITensor *tensor_2,
+                                                                  Ts... tensors)
 {
-    ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_mismatching_quantization_info(function, file, line, tensor_1->info(), tensor_2->info(),
-                                                                                      detail::get_tensor_info_t<ITensorInfo *>()(tensors)...));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        ::arm_compute::error_on_mismatching_quantization_info(function, file, line, tensor_1->info(), tensor_2->info(),
+                                                              detail::get_tensor_info_t<ITensorInfo *>()(tensors)...));
     return arm_compute::Status{};
 }
 #define ARM_COMPUTE_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(...) \
-    ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_mismatching_quantization_info(__func__, __FILE__, __LINE__, __VA_ARGS__))
+    ARM_COMPUTE_ERROR_THROW_ON(                                 \
+        ::arm_compute::error_on_mismatching_quantization_info(__func__, __FILE__, __LINE__, __VA_ARGS__))
 #define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(...) \
-    ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_mismatching_quantization_info(__func__, __FILE__, __LINE__, __VA_ARGS__))
+    ARM_COMPUTE_RETURN_ON_ERROR(                                       \
+        ::arm_compute::error_on_mismatching_quantization_info(__func__, __FILE__, __LINE__, __VA_ARGS__))
 
 /** Throw an error if the format of the passed tensor/multi-image does not match any of the formats provided.
  *
@@ -620,8 +764,8 @@ inline arm_compute::Status error_on_mismatching_quantization_info(const char *fu
  * @param[in] formats  (Optional) Further allowed formats.
  */
 template <typename T, typename F, typename... Fs>
-void error_on_format_not_in(const char *function, const char *file, const int line,
-                            const T *object, F &&format, Fs &&... formats)
+void error_on_format_not_in(
+    const char *function, const char *file, const int line, const T *object, F &&format, Fs &&...formats)
 {
     ARM_COMPUTE_ERROR_ON_LOC(object == nullptr, function, file, line);
 
@@ -630,17 +774,17 @@ void error_on_format_not_in(const char *function, const char *file, const int li
 
     ARM_COMPUTE_ERROR_ON_LOC(object_format == Format::UNKNOWN, function, file, line);
 
-    const std::array<F, sizeof...(Fs)> formats_array{ { std::forward<Fs>(formats)... } };
+    const std::array<F, sizeof...(Fs)> formats_array{{std::forward<Fs>(formats)...}};
     ARM_COMPUTE_UNUSED(formats_array);
 
-    ARM_COMPUTE_ERROR_ON_LOC_MSG(object_format != format && std::none_of(formats_array.begin(), formats_array.end(), [&](const F & f)
-    {
-        return f == object_format;
-    }),
-    function, file, line, "Format %s not supported by this kernel", string_from_format(object_format).c_str());
+    ARM_COMPUTE_ERROR_ON_LOC_MSG(
+        object_format != format &&
+            std::none_of(formats_array.begin(), formats_array.end(), [&](const F &f) { return f == object_format; }),
+        function, file, line, "Format %s not supported by this kernel", string_from_format(object_format).c_str());
     ARM_COMPUTE_UNUSED(function, format, file, line);
 }
-#define ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(t, ...) ::arm_compute::error_on_format_not_in(__func__, __FILE__, __LINE__, t, __VA_ARGS__)
+#define ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(t, ...) \
+    ::arm_compute::error_on_format_not_in(__func__, __FILE__, __LINE__, t, __VA_ARGS__)
 
 /** Return an error if the data type of the passed tensor info does not match any of the data types provided.
  *
@@ -654,20 +798,19 @@ void error_on_format_not_in(const char *function, const char *file, const int li
  * @return Status
  */
 template <typename T, typename... Ts>
-inline arm_compute::Status error_on_data_type_not_in(const char *function, const char *file, const int line,
-                                                     const ITensorInfo *tensor_info, T &&dt, Ts &&... dts)
+inline arm_compute::Status error_on_data_type_not_in(
+    const char *function, const char *file, const int line, const ITensorInfo *tensor_info, T &&dt, Ts &&...dts)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor_info == nullptr, function, file, line);
 
     const DataType &tensor_dt = tensor_info->data_type(); //NOLINT
     ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor_dt == DataType::UNKNOWN, function, file, line);
 
-    const std::array<T, sizeof...(Ts)> dts_array{ { std::forward<Ts>(dts)... } };
-    ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG_VAR(tensor_dt != dt && std::none_of(dts_array.begin(), dts_array.end(), [&](const T & d)
-    {
-        return d == tensor_dt;
-    }),
-    function, file, line, "ITensor data type %s not supported by this kernel", string_from_data_type(tensor_dt).c_str());
+    const std::array<T, sizeof...(Ts)> dts_array{{std::forward<Ts>(dts)...}};
+    ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG_VAR(
+        tensor_dt != dt && std::none_of(dts_array.begin(), dts_array.end(), [&](const T &d) { return d == tensor_dt; }),
+        function, file, line, "ITensor data type %s not supported by this kernel",
+        string_from_data_type(tensor_dt).c_str());
     return arm_compute::Status{};
 }
 /** Return an error if the data type of the passed tensor does not match any of the data types provided.
@@ -682,11 +825,12 @@ inline arm_compute::Status error_on_data_type_not_in(const char *function, const
  * @return Status
  */
 template <typename T, typename... Ts>
-inline arm_compute::Status error_on_data_type_not_in(const char *function, const char *file, const int line,
-                                                     const ITensor *tensor, T &&dt, Ts &&... dts)
+inline arm_compute::Status error_on_data_type_not_in(
+    const char *function, const char *file, const int line, const ITensor *tensor, T &&dt, Ts &&...dts)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor == nullptr, function, file, line);
-    ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_data_type_not_in(function, file, line, tensor->info(), std::forward<T>(dt), std::forward<Ts>(dts)...));
+    ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_data_type_not_in(
+        function, file, line, tensor->info(), std::forward<T>(dt), std::forward<Ts>(dts)...));
     return arm_compute::Status{};
 }
 #define ARM_COMPUTE_ERROR_ON_DATA_TYPE_NOT_IN(t, ...) \
@@ -706,20 +850,19 @@ inline arm_compute::Status error_on_data_type_not_in(const char *function, const
  * @return Status
  */
 template <typename T, typename... Ts>
-inline arm_compute::Status error_on_data_layout_not_in(const char *function, const char *file, const int line,
-                                                       const ITensorInfo *tensor_info, T &&dl, Ts &&... dls)
+inline arm_compute::Status error_on_data_layout_not_in(
+    const char *function, const char *file, const int line, const ITensorInfo *tensor_info, T &&dl, Ts &&...dls)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor_info == nullptr, function, file, line);
 
     const DataLayout &tensor_dl = tensor_info->data_layout(); //NOLINT
     ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor_dl == DataLayout::UNKNOWN, function, file, line);
 
-    const std::array<T, sizeof...(Ts)> dls_array{ { std::forward<Ts>(dls)... } };
-    ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG_VAR(tensor_dl != dl && std::none_of(dls_array.begin(), dls_array.end(), [&](const T & l)
-    {
-        return l == tensor_dl;
-    }),
-    function, file, line, "ITensor data layout %s not supported by this kernel", string_from_data_layout(tensor_dl).c_str());
+    const std::array<T, sizeof...(Ts)> dls_array{{std::forward<Ts>(dls)...}};
+    ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG_VAR(
+        tensor_dl != dl && std::none_of(dls_array.begin(), dls_array.end(), [&](const T &l) { return l == tensor_dl; }),
+        function, file, line, "ITensor data layout %s not supported by this kernel",
+        string_from_data_layout(tensor_dl).c_str());
     return arm_compute::Status{};
 }
 /** Return an error if the data layout of the passed tensor does not match any of the data layout provided.
@@ -734,17 +877,19 @@ inline arm_compute::Status error_on_data_layout_not_in(const char *function, con
  * @return Status
  */
 template <typename T, typename... Ts>
-inline arm_compute::Status error_on_data_layout_not_in(const char *function, const char *file, const int line,
-                                                       const ITensor *tensor, T &&dl, Ts &&... dls)
+inline arm_compute::Status error_on_data_layout_not_in(
+    const char *function, const char *file, const int line, const ITensor *tensor, T &&dl, Ts &&...dls)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor == nullptr, function, file, line);
-    ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_data_layout_not_in(function, file, line, tensor->info(), std::forward<T>(dl), std::forward<Ts>(dls)...));
+    ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_data_layout_not_in(
+        function, file, line, tensor->info(), std::forward<T>(dl), std::forward<Ts>(dls)...));
     return arm_compute::Status{};
 }
 #define ARM_COMPUTE_ERROR_ON_DATA_LAYOUT_NOT_IN(t, ...) \
     ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_data_layout_not_in(__func__, __FILE__, __LINE__, t, __VA_ARGS__))
 #define ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(t, ...) \
-    ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_data_layout_not_in(__func__, __FILE__, __LINE__, t, __VA_ARGS__))
+    ARM_COMPUTE_RETURN_ON_ERROR(                               \
+        ::arm_compute::error_on_data_layout_not_in(__func__, __FILE__, __LINE__, t, __VA_ARGS__))
 
 /** Return an error if the data type or the number of channels of the passed tensor info does not match any of the data types and number of channels provided.
  *
@@ -759,12 +904,20 @@ inline arm_compute::Status error_on_data_layout_not_in(const char *function, con
  * @return Status
  */
 template <typename T, typename... Ts>
-inline arm_compute::Status error_on_data_type_channel_not_in(const char *function, const char *file, const int line,
-                                                             const ITensorInfo *tensor_info, size_t num_channels, T &&dt, Ts &&... dts)
+inline arm_compute::Status error_on_data_type_channel_not_in(const char        *function,
+                                                             const char        *file,
+                                                             const int          line,
+                                                             const ITensorInfo *tensor_info,
+                                                             size_t             num_channels,
+                                                             T                &&dt,
+                                                             Ts &&...dts)
 {
-    ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_data_type_not_in(function, file, line, tensor_info, std::forward<T>(dt), std::forward<Ts>(dts)...));
+    ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_data_type_not_in(
+        function, file, line, tensor_info, std::forward<T>(dt), std::forward<Ts>(dts)...));
     const size_t tensor_nc = tensor_info->num_channels();
-    ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG_VAR(tensor_nc != num_channels, function, file, line, "Number of channels %zu. Required number of channels %zu", tensor_nc, num_channels);
+    ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG_VAR(tensor_nc != num_channels, function, file, line,
+                                            "Number of channels %zu. Required number of channels %zu", tensor_nc,
+                                            num_channels);
     return arm_compute::Status{};
 }
 /** Return an error if the data type or the number of channels of the passed tensor does not match any of the data types and number of channels provided.
@@ -780,17 +933,25 @@ inline arm_compute::Status error_on_data_type_channel_not_in(const char *functio
  * @return Status
  */
 template <typename T, typename... Ts>
-inline arm_compute::Status error_on_data_type_channel_not_in(const char *function, const char *file, const int line,
-                                                             const ITensor *tensor, size_t num_channels, T &&dt, Ts &&... dts)
+inline arm_compute::Status error_on_data_type_channel_not_in(const char    *function,
+                                                             const char    *file,
+                                                             const int      line,
+                                                             const ITensor *tensor,
+                                                             size_t         num_channels,
+                                                             T            &&dt,
+                                                             Ts &&...dts)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor == nullptr, function, file, line);
-    ARM_COMPUTE_RETURN_ON_ERROR(error_on_data_type_channel_not_in(function, file, line, tensor->info(), num_channels, std::forward<T>(dt), std::forward<Ts>(dts)...));
+    ARM_COMPUTE_RETURN_ON_ERROR(error_on_data_type_channel_not_in(function, file, line, tensor->info(), num_channels,
+                                                                  std::forward<T>(dt), std::forward<Ts>(dts)...));
     return arm_compute::Status{};
 }
 #define ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(t, c, ...) \
-    ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_data_type_channel_not_in(__func__, __FILE__, __LINE__, t, c, __VA_ARGS__))
+    ARM_COMPUTE_ERROR_THROW_ON(                                  \
+        ::arm_compute::error_on_data_type_channel_not_in(__func__, __FILE__, __LINE__, t, c, __VA_ARGS__))
 #define ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(t, c, ...) \
-    ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_data_type_channel_not_in(__func__, __FILE__, __LINE__, t, c, __VA_ARGS__))
+    ARM_COMPUTE_RETURN_ON_ERROR(                                        \
+        ::arm_compute::error_on_data_type_channel_not_in(__func__, __FILE__, __LINE__, t, c, __VA_ARGS__))
 
 /** Return an error if the data type of the passed tensor info is FP16 and FP16 extension is not supported by the device.
  *
@@ -802,12 +963,12 @@ inline arm_compute::Status error_on_data_type_channel_not_in(const char *functio
  *
  * @return Status
  */
-inline arm_compute::Status error_on_unsupported_fp16(const char *function, const char *file, const int line,
-                                                     const ITensorInfo *tensor_info, bool is_fp16_supported)
+inline arm_compute::Status error_on_unsupported_fp16(
+    const char *function, const char *file, const int line, const ITensorInfo *tensor_info, bool is_fp16_supported)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor_info == nullptr, function, file, line);
-    ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG((tensor_info->data_type() == DataType::F16 && !is_fp16_supported),
-                                        function, file, line, "FP16 not supported by the device");
+    ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG((tensor_info->data_type() == DataType::F16 && !is_fp16_supported), function,
+                                        file, line, "FP16 not supported by the device");
     return arm_compute::Status{};
 }
 
@@ -821,11 +982,12 @@ inline arm_compute::Status error_on_unsupported_fp16(const char *function, const
  *
  * @return Status
  */
-inline arm_compute::Status error_on_unsupported_fp16(const char *function, const char *file, const int line,
-                                                     const ITensor *tensor, bool is_fp16_supported)
+inline arm_compute::Status error_on_unsupported_fp16(
+    const char *function, const char *file, const int line, const ITensor *tensor, bool is_fp16_supported)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor == nullptr, function, file, line);
-    ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_unsupported_fp16(function, file, line, tensor->info(), is_fp16_supported));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        ::arm_compute::error_on_unsupported_fp16(function, file, line, tensor->info(), is_fp16_supported));
     return arm_compute::Status{};
 }
 
@@ -838,8 +1000,8 @@ inline arm_compute::Status error_on_unsupported_fp16(const char *function, const
  *
  * @return Status
  */
-arm_compute::Status error_on_tensor_not_2d(const char *function, const char *file, const int line,
-                                           const ITensor *tensor);
+arm_compute::Status
+error_on_tensor_not_2d(const char *function, const char *file, const int line, const ITensor *tensor);
 
 /** Return an error if the tensor info is not 2D.
  *
@@ -850,8 +1012,8 @@ arm_compute::Status error_on_tensor_not_2d(const char *function, const char *fil
  *
  * @return Status
  */
-arm_compute::Status error_on_tensor_not_2d(const char *function, const char *file, const int line,
-                                           const ITensorInfo *tensor);
+arm_compute::Status
+error_on_tensor_not_2d(const char *function, const char *file, const int line, const ITensorInfo *tensor);
 
 #define ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(t) \
     ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_tensor_not_2d(__func__, __FILE__, __LINE__, t))
@@ -870,17 +1032,15 @@ arm_compute::Status error_on_tensor_not_2d(const char *function, const char *fil
  * @return Status
  */
 template <typename T, typename... Ts>
-inline arm_compute::Status error_on_channel_not_in(const char *function, const char *file, const int line,
-                                                   T cn, T &&channel, Ts &&... channels)
+inline arm_compute::Status
+error_on_channel_not_in(const char *function, const char *file, const int line, T cn, T &&channel, Ts &&...channels)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_LOC(cn == Channel::UNKNOWN, function, file, line);
 
-    const std::array<T, sizeof...(Ts)> channels_array{ { std::forward<Ts>(channels)... } };
-    ARM_COMPUTE_RETURN_ERROR_ON_LOC(channel != cn && std::none_of(channels_array.begin(), channels_array.end(), [&](const T & f)
-    {
-        return f == cn;
-    }),
-    function, file, line);
+    const std::array<T, sizeof...(Ts)> channels_array{{std::forward<Ts>(channels)...}};
+    ARM_COMPUTE_RETURN_ERROR_ON_LOC(channel != cn && std::none_of(channels_array.begin(), channels_array.end(),
+                                                                  [&](const T &f) { return f == cn; }),
+                                    function, file, line);
     return arm_compute::Status{};
 }
 #define ARM_COMPUTE_ERROR_ON_CHANNEL_NOT_IN(c, ...) \
@@ -898,35 +1058,13 @@ inline arm_compute::Status error_on_channel_not_in(const char *function, const c
  *
  * @return Status
  */
-arm_compute::Status error_on_channel_not_in_known_format(const char *function, const char *file, const int line,
-                                                         Format fmt, Channel cn);
+arm_compute::Status
+error_on_channel_not_in_known_format(const char *function, const char *file, const int line, Format fmt, Channel cn);
 #define ARM_COMPUTE_ERROR_ON_CHANNEL_NOT_IN_KNOWN_FORMAT(f, c) \
     ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_channel_not_in_known_format(__func__, __FILE__, __LINE__, f, c))
 #define ARM_COMPUTE_RETURN_ERROR_ON_CHANNEL_NOT_IN_KNOWN_FORMAT(f, c) \
     ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_channel_not_in_known_format(__func__, __FILE__, __LINE__, f, c))
 
-/** Return an error if the @ref IMultiHOG container is invalid
- *
- * An @ref IMultiHOG container is invalid if:
- *
- * -# it is a nullptr
- * -# it doesn't contain models
- * -# it doesn't have the HOG data objects with the same phase_type, normalization_type and l2_hyst_threshold (if normalization_type == L2HYS_NORM)
- *
- * @param[in] function  Function in which the error occurred.
- * @param[in] file      Name of the file where the error occurred.
- * @param[in] line      Line on which the error occurred.
- * @param[in] multi_hog IMultiHOG container to validate
- *
- * @return Status
- */
-arm_compute::Status error_on_invalid_multi_hog(const char *function, const char *file, const int line,
-                                               const IMultiHOG *multi_hog);
-#define ARM_COMPUTE_ERROR_ON_INVALID_MULTI_HOG(m) \
-    ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_invalid_multi_hog(__func__, __FILE__, __LINE__, m))
-#define ARM_COMPUTE_RETURN_ERROR_ON_INVALID_MULTI_HOG(m) \
-    ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_invalid_multi_hog(__func__, __FILE__, __LINE__, m))
-
 /** Return an error if the kernel is not configured.
  *
  * @param[in] function Function in which the error occurred.
@@ -936,8 +1074,8 @@ arm_compute::Status error_on_invalid_multi_hog(const char *function, const char
  *
  * @return Status
  */
-arm_compute::Status error_on_unconfigured_kernel(const char *function, const char *file, const int line,
-                                                 const IKernel *kernel);
+arm_compute::Status
+error_on_unconfigured_kernel(const char *function, const char *file, const int line, const IKernel *kernel);
 #define ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(k) \
     ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_unconfigured_kernel(__func__, __FILE__, __LINE__, k))
 #define ARM_COMPUTE_RETURN_ERROR_ON_UNCONFIGURED_KERNEL(k) \
@@ -954,8 +1092,12 @@ arm_compute::Status error_on_unconfigured_kernel(const char *function, const cha
  *
  * @return Status
  */
-arm_compute::Status error_on_invalid_subtensor(const char *function, const char *file, const int line,
-                                               const TensorShape &parent_shape, const Coordinates &coords, const TensorShape &shape);
+arm_compute::Status error_on_invalid_subtensor(const char        *function,
+                                               const char        *file,
+                                               const int          line,
+                                               const TensorShape &parent_shape,
+                                               const Coordinates &coords,
+                                               const TensorShape &shape);
 #define ARM_COMPUTE_ERROR_ON_INVALID_SUBTENSOR(p, c, s) \
     ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_invalid_subtensor(__func__, __FILE__, __LINE__, p, c, s))
 #define ARM_COMPUTE_RETURN_ERROR_ON_INVALID_SUBTENSOR(p, c, s) \
@@ -971,11 +1113,16 @@ arm_compute::Status error_on_invalid_subtensor(const char *function, const char
  *
  * @return Status
  */
-arm_compute::Status error_on_invalid_subtensor_valid_region(const char *function, const char *file, const int line,
-                                                            const ValidRegion &parent_valid_region, const ValidRegion &valid_region);
+arm_compute::Status error_on_invalid_subtensor_valid_region(const char        *function,
+                                                            const char        *file,
+                                                            const int          line,
+                                                            const ValidRegion &parent_valid_region,
+                                                            const ValidRegion &valid_region);
 #define ARM_COMPUTE_ERROR_ON_INVALID_SUBTENSOR_VALID_REGION(pv, sv) \
-    ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_invalid_subtensor_valid_region(__func__, __FILE__, __LINE__, pv, sv))
+    ARM_COMPUTE_ERROR_THROW_ON(                                     \
+        ::arm_compute::error_on_invalid_subtensor_valid_region(__func__, __FILE__, __LINE__, pv, sv))
 #define ARM_COMPUTE_RETURN_ERROR_ON_INVALID_SUBTENSOR_VALID_REGION(pv, sv) \
-    ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_invalid_subtensor_valid_region(__func__, __FILE__, __LINE__, pv, sv))
-}
+    ARM_COMPUTE_RETURN_ON_ERROR(                                           \
+        ::arm_compute::error_on_invalid_subtensor_valid_region(__func__, __FILE__, __LINE__, pv, sv))
+} // namespace arm_compute
 #endif /* ARM_COMPUTE_VALIDATE_H*/
diff --git a/arm_compute/core/Version.h b/arm_compute/core/Version.h
index be3f0264bb..44d400bad8 100644
--- a/arm_compute/core/Version.h
+++ b/arm_compute/core/Version.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020 ARM Limited.
+ * Copyright (c) 2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -27,12 +27,12 @@
 #include <string>
 
 /* Macro utilities */
-#define STRINGIFY2(s) #s
-#define STRINGIFY(s) STRINGIFY2(s)
+#define ARM_COMPUTE_STRINGIFY2(s) #s
+#define ARM_COMPUTE_STRINGIFY(s)  ARM_COMPUTE_STRINGIFY2(s)
 
-#define ARM_COMPUTE_VERSION_STR          \
-    STRINGIFY(ARM_COMPUTE_VERSION_MAJOR) \
-    "." STRINGIFY(ARM_COMPUTE_VERSION_MINOR) "." STRINGIFY(ARM_COMPUTE_VERSION_PATCH)
+#define ARM_COMPUTE_VERSION_STR                      \
+    ARM_COMPUTE_STRINGIFY(ARM_COMPUTE_VERSION_MAJOR) \
+    "." ARM_COMPUTE_STRINGIFY(ARM_COMPUTE_VERSION_MINOR) "." ARM_COMPUTE_STRINGIFY(ARM_COMPUTE_VERSION_PATCH)
 
 namespace arm_compute
 {
@@ -45,4 +45,7 @@ namespace arm_compute
 std::string build_information();
 } // namespace arm_compute
 
+// ARM_COMPUTE_STRINGIFY and ARM_COMPUTE_STRINGIFY2 are deliberately left defined: ARM_COMPUTE_VERSION_STR is only
+// expanded where it is used, so undefining its helper macros here would break every use after this header.
+
 #endif /* ARM_COMPUTE_LIBRARY_VERSION_H */
diff --git a/arm_compute/core/Window.h b/arm_compute/core/Window.h
index d6690d484a..e93d2863c9 100644
--- a/arm_compute/core/Window.h
+++ b/arm_compute/core/Window.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2020, 2022-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,17 +21,17 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_WINDOW_H
-#define ARM_COMPUTE_WINDOW_H
-
-#include <algorithm>
-#include <array>
-#include <cstddef>
+#ifndef ACL_ARM_COMPUTE_CORE_WINDOW_H
+#define ACL_ARM_COMPUTE_CORE_WINDOW_H
 
 #include "arm_compute/core/Coordinates.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/ITensorInfo.h"
-#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/math/Math.h"
+
+#include <algorithm>
+#include <array>
+#include <cstddef>
 
 namespace arm_compute
 {
@@ -45,6 +45,10 @@ public:
     static constexpr size_t DimY = 1;
     /** Alias for dimension 2 also known as Z dimension */
     static constexpr size_t DimZ = 2;
+    /** Alias for dimension 3 also known as W dimension */
+    static constexpr size_t DimW = 3;
+    /** Alias for dimension 4 also known as V dimension */
+    static constexpr size_t DimV = 4;
 
     /** Default constructor: create a window containing a single element. */
     constexpr Window()
@@ -82,10 +86,10 @@ public:
          * @param[in] step  Step between two elements of the dimension when iterating.
          *
          */
-        constexpr Dimension(int start = 0, int end = 1, int step = 1)
-            : _start(start), _end(end), _step(step)
+        constexpr Dimension(int start = 0, int end = 1, int step = 1) : _start(start), _end(end), _step(step)
         {
         }
+        Dimension(const Dimension &d) = default; /**< Default copy constructor */
         /** Default assignment operator to allow dimensions to be copied */
         Dimension &operator=(const Dimension &d) = default;
         /** Return the start of the dimension */
@@ -119,6 +123,17 @@ public:
         {
             _end = end;
         }
+        /** Check whether two Dimensions are equal.
+         *
+         * @param[in] lhs LHS Dimension
+         * @param[in] rhs RHS Dimension
+         *
+         * @return True if the Dimensions are the same.
+         */
+        friend bool operator==(const Dimension &lhs, const Dimension &rhs)
+        {
+            return (lhs._start == rhs._start) && (lhs._end == rhs._end) && (lhs._step == rhs._step);
+        }
 
     private:
         int _start; /**< Start of the dimension */
@@ -198,15 +213,17 @@ public:
      */
     void shift(size_t dimension, int shift_value);
 
-    /** Shift down all the dimensions of a window
+    /** Shift down all the dimensions of a window starting from the specified dimension.
      *
-     * i.e new_dims[n] = old_dims[n+shift_value].
+     * new_dims[i] = old_dims[i]             for all i < start_dim.
+     * new_dims[i] = old_dims[i+shift_value] for all start_dim <= i < Coordinates::num_max_dimensions - shift_value.
      *
      * @param[in] shift_value Number of dimensions to shift the window by.
+     * @param[in] start_dim   The dimension from which the dimensions start to shift.
      *
      * @return The window with the shifted dimensions.
      */
-    Window shift_dimensions(unsigned int shift_value) const;
+    Window shift_dimensions(unsigned int shift_value, unsigned int start_dim = 0) const;
 
     /** Adjust the start or end of a given dimension by the given value
      *
@@ -346,7 +363,6 @@ public:
     {
         return slide_window_slice<4>(slice);
     }
-
     /** Collapse the dimensions between @p first and @p last if possible.
      *
      * A dimension is collapsable if it starts from 0 and matches the corresponding dimension in the full_window
@@ -358,7 +374,8 @@ public:
      *
      * @return Collapsed window.
      */
-    Window collapse_if_possible(const Window &full_window, size_t first, size_t last, bool *has_collapsed = nullptr) const;
+    Window
+    collapse_if_possible(const Window &full_window, size_t first, size_t last, bool *has_collapsed = nullptr) const;
 
     /** Collapse the dimensions higher than @p first if possible.
      *
@@ -411,6 +428,14 @@ public:
      * @param[in] rhs Second window to swap.
      */
     friend void swap(Window &lhs, Window &rhs);
+    /** Check whether two Windows are equal.
+     *
+     * @param[in] lhs LHS window
+     * @param[in] rhs RHS window
+     *
+     * @return True if the given windows are the same.
+     */
+    friend bool operator==(const Window &lhs, const Window &rhs);
 
 private:
     /** First slice of the window
@@ -418,7 +443,7 @@ private:
      * @return The first slice of the window.
      */
     template <unsigned int window_dimension>
-    Window                 first_slice_window() const;
+    Window first_slice_window() const;
 
     /** Slide the passed window slice.
      *
@@ -437,4 +462,4 @@ private:
 };
 } // namespace arm_compute
 #include "Window.inl"
-#endif /*ARM_COMPUTE_WINDOW_H */
+#endif // ACL_ARM_COMPUTE_CORE_WINDOW_H
diff --git a/arm_compute/core/Window.inl b/arm_compute/core/Window.inl
index 70c4f80ac2..0f7c4fbdd7 100644
--- a/arm_compute/core/Window.inl
+++ b/arm_compute/core/Window.inl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020, 2022-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,12 +21,16 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
+
+#ifndef ACL_ARM_COMPUTE_CORE_WINDOW_INL
+#define ACL_ARM_COMPUTE_CORE_WINDOW_INL
+
 namespace arm_compute
 {
 inline Window::Window(const Window &src)
     : _dims(), _is_broadcasted(utility::generate_array<bool, Coordinates::num_max_dimensions, false>::value)
 {
-    for(size_t i = 0; i < Coordinates::num_max_dimensions; ++i)
+    for (size_t i = 0; i < Coordinates::num_max_dimensions; ++i)
     {
         set(i, src[i]);
         _is_broadcasted[i] = src.is_broadcasted(i);
@@ -65,32 +69,34 @@ inline bool Window::is_broadcasted(size_t dimension) const
     return _is_broadcasted[dimension];
 }
 
-inline Window Window::collapse_if_possible(const Window &full_window, const size_t first,
-                                           const size_t last, bool *has_collapsed) const
+inline Window Window::collapse_if_possible(const Window &full_window,
+                                           const size_t  first,
+                                           const size_t  last,
+                                           bool         *has_collapsed) const
 {
     Window collapsed(*this);
 
     bool is_collapsable = true;
     int  collapsed_end  = _dims[first].end();
 
-    for(size_t d = first + 1; is_collapsable && (d < last); ++d)
+    for (size_t d = first + 1; is_collapsable && (d < last); ++d)
     {
         // The _dims's dimension must match the full _dims dimension to be collapsable:
-        is_collapsable = (_dims[d].start() == 0) && (full_window[d].start() == 0) && (_dims[d].step() <= 1)
-                         && (full_window[d].end() == _dims[d].end());
+        is_collapsable = (_dims[d].start() == 0) && (full_window[d].start() == 0) && (_dims[d].step() <= 1) &&
+                         (full_window[d].end() == _dims[d].end());
         collapsed_end *= _dims[d].end();
     }
 
-    if(is_collapsable)
+    if (is_collapsable)
     {
         collapsed._dims.at(first).set_end(collapsed_end);
-        for(size_t d = first + 1; is_collapsable && (d < last); ++d)
+        for (size_t d = first + 1; is_collapsable && (d < last); ++d)
         {
             collapsed.set(d, Dimension());
         }
     }
 
-    if(has_collapsed != nullptr)
+    if (has_collapsed != nullptr)
     {
         *has_collapsed = is_collapsable;
     }
@@ -98,13 +104,21 @@ inline Window Window::collapse_if_possible(const Window &full_window, const size
     return collapsed;
 }
 
-inline Window Window::shift_dimensions(unsigned int shift_value) const
+inline Window Window::shift_dimensions(unsigned int shift_value, unsigned int start_dim) const
 {
     Window shifted_window;
-    for(size_t n = 0; n < (Coordinates::num_max_dimensions - shift_value); n++)
+    size_t n = 0;
+
+    for (; n < start_dim; ++n)
+    {
+        shifted_window.set(n, _dims[n]);
+    }
+
+    for (; n < (Coordinates::num_max_dimensions - shift_value); n++)
     {
         shifted_window.set(n, _dims[n + shift_value]);
     }
+
     return shifted_window;
 }
 
@@ -120,9 +134,9 @@ inline Window Window::collapse(const Window &full_window, const size_t first, co
 inline Window Window::broadcast_if_dimension_le_one(const TensorShape &shape) const
 {
     Window broadcastWin(*this);
-    for(size_t d = 0; d < TensorShape::num_max_dimensions; ++d)
+    for (size_t d = 0; d < TensorShape::num_max_dimensions; ++d)
     {
-        if(shape[d] <= 1)
+        if (shape[d] <= 1)
         {
             broadcastWin.set_broadcasted(d);
         }
@@ -142,7 +156,7 @@ inline void Window::adjust(size_t dimension, int adjust_value, bool is_at_start)
     ARM_COMPUTE_ERROR_ON(dimension >= Coordinates::num_max_dimensions);
     Window::Dimension &d = _dims[dimension];
 
-    if(is_at_start)
+    if (is_at_start)
     {
         d = Window::Dimension(d.start() + adjust_value, d.end(), d.step());
     }
@@ -172,7 +186,7 @@ inline void Window::set_dimension_step(size_t dimension, int step)
 
 inline void Window::validate() const
 {
-    for(size_t i = 0; i < Coordinates::num_max_dimensions; ++i)
+    for (size_t i = 0; i < Coordinates::num_max_dimensions; ++i)
     {
         ARM_COMPUTE_ERROR_ON(_dims[i].end() < _dims[i].start());
         ARM_COMPUTE_ERROR_ON((_dims[i].step() != 0) && (((_dims[i].end() - _dims[i].start()) % _dims[i].step()) != 0));
@@ -193,21 +207,21 @@ inline Window Window::split_window(size_t dimension, size_t id, size_t total) co
 
     Window out;
 
-    for(size_t d = 0; d < Coordinates::num_max_dimensions; ++d)
+    for (size_t d = 0; d < Coordinates::num_max_dimensions; ++d)
     {
-        if(d == dimension)
+        if (d == dimension)
         {
-            int start        = _dims[d].start();
-            int end          = _dims[d].end();
-            const int step   = _dims[d].step();
+            int       start = _dims[d].start();
+            int       end   = _dims[d].end();
+            const int step  = _dims[d].step();
 
             const int num_it = num_iterations(d);
             const int rem    = num_it % total;
-            int work         = num_it / total;
+            int       work   = num_it / total;
 
-            int it_start     = work * id;
+            int it_start = work * id;
 
-            if(int(id) < rem)
+            if (int(id) < rem)
             {
                 ++work;
                 it_start += id;
@@ -234,18 +248,18 @@ inline Window Window::split_window(size_t dimension, size_t id, size_t total) co
 template <unsigned int window_dimension>
 inline bool Window::slide_window_slice(Window &slice) const
 {
-    for(unsigned int n = window_dimension; n < Coordinates::num_max_dimensions; ++n)
+    for (unsigned int n = window_dimension; n < Coordinates::num_max_dimensions; ++n)
     {
         // Did we reach the end of this dimension?
         const int v = slice._dims[n].start() + 1;
 
-        if(v < _dims[n].end())
+        if (v < _dims[n].end())
         {
             // No: increment
             slice._dims[n] = Dimension(v, v + 1, 1);
 
             // Reset lower dimensions:
-            for(unsigned int lower = window_dimension; lower < n; ++lower)
+            for (unsigned int lower = window_dimension; lower < n; ++lower)
             {
                 slice._dims[lower] = Dimension(_dims[lower].start(), _dims[lower].start() + 1, 1);
             }
@@ -258,14 +272,14 @@ inline bool Window::slide_window_slice(Window &slice) const
 }
 
 template <unsigned int window_dimension>
-inline Window          Window::first_slice_window() const
+inline Window Window::first_slice_window() const
 {
     Window slice;
 
     std::copy_n(_dims.begin(), window_dimension, slice._dims.begin());
 
     //Initialise higher dimensions to be the first slice.
-    for(unsigned int n = window_dimension; n < Coordinates::num_max_dimensions; ++n)
+    for (unsigned int n = window_dimension; n < Coordinates::num_max_dimensions; ++n)
     {
         slice._dims[n] = Dimension(_dims[n].start(), _dims[n].start() + 1, 1);
     }
@@ -275,16 +289,16 @@ inline Window          Window::first_slice_window() const
 
 inline void Window::use_tensor_dimensions(const TensorShape &shape, size_t first_dimension)
 {
-    for(unsigned int n = first_dimension; n < shape.num_dimensions(); ++n)
+    for (unsigned int n = first_dimension; n < shape.num_dimensions(); ++n)
     {
-        set(n, Window::Dimension(0, std::max(shape[n], static_cast<uint32_t>(1))));
+        set(n, Window::Dimension(0, std::max(shape[n], static_cast<size_t>(1))));
     }
 }
 
 inline TensorShape Window::shape() const
 {
     TensorShape shape;
-    for(size_t d = 0; d < TensorShape::num_max_dimensions; ++d)
+    for (size_t d = 0; d < TensorShape::num_max_dimensions; ++d)
     {
         shape.set(d, (_dims[d].end() - _dims[d].start()) / _dims[d].step());
     }
@@ -294,7 +308,7 @@ inline TensorShape Window::shape() const
 inline size_t Window::num_iterations_total() const
 {
     size_t total = 1;
-    for(size_t d = 0; d < Coordinates::num_max_dimensions; ++d)
+    for (size_t d = 0; d < Coordinates::num_max_dimensions; ++d)
     {
         total *= num_iterations(d);
     }
@@ -305,4 +319,11 @@ inline void swap(Window &lhs, Window &rhs)
 {
     lhs._dims.swap(rhs._dims);
 }
+
+inline bool operator==(const Window &lhs, const Window &rhs)
+{
+    return (lhs._dims == rhs._dims) && (lhs._is_broadcasted == rhs._is_broadcasted);
+}
 } // namespace arm_compute
+
+#endif // ACL_ARM_COMPUTE_CORE_WINDOW_INL
diff --git a/arm_compute/core/WindowIterator.h b/arm_compute/core/WindowIterator.h
index e7d5334fa0..29302c410a 100644
--- a/arm_compute/core/WindowIterator.h
+++ b/arm_compute/core/WindowIterator.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -28,10 +28,6 @@
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/Window.h"
 
-//FIXME: Delete the "PRINTF" before the release. In the meantime it's probably going to be useful to debug
-//#define PRINTF printf
-#define PRINTF(...)
-
 namespace arm_compute
 {
 /** Convert an offset in window steps into absolute coordinates.
@@ -44,7 +40,7 @@ namespace arm_compute
 inline Coordinates convert_window_coord_to_position(const Window &w, const Coordinates &offset)
 {
     Coordinates position;
-    for(unsigned int i = 0; i < Coordinates::num_max_dimensions; ++i)
+    for (unsigned int i = 0; i < Coordinates::num_max_dimensions; ++i)
     {
         position.set(i, w[i].start() + offset[i] * w[i].step());
     }
@@ -168,16 +164,14 @@ public:
     template <typename M>
     void iterate_3D(M &&on_new_row_size)
     {
-        while(_end.z() != _position.z())
+        while (_end.z() != _position.z())
         {
-            PRINTF("New slice %d\n", _position.z());
             iterate_2D_internal(on_new_row_size, _w.x().end() - _w.x().step(), _w.y().end() - _w.y().step());
             _position[2] += _w.z().step();
             _position[1] = _w.y().start();
             _position[0] = _w.x().start();
         }
         // Left over:
-        PRINTF("Left over slice\n");
         iterate_2D(on_new_row_size);
     }
 
@@ -217,29 +211,25 @@ private:
     void iterate_2D_internal(M &&on_new_row_size, int end_x, int end_y)
     {
         //Is there more than one row to process ?
-        if(end_y == _position.y())
+        if (end_y == _position.y())
         {
-            // Single row:
-            PRINTF("Partial row only\n");
             // Both start and end belong to the same row:
             iterate_over_dim0(end_x + _w.x().step(), on_new_row_size);
         }
         else
         {
             // Do we start from the beginning of the row ?
-            if(_w.x().start() != _position.x())
+            if (_w.x().start() != _position.x())
             {
                 //Start in the middle of a row: process left-over X
-                PRINTF("Partial row first\n");
                 iterate_over_dim0(_w.x().end(), on_new_row_size);
                 _position[1] += _w.y().step();
             }
 
             //Middle rows
             bool no_leftover = end_x + _w.x().step() == _w.x().end();
-            if(no_leftover)
+            if (no_leftover)
             {
-                PRINTF("no left over\n");
                 //Switch to full row size:
                 on_new_row_size(_w[0].start(), _w.x().end());
                 // Shouldn't be possible to reach that point and not have at least one entire row to process
@@ -249,17 +239,14 @@ private:
             }
             else
             {
-                PRINTF("with left over\n");
                 // Are there full rows to process ?
-                if(_position[1] != end_y)
+                if (_position[1] != end_y)
                 {
-                    PRINTF("full rows\n");
                     //Switch to full row size:
                     on_new_row_size(_w[0].start(), _w.x().end());
                     iterate_over_dim1(end_y);
                 }
 
-                PRINTF("Final leftover\n");
                 //Leftover end x
                 _position[0] = _w.x().start();
                 iterate_over_dim0(end_x + _w.x().step(), on_new_row_size);
@@ -273,7 +260,7 @@ private:
      */
     void iterate_over_dim1(int end)
     {
-        for(; _position[1] != end; _position[1] += _w[1].step())
+        for (; _position[1] != end; _position[1] += _w[1].step())
         {
             _position[0] = _w[0].start();
             iterate_over_dim0(_w[0].end());
@@ -298,10 +285,9 @@ private:
      */
     void iterate_over_dim0(int end)
     {
-        PRINTF("X [%d, %d, %d]\n", _position.x(), end, _w[0].step());
         // Both start and end belong to the same row:
         ARM_COMPUTE_ERROR_ON(_position[0] > end);
-        for(; _position.x() < end; _position[0] += _w[0].step())
+        for (; _position.x() < end; _position[0] += _w[0].step())
         {
             _lambda_function(_position);
         }
@@ -323,9 +309,10 @@ private:
  * @return A WindowIterator object.
  */
 template <typename L>
-WindowIterator<L> create_window_iterator(const Window &w, const Coordinates &start, const Coordinates &end, L &&lambda_function)
+WindowIterator<L>
+create_window_iterator(const Window &w, const Coordinates &start, const Coordinates &end, L &&lambda_function)
 {
     return WindowIterator<L>(w, start, end, std::move(lambda_function));
 }
-}
+} // namespace arm_compute
 #endif /*ARM_COMPUTE_WINDOW_ITERATOR_H*/
diff --git a/arm_compute/core/experimental/Types.h b/arm_compute/core/experimental/Types.h
new file mode 100644
index 0000000000..63a3a1a1ec
--- /dev/null
+++ b/arm_compute/core/experimental/Types.h
@@ -0,0 +1,126 @@
+/*
+ * Copyright (c) 2020-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_ARM_COMPUTE_CORE_EXPERIMENTAL_TYPES_H
+#define ACL_ARM_COMPUTE_CORE_EXPERIMENTAL_TYPES_H
+
+#include "arm_compute/core/ITensorPack.h"
+#include "arm_compute/core/TensorShape.h"
+
+#include <vector>
+
+namespace arm_compute
+{
+// Forward declaration
+class ITensor;
+
+/** Tensor type: identifies a tensor slot as source (SRC), destination (DST) or auxiliary/intermediate (INT) */
+enum TensorType : int32_t
+{
+    ACL_UNKNOWN = -1,
+    ACL_SRC_DST = 0,
+
+    // Src
+    ACL_SRC     = 0,
+    ACL_SRC_0   = 0,
+    ACL_SRC_1   = 1,
+    ACL_SRC_2   = 2,
+    ACL_SRC_3   = 3,
+    ACL_SRC_4   = 4,
+    ACL_SRC_5   = 5,
+    ACL_SRC_6   = 6,
+    ACL_SRC_END = 6,
+
+    // Dst
+    ACL_DST     = 30,
+    ACL_DST_0   = 30,
+    ACL_DST_1   = 31,
+    ACL_DST_2   = 32,
+    ACL_DST_END = 32,
+
+    // Aux
+    ACL_INT     = 50,
+    ACL_INT_0   = 50,
+    ACL_INT_1   = 51,
+    ACL_INT_2   = 52,
+    ACL_INT_3   = 53,
+    ACL_INT_4   = 54,
+    ACL_SRC_VEC = 256,
+    ACL_DST_VEC = 512,
+    ACL_INT_VEC = 1024,
+
+    // Aliasing Types
+    // Conv etc
+    ACL_BIAS = ACL_SRC_2,
+
+    // Gemm
+    ACL_VEC_ROW_SUM = ACL_SRC_3,
+    ACL_VEC_COL_SUM = ACL_SRC_4,
+    ACL_SHIFTS      = ACL_SRC_5,
+    ACL_MULTIPLIERS = ACL_SRC_6,
+};
+
+namespace experimental
+{
+enum class MemoryLifetime
+{
+    Temporary  = 0,
+    Persistent = 1,
+    Prepare    = 2,
+};
+struct MemoryInfo
+{
+    MemoryInfo() = default;
+
+    MemoryInfo(int slot, size_t size, size_t alignment = 0) noexcept : slot(slot), size(size), alignment(alignment)
+    {
+    }
+
+    MemoryInfo(int slot, MemoryLifetime lifetime, size_t size, size_t alignment = 0) noexcept
+        : slot(slot), lifetime(lifetime), size(size), alignment(alignment)
+    {
+    }
+
+    bool merge(int slot, size_t new_size, size_t new_alignment = 0) noexcept
+    {
+        if (slot != this->slot)
+        {
+            return false;
+        }
+
+        size      = std::max(size, new_size);
+        alignment = std::max(alignment, new_alignment);
+
+        return true;
+    }
+
+    int            slot{ACL_UNKNOWN};
+    MemoryLifetime lifetime{MemoryLifetime::Temporary};
+    size_t         size{0};
+    size_t         alignment{64};
+};
+
+using MemoryRequirements = std::vector<MemoryInfo>;
+} // namespace experimental
+} // namespace arm_compute
+#endif // ACL_ARM_COMPUTE_CORE_EXPERIMENTAL_TYPES_H
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/qmov.h b/arm_compute/core/utils/ActivationFunctionUtils.h
index bb64bef1e9..c988efa256 100644
--- a/arm_compute/core/NEON/wrapper/intrinsics/qmov.h
+++ b/arm_compute/core/utils/ActivationFunctionUtils.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020 ARM Limited.
+ * Copyright (c) 2016-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,29 +21,21 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_WRAPPER_QMOV_H
-#define ARM_COMPUTE_WRAPPER_QMOV_H
+#ifndef ARM_COMPUTE_CORE_UTILS_ACTIVATIONFUNCTIONUTILS_H
+#define ARM_COMPUTE_CORE_UTILS_ACTIVATIONFUNCTIONUTILS_H
 
-#include <arm_neon.h>
+#include "arm_compute/core/Types.h"
 
-namespace arm_compute
-{
-namespace wrapper
-{
-template <typename T>
-inline typename std::enable_if<std::is_same<T, uint8_t>::value, uint8x8_t>::type
-vqmov(const int16x8_t &a)
-{
-    return vqmovun_s16(a);
-}
+#include <string>
 
-template <typename T>
-inline typename std::enable_if<std::is_same<T, int8_t>::value, int8x8_t>::type
-vqmov(const int16x8_t &a)
+namespace arm_compute
 {
-    return vqmovn_s16(a);
-}
-
-} // namespace wrapper
+/** Translates a given activation function to a string.
+ *
+ * @param[in] act @ref ActivationLayerInfo::ActivationFunction to be translated to string.
+ *
+ * @return The string describing the activation function.
+ */
+const std::string &string_from_activation_func(const ActivationFunction &act);
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_WRAPPER_QMOV_H */
+#endif /*ARM_COMPUTE_CORE_UTILS_ACTIVATIONFUNCTIONUTILS_H */
diff --git a/arm_compute/core/NEON/INEKernel.h b/arm_compute/core/utils/DataLayoutUtils.h
index c09972353c..61839c9f91 100644
--- a/arm_compute/core/NEON/INEKernel.h
+++ b/arm_compute/core/utils/DataLayoutUtils.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2016-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,14 +21,20 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_INEKERNEL_H
-#define ARM_COMPUTE_INEKERNEL_H
+#ifndef ARM_COMPUTE_CORE_UTILS_DATALAYOUTUTILS_H
+#define ARM_COMPUTE_CORE_UTILS_DATALAYOUTUTILS_H
+#include "arm_compute/core/Types.h"
 
-#include "arm_compute/core/CPP/ICPPKernel.h"
+#include <string>
 
 namespace arm_compute
 {
-/** Common interface for all kernels implemented in NEON. */
-using INEKernel = ICPPKernel;
+/** Convert a data layout identity into a string.
+ *
+ * @param[in] dl @ref DataLayout to be translated to string.
+ *
+ * @return The string describing the data layout.
+ */
+const std::string &string_from_data_layout(DataLayout dl);
 } // namespace arm_compute
-#endif /*ARM_COMPUTE_INEKERNEL_H */
+#endif /*ARM_COMPUTE_CORE_UTILS_DATALAYOUTUTILS_H */
diff --git a/arm_compute/core/utils/DataTypeUtils.h b/arm_compute/core/utils/DataTypeUtils.h
new file mode 100644
index 0000000000..6fabb19b64
--- /dev/null
+++ b/arm_compute/core/utils/DataTypeUtils.h
@@ -0,0 +1,549 @@
+/*
+ * Copyright (c) 2016-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_ARM_COMPUTE_CORE_UTILS_DATATYPEUTILS_H
+#define ACL_ARM_COMPUTE_CORE_UTILS_DATATYPEUTILS_H
+
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+/** The size in bytes of the data type
+ *
+ * @param[in] data_type Input data type
+ *
+ * @return The size in bytes of the data type
+ */
+inline size_t data_size_from_type(DataType data_type)
+{
+    switch (data_type)
+    {
+        case DataType::U8:
+        case DataType::S8:
+        case DataType::QSYMM8:
+        case DataType::QASYMM8:
+        case DataType::QASYMM8_SIGNED:
+        case DataType::QSYMM8_PER_CHANNEL:
+            return 1;
+        case DataType::U16:
+        case DataType::S16:
+        case DataType::QSYMM16:
+        case DataType::QASYMM16:
+        case DataType::BFLOAT16:
+        case DataType::F16:
+            return 2;
+        case DataType::F32:
+        case DataType::U32:
+        case DataType::S32:
+            return 4;
+        case DataType::F64:
+        case DataType::U64:
+        case DataType::S64:
+            return 8;
+        case DataType::SIZET:
+            return sizeof(size_t);
+        default:
+            ARM_COMPUTE_ERROR("Invalid data type");
+            return 0;
+    }
+}
+
+/** The size in bytes of the data type
+ *
+ * @param[in] dt Input data type
+ *
+ * @return The size in bytes of the data type
+ */
+inline size_t element_size_from_data_type(DataType dt)
+{
+    switch (dt)
+    {
+        case DataType::S8:
+        case DataType::U8:
+        case DataType::QSYMM8:
+        case DataType::QASYMM8:
+        case DataType::QASYMM8_SIGNED:
+        case DataType::QSYMM8_PER_CHANNEL:
+            return 1;
+        case DataType::U16:
+        case DataType::S16:
+        case DataType::QSYMM16:
+        case DataType::QASYMM16:
+        case DataType::BFLOAT16:
+        case DataType::F16:
+            return 2;
+        case DataType::U32:
+        case DataType::S32:
+        case DataType::F32:
+            return 4;
+        case DataType::U64:
+        case DataType::S64:
+            return 8;
+        default:
+            ARM_COMPUTE_ERROR("Undefined element size for given data type");
+            return 0;
+    }
+}
+
+/** Return the data type used by a given single-planar pixel format
+ *
+ * @param[in] format Input format
+ *
+ * @return The data type used by the given pixel format
+ */
+inline DataType data_type_from_format(Format format)
+{
+    switch (format)
+    {
+        case Format::U8:
+        case Format::UV88:
+        case Format::RGB888:
+        case Format::RGBA8888:
+        case Format::YUYV422:
+        case Format::UYVY422:
+            return DataType::U8;
+        case Format::U16:
+            return DataType::U16;
+        case Format::S16:
+            return DataType::S16;
+        case Format::U32:
+            return DataType::U32;
+        case Format::S32:
+            return DataType::S32;
+        case Format::BFLOAT16:
+            return DataType::BFLOAT16;
+        case Format::F16:
+            return DataType::F16;
+        case Format::F32:
+            return DataType::F32;
+        //Doesn't make sense for planar formats:
+        case Format::NV12:
+        case Format::NV21:
+        case Format::IYUV:
+        case Format::YUV444:
+        default:
+            ARM_COMPUTE_ERROR("Not supported data_type for given format");
+            return DataType::UNKNOWN;
+    }
+}
+
+/** Return the promoted data type of a given data type.
+ *
+ * @note If promoted data type is not supported an error will be thrown
+ *
+ * @param[in] dt Data type to get the promoted type of.
+ *
+ * @return Promoted data type
+ */
+inline DataType get_promoted_data_type(DataType dt)
+{
+    switch (dt)
+    {
+        case DataType::U8:
+            return DataType::U16;
+        case DataType::S8:
+            return DataType::S16;
+        case DataType::U16:
+            return DataType::U32;
+        case DataType::S16:
+            return DataType::S32;
+        case DataType::QSYMM8:
+        case DataType::QASYMM8:
+        case DataType::QASYMM8_SIGNED:
+        case DataType::QSYMM8_PER_CHANNEL:
+        case DataType::QSYMM16:
+        case DataType::QASYMM16:
+        case DataType::BFLOAT16:
+        case DataType::F16:
+        case DataType::U32:
+        case DataType::S32:
+        case DataType::F32:
+            ARM_COMPUTE_ERROR("Unsupported data type promotions!");
+        default:
+            ARM_COMPUTE_ERROR("Undefined data type!");
+    }
+    return DataType::UNKNOWN;
+}
+
+/** Compute the minimum and maximum values a data type can take
+ *
+ * @param[in] dt Data type to get the min/max bounds of
+ *
+ * @return A tuple (min,max) with the minimum and maximum values respectively wrapped in PixelValue.
+ */
+inline std::tuple<PixelValue, PixelValue> get_min_max(DataType dt)
+{
+    PixelValue min{};
+    PixelValue max{};
+    switch (dt)
+    {
+        case DataType::U8:
+        case DataType::QASYMM8:
+        {
+            min = PixelValue(static_cast<int32_t>(std::numeric_limits<uint8_t>::lowest()));
+            max = PixelValue(static_cast<int32_t>(std::numeric_limits<uint8_t>::max()));
+            break;
+        }
+        case DataType::S8:
+        case DataType::QSYMM8:
+        case DataType::QASYMM8_SIGNED:
+        case DataType::QSYMM8_PER_CHANNEL:
+        {
+            min = PixelValue(static_cast<int32_t>(std::numeric_limits<int8_t>::lowest()));
+            max = PixelValue(static_cast<int32_t>(std::numeric_limits<int8_t>::max()));
+            break;
+        }
+        case DataType::U16:
+        case DataType::QASYMM16:
+        {
+            min = PixelValue(static_cast<int32_t>(std::numeric_limits<uint16_t>::lowest()));
+            max = PixelValue(static_cast<int32_t>(std::numeric_limits<uint16_t>::max()));
+            break;
+        }
+        case DataType::S16:
+        case DataType::QSYMM16:
+        {
+            min = PixelValue(static_cast<int32_t>(std::numeric_limits<int16_t>::lowest()));
+            max = PixelValue(static_cast<int32_t>(std::numeric_limits<int16_t>::max()));
+            break;
+        }
+        case DataType::U32:
+        {
+            min = PixelValue(std::numeric_limits<uint32_t>::lowest());
+            max = PixelValue(std::numeric_limits<uint32_t>::max());
+            break;
+        }
+        case DataType::S32:
+        {
+            min = PixelValue(std::numeric_limits<int32_t>::lowest());
+            max = PixelValue(std::numeric_limits<int32_t>::max());
+            break;
+        }
+        case DataType::BFLOAT16:
+        {
+            min = PixelValue(bfloat16::lowest());
+            max = PixelValue(bfloat16::max());
+            break;
+        }
+        case DataType::F16:
+        {
+            min = PixelValue(std::numeric_limits<half>::lowest());
+            max = PixelValue(std::numeric_limits<half>::max());
+            break;
+        }
+        case DataType::F32:
+        {
+            min = PixelValue(std::numeric_limits<float>::lowest());
+            max = PixelValue(std::numeric_limits<float>::max());
+            break;
+        }
+        default:
+            ARM_COMPUTE_ERROR("Undefined data type!");
+    }
+    return std::make_tuple(min, max);
+}
+
+/** Convert a data type identity into a string.
+ *
+ * @param[in] dt @ref DataType to be translated to string.
+ *
+ * @return The string describing the data type.
+ */
+const std::string &string_from_data_type(DataType dt);
+
+/** Convert a string to DataType
+ *
+ * @param[in] name The name of the data type
+ *
+ * @return DataType
+ */
+DataType data_type_from_name(const std::string &name);
+
+/** Input Stream operator for @ref DataType
+ *
+ * @param[in]  stream    Stream to parse
+ * @param[out] data_type Output data type
+ *
+ * @return Updated stream
+ */
+inline ::std::istream &operator>>(::std::istream &stream, DataType &data_type)
+{
+    std::string value;
+    stream >> value;
+    data_type = data_type_from_name(value);
+    return stream;
+}
+
+/** Check if a given data type is of floating point type
+ *
+ * @param[in] dt Input data type.
+ *
+ * @return True if data type is of floating point type, else false.
+ */
+inline bool is_data_type_float(DataType dt)
+{
+    switch (dt)
+    {
+        case DataType::F16:
+        case DataType::F32:
+            return true;
+        default:
+            return false;
+    }
+}
+
+/** Check if a given data type is of quantized type
+ *
+ * @note Quantized is considered a super-set of fixed-point and asymmetric data types.
+ *
+ * @param[in] dt Input data type.
+ *
+ * @return True if data type is of quantized type, else false.
+ */
+inline bool is_data_type_quantized(DataType dt)
+{
+    switch (dt)
+    {
+        case DataType::QSYMM8:
+        case DataType::QASYMM8:
+        case DataType::QASYMM8_SIGNED:
+        case DataType::QSYMM8_PER_CHANNEL:
+        case DataType::QSYMM16:
+        case DataType::QASYMM16:
+            return true;
+        default:
+            return false;
+    }
+}
+
+/** Check if a given data type is of asymmetric quantized type
+ *
+ * @param[in] dt Input data type.
+ *
+ * @return True if data type is of asymmetric quantized type, else false.
+ */
+inline bool is_data_type_quantized_asymmetric(DataType dt)
+{
+    switch (dt)
+    {
+        case DataType::QASYMM8:
+        case DataType::QASYMM8_SIGNED:
+        case DataType::QASYMM16:
+            return true;
+        default:
+            return false;
+    }
+}
+
+/** Check if a given data type is of asymmetric quantized signed type
+ *
+ * @param[in] dt Input data type.
+ *
+ * @return True if data type is of asymmetric quantized signed type, else false.
+ */
+inline bool is_data_type_quantized_asymmetric_signed(DataType dt)
+{
+    switch (dt)
+    {
+        case DataType::QASYMM8_SIGNED:
+            return true;
+        default:
+            return false;
+    }
+}
+
+/** Check if a given data type is of 8-bit asymmetric quantized type (signed or unsigned)
+ *
+ * @param[in] dt Input data type.
+ *
+ * @return True if data type is of 8-bit asymmetric quantized type (QASYMM8 or QASYMM8_SIGNED), else false.
+ */
+inline bool is_data_type_quantized_asymmetric_char(DataType dt)
+{
+    switch (dt)
+    {
+        case DataType::QASYMM8_SIGNED:
+        case DataType::QASYMM8:
+            return true;
+        default:
+            return false;
+    }
+}
+
+/** Check if a given data type is of symmetric quantized type
+ *
+ * @param[in] dt Input data type.
+ *
+ * @return True if data type is of symmetric quantized type, else false.
+ */
+inline bool is_data_type_quantized_symmetric(DataType dt)
+{
+    switch (dt)
+    {
+        case DataType::QSYMM8:
+        case DataType::QSYMM8_PER_CHANNEL:
+        case DataType::QSYMM16:
+            return true;
+        default:
+            return false;
+    }
+}
+
+/** Check if a given data type is of per channel type
+ *
+ * @param[in] dt Input data type.
+ *
+ * @return True if data type is of per channel type, else false.
+ */
+inline bool is_data_type_quantized_per_channel(DataType dt)
+{
+    switch (dt)
+    {
+        case DataType::QSYMM8_PER_CHANNEL:
+            return true;
+        default:
+            return false;
+    }
+}
+
+/** Returns true if the value can be represented by the given data type
+ *
+ * @param[in] val   value to be checked
+ * @param[in] dt    data type that is checked
+ * @param[in] qinfo (Optional) quantization info if the data type is QASYMM8
+ *
+ * @return true if the data type can hold the value.
+ */
+template <typename T>
+bool check_value_range(T val, DataType dt, QuantizationInfo qinfo = QuantizationInfo())
+{
+    switch (dt)
+    {
+        case DataType::U8:
+        {
+            const auto val_u8 = static_cast<uint8_t>(val);
+            return ((val_u8 == val) && val >= std::numeric_limits<uint8_t>::lowest() &&
+                    val <= std::numeric_limits<uint8_t>::max());
+        }
+        case DataType::QASYMM8:
+        {
+            double min = static_cast<double>(dequantize_qasymm8(0, qinfo));
+            double max = static_cast<double>(dequantize_qasymm8(std::numeric_limits<uint8_t>::max(), qinfo));
+            return ((double)val >= min && (double)val <= max);
+        }
+        case DataType::S8:
+        {
+            const auto val_s8 = static_cast<int8_t>(val);
+            return ((val_s8 == val) && val >= std::numeric_limits<int8_t>::lowest() &&
+                    val <= std::numeric_limits<int8_t>::max());
+        }
+        case DataType::U16:
+        {
+            const auto val_u16 = static_cast<uint16_t>(val);
+            return ((val_u16 == val) && val >= std::numeric_limits<uint16_t>::lowest() &&
+                    val <= std::numeric_limits<uint16_t>::max());
+        }
+        case DataType::S16:
+        {
+            const auto val_s16 = static_cast<int16_t>(val);
+            return ((val_s16 == val) && val >= std::numeric_limits<int16_t>::lowest() &&
+                    val <= std::numeric_limits<int16_t>::max());
+        }
+        case DataType::U32:
+        {
+            const auto val_d64 = static_cast<double>(val);
+            const auto val_u32 = static_cast<uint32_t>(val);
+            return ((val_u32 == val_d64) && val_d64 >= std::numeric_limits<uint32_t>::lowest() &&
+                    val_d64 <= std::numeric_limits<uint32_t>::max());
+        }
+        case DataType::S32:
+        {
+            const auto val_d64 = static_cast<double>(val);
+            const auto val_s32 = static_cast<int32_t>(val);
+            return ((val_s32 == val_d64) && val_d64 >= std::numeric_limits<int32_t>::lowest() &&
+                    val_d64 <= std::numeric_limits<int32_t>::max());
+        }
+        case DataType::BFLOAT16:
+            return (val >= bfloat16::lowest() && val <= bfloat16::max());
+        case DataType::F16:
+            return (val >= std::numeric_limits<half>::lowest() && val <= std::numeric_limits<half>::max());
+        case DataType::F32:
+            return (val >= std::numeric_limits<float>::lowest() && val <= std::numeric_limits<float>::max());
+        default:
+            ARM_COMPUTE_ERROR("Data type not supported");
+            return false;
+    }
+}
+
+/** Returns the suffix string of CPU kernel implementation names based on the given data type
+ *
+ * @param[in] data_type The data type the CPU kernel implementation uses
+ *
+ * @return the suffix string of CPU kernel implementations
+ */
+inline std::string cpu_impl_dt(const DataType &data_type)
+{
+    std::string ret = "";
+
+    switch (data_type)
+    {
+        case DataType::F32:
+            ret = "fp32";
+            break;
+        case DataType::F16:
+            ret = "fp16";
+            break;
+        case DataType::U8:
+            ret = "u8";
+            break;
+        case DataType::S16:
+            ret = "s16";
+            break;
+        case DataType::S32:
+            ret = "s32";
+            break;
+        case DataType::QASYMM8:
+            ret = "qu8";
+            break;
+        case DataType::QASYMM8_SIGNED:
+            ret = "qs8";
+            break;
+        case DataType::QSYMM16:
+            ret = "qs16";
+            break;
+        case DataType::QSYMM8_PER_CHANNEL:
+            ret = "qp8";
+            break;
+        case DataType::BFLOAT16:
+            ret = "bf16";
+            break;
+        default:
+            ARM_COMPUTE_ERROR("Unsupported.");
+    }
+
+    return ret;
+}
+
+} // namespace arm_compute
+#endif // ACL_ARM_COMPUTE_CORE_UTILS_DATATYPEUTILS_H
diff --git a/arm_compute/core/utils/FormatUtils.h b/arm_compute/core/utils/FormatUtils.h
new file mode 100644
index 0000000000..a8e96bd361
--- /dev/null
+++ b/arm_compute/core/utils/FormatUtils.h
@@ -0,0 +1,344 @@
+/*
+ * Copyright (c) 2016-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CORE_UTILS_FORMATUTILS_H
+#define ARM_COMPUTE_CORE_UTILS_FORMATUTILS_H
+
+#include "arm_compute/core/CoreTypes.h"
+#include "arm_compute/core/Error.h"
+
+namespace arm_compute
+{
+/** The size in bytes of the pixel format
+ *
+ * @param[in] format Input format
+ *
+ * @return The size in bytes of the pixel format
+ */
+inline size_t pixel_size_from_format(Format format)
+{
+    switch (format)
+    {
+        case Format::U8:
+            return 1;
+        case Format::U16:
+        case Format::S16:
+        case Format::BFLOAT16:
+        case Format::F16:
+        case Format::UV88:
+        case Format::YUYV422:
+        case Format::UYVY422:
+            return 2;
+        case Format::RGB888:
+            return 3;
+        case Format::RGBA8888:
+            return 4;
+        case Format::U32:
+        case Format::S32:
+        case Format::F32:
+            return 4;
+        //Doesn't make sense for planar formats:
+        case Format::NV12:
+        case Format::NV21:
+        case Format::IYUV:
+        case Format::YUV444:
+        default:
+            ARM_COMPUTE_ERROR("Undefined pixel size for given format");
+            return 0;
+    }
+}
+
+/** Return the plane index of a given channel given an input format.
+ *
+ * @param[in] format  Input format
+ * @param[in] channel Input channel
+ *
+ * @return The plane index of the specific channel of the specific format
+ */
+inline int plane_idx_from_channel(Format format, Channel channel)
+{
+    switch (format)
+    {
+        // Single planar formats have a single plane
+        case Format::U8:
+        case Format::U16:
+        case Format::S16:
+        case Format::U32:
+        case Format::S32:
+        case Format::BFLOAT16:
+        case Format::F16:
+        case Format::F32:
+        case Format::UV88:
+        case Format::RGB888:
+        case Format::RGBA8888:
+        case Format::YUYV422:
+        case Format::UYVY422:
+            return 0;
+        // Multi planar formats
+        case Format::NV12:
+        case Format::NV21:
+        {
+            // Channel U and V share the same plane of format UV88
+            switch (channel)
+            {
+                case Channel::Y:
+                    return 0;
+                case Channel::U:
+                case Channel::V:
+                    return 1;
+                default:
+                    ARM_COMPUTE_ERROR("Not supported channel");
+                    return 0;
+            }
+        }
+        case Format::IYUV:
+        case Format::YUV444:
+        {
+            switch (channel)
+            {
+                case Channel::Y:
+                    return 0;
+                case Channel::U:
+                    return 1;
+                case Channel::V:
+                    return 2;
+                default:
+                    ARM_COMPUTE_ERROR("Not supported channel");
+                    return 0;
+            }
+        }
+        default:
+            ARM_COMPUTE_ERROR("Not supported format");
+            return 0;
+    }
+}
+
+/** Return the channel index of a given channel given an input format.
+ *
+ * @param[in] format  Input format
+ * @param[in] channel Input channel
+ *
+ * @return The channel index of the specific channel of the specific format
+ */
+inline int channel_idx_from_format(Format format, Channel channel)
+{
+    switch (format)
+    {
+        case Format::RGB888:
+        {
+            switch (channel)
+            {
+                case Channel::R:
+                    return 0;
+                case Channel::G:
+                    return 1;
+                case Channel::B:
+                    return 2;
+                default:
+                    ARM_COMPUTE_ERROR("Not supported channel");
+                    return 0;
+            }
+        }
+        case Format::RGBA8888:
+        {
+            switch (channel)
+            {
+                case Channel::R:
+                    return 0;
+                case Channel::G:
+                    return 1;
+                case Channel::B:
+                    return 2;
+                case Channel::A:
+                    return 3;
+                default:
+                    ARM_COMPUTE_ERROR("Not supported channel");
+                    return 0;
+            }
+        }
+        case Format::YUYV422:
+        {
+            switch (channel)
+            {
+                case Channel::Y:
+                    return 0;
+                case Channel::U:
+                    return 1;
+                case Channel::V:
+                    return 3;
+                default:
+                    ARM_COMPUTE_ERROR("Not supported channel");
+                    return 0;
+            }
+        }
+        case Format::UYVY422:
+        {
+            switch (channel)
+            {
+                case Channel::Y:
+                    return 1;
+                case Channel::U:
+                    return 0;
+                case Channel::V:
+                    return 2;
+                default:
+                    ARM_COMPUTE_ERROR("Not supported channel");
+                    return 0;
+            }
+        }
+        case Format::NV12:
+        {
+            switch (channel)
+            {
+                case Channel::Y:
+                    return 0;
+                case Channel::U:
+                    return 0;
+                case Channel::V:
+                    return 1;
+                default:
+                    ARM_COMPUTE_ERROR("Not supported channel");
+                    return 0;
+            }
+        }
+        case Format::NV21:
+        {
+            switch (channel)
+            {
+                case Channel::Y:
+                    return 0;
+                case Channel::U:
+                    return 1;
+                case Channel::V:
+                    return 0;
+                default:
+                    ARM_COMPUTE_ERROR("Not supported channel");
+                    return 0;
+            }
+        }
+        case Format::YUV444:
+        case Format::IYUV:
+        {
+            switch (channel)
+            {
+                case Channel::Y:
+                    return 0;
+                case Channel::U:
+                    return 0;
+                case Channel::V:
+                    return 0;
+                default:
+                    ARM_COMPUTE_ERROR("Not supported channel");
+                    return 0;
+            }
+        }
+        default:
+            ARM_COMPUTE_ERROR("Not supported format");
+            return 0;
+    }
+}
+
+/** Return the number of planes for a given format
+ *
+ * @param[in] format Input format
+ *
+ * @return The number of planes for a given image format.
+ */
+inline size_t num_planes_from_format(Format format)
+{
+    switch (format)
+    {
+        case Format::U8:
+        case Format::S16:
+        case Format::U16:
+        case Format::S32:
+        case Format::U32:
+        case Format::BFLOAT16:
+        case Format::F16:
+        case Format::F32:
+        case Format::RGB888:
+        case Format::RGBA8888:
+        case Format::YUYV422:
+        case Format::UYVY422:
+            return 1;
+        case Format::NV12:
+        case Format::NV21:
+            return 2;
+        case Format::IYUV:
+        case Format::YUV444:
+            return 3;
+        default:
+            ARM_COMPUTE_ERROR("Not supported format");
+            return 0;
+    }
+}
+
+/** Return the number of channels for a given single-planar pixel format
+ *
+ * @param[in] format Input format
+ *
+ * @return The number of channels for a given image format.
+ */
+inline size_t num_channels_from_format(Format format)
+{
+    switch (format)
+    {
+        case Format::U8:
+        case Format::U16:
+        case Format::S16:
+        case Format::U32:
+        case Format::S32:
+        case Format::BFLOAT16:
+        case Format::F16:
+        case Format::F32:
+            return 1;
+        // Because the U and V channels are subsampled
+        // these formats appear like having only 2 channels:
+        case Format::YUYV422:
+        case Format::UYVY422:
+            return 2;
+        case Format::UV88:
+            return 2;
+        case Format::RGB888:
+            return 3;
+        case Format::RGBA8888:
+            return 4;
+        //Doesn't make sense for planar formats:
+        case Format::NV12:
+        case Format::NV21:
+        case Format::IYUV:
+        case Format::YUV444:
+        default:
+            return 0;
+    }
+}
+
+/** Convert a tensor format into a string.
+ *
+ * @param[in] format @ref Format to be translated to string.
+ *
+ * @return The string describing the format.
+ */
+const std::string &string_from_format(Format format);
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CORE_UTILS_FORMATUTILS_H */
diff --git a/arm_compute/core/NEON/NEFixedPoint.h b/arm_compute/core/utils/InterpolationPolicyUtils.h
index 3aff677385..8d4ae4321c 100644
--- a/arm_compute/core/NEON/NEFixedPoint.h
+++ b/arm_compute/core/utils/InterpolationPolicyUtils.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2016-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,21 +21,21 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_NEFIXEDPOINT_H
-#define ARM_COMPUTE_NEFIXEDPOINT_H
+#ifndef ARM_COMPUTE_CORE_UTILS_INTERPOLATIONPOLICYUTILS_H
+#define ARM_COMPUTE_CORE_UTILS_INTERPOLATIONPOLICYUTILS_H
 
-#include <arm_neon.h>
+#include "arm_compute/core/Types.h"
+
+#include <string>
 
 namespace arm_compute
 {
-/** Compute lane-by-lane maximum between elements of a float vector with 4x2 elements
+/** Translates a given interpolation policy to a string.
  *
- * @param[in] a Float input vector
- * @param[in] b Float input vector
+ * @param[in] policy @ref InterpolationPolicy to be translated to string.
  *
- * @return The lane-by-lane maximum -> float32x4x2
+ * @return The string describing the interpolation policy.
  */
-float32x4x2_t vmax2q_f32(float32x4x2_t a, float32x4x2_t b);
+const std::string &string_from_interpolation_policy(InterpolationPolicy policy);
 } // namespace arm_compute
-#include "arm_compute/core/NEON/NEFixedPoint.inl"
-#endif /* ARM_COMPUTE_NEFIXEDPOINT_H */
-\ No newline at end of file
+#endif /*ARM_COMPUTE_CORE_UTILS_INTERPOLATIONPOLICYUTILS_H */
diff --git a/arm_compute/core/GLES_COMPUTE/GCHelpers.h b/arm_compute/core/utils/StringUtils.h
index b1a9ab32be..c13cbaa334 100644
--- a/arm_compute/core/GLES_COMPUTE/GCHelpers.h
+++ b/arm_compute/core/utils/StringUtils.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2016-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,38 +21,45 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_GCHELPERS_H
-#define ARM_COMPUTE_GCHELPERS_H
+#ifndef ARM_COMPUTE_CORE_UTILS_STRINGUTILS_H
+#define ARM_COMPUTE_CORE_UTILS_STRINGUTILS_H
 
-#include "arm_compute/core/GLES_COMPUTE/IGCKernel.h"
-#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h"
-#include "arm_compute/core/GPUTarget.h"
-#include "arm_compute/core/Helpers.h"
-
-#include <set>
 #include <string>
+#include <vector>
 
 namespace arm_compute
 {
-// Forward declarations
-class GCCoreRuntimeContext;
+/** Lower a given string.
+ *
+ * @param[in] val Given string to lower.
+ *
+ * @return The lowered string
+ */
+std::string lower_string(const std::string &val);
 
-/** Max vector width of an GLES vector */
-static constexpr unsigned int max_gc_vector_width = 16;
+/** Raise a given string to upper case
+ *
+ * @param[in] val Given string to raise to upper case.
+ *
+ * @return The upper case string
+ */
+std::string upper_string(const std::string &val);
 
-/** Helper function to get the GPU target from GLES using GL_RENDERER enum
+/** Create a string with the float in full precision.
  *
- * @return the GPU target
+ * @param[in] val Floating point value
+ *
+ * @return String with the floating point value.
  */
-GPUTarget get_target_from_device();
-/** Creates an GLES kernel
+std::string float_to_string_with_full_precision(float val);
+
+/** Join a sequence of strings with separator @p sep
  *
- * @param[in] ctx         A context to be used to create the GLES kernel.
- * @param[in] kernel_name The kernel name.
- * @param[in] build_opts  The build options to be used for the GLES kernel compilation.
+ * @param[in] strings Strings to join
+ * @param[in] sep     Separator to join consecutive strings in the sequence
  *
- * @return A GLES kernel
+ * @return The joined string
  */
-GCKernel create_opengl_kernel(GCCoreRuntimeContext *ctx, const std::string &kernel_name, const std::set<std::string> &build_opts);
+std::string join(const std::vector<std::string> strings, const std::string &sep);
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_GCHELPERS_H */
+#endif /*ARM_COMPUTE_CORE_UTILS_STRINGUTILS_H */
diff --git a/arm_compute/core/utils/misc/CRTP.h b/arm_compute/core/utils/helpers/AdjustVecSize.h
index 037c69ab1d..842e3b57d6 100644
--- a/arm_compute/core/utils/misc/CRTP.h
+++ b/arm_compute/core/utils/helpers/AdjustVecSize.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2016-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,35 +21,35 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_MISC_CRTP_H
-#define ARM_COMPUTE_MISC_CRTP_H
+#ifndef ARM_COMPUTE_UTILS_ADJUSTVECSIZE_H
+#define ARM_COMPUTE_UTILS_ADJUSTVECSIZE_H
+
+#include "arm_compute/core/Error.h"
 
 namespace arm_compute
 {
-namespace misc
-{
-/** Curiously recurring template pattern Interface */
-template <typename T, template <typename> class Type>
-struct CRTP
+/** Returns the adjusted vector size: if it is greater than the input's first dimension, it is halved until it no longer exceeds that dimension (a first dimension of 3 is returned as-is when the vector size is at least 3)
+ *
+ * @param[in] vec_size vector size to be adjusted
+ * @param[in] dim0     size of the first dimension
+ *
+ * @return the number of elements processed along the X axis per thread
+ */
+inline unsigned int adjust_vec_size(unsigned int vec_size, size_t dim0)
 {
-public:
-    /** Exact type */
-    using ExactType = T;
+    ARM_COMPUTE_ERROR_ON(vec_size > 16);
 
-protected:
-    const T &impl() const
+    if ((vec_size >= dim0) && (dim0 == 3))
     {
-        return static_cast<const T &>(*this);
+        return dim0;
     }
-    T &impl()
+
+    while (vec_size > dim0)
     {
-        return static_cast<T &>(*this);
+        vec_size >>= 1;
     }
 
-private:
-    CRTP() = default;
-    friend Type<T>;
-};
-} // namespace misc
+    return vec_size;
+}
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_MISC_CRTP_H */
+#endif /*ARM_COMPUTE_UTILS_ADJUSTVECSIZE_H */
diff --git a/arm_compute/core/utils/helpers/bit_ops.h b/arm_compute/core/utils/helpers/bit_ops.h
deleted file mode 100644
index 6dbca179e7..0000000000
--- a/arm_compute/core/utils/helpers/bit_ops.h
+++ /dev/null
@@ -1,52 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_UTILS_HELPERS_BIT_OPS_H
-#define ARM_COMPUTE_UTILS_HELPERS_BIT_OPS_H
-
-#include "arm_compute/core/utils/misc/Requires.h"
-
-#include <type_traits>
-
-namespace arm_compute
-{
-namespace helpers
-{
-namespace bit_ops
-{
-/** Checks if the idx-th bit is set in an integral type
- *
- * @param[in] v   Integral input
- * @param[in] idx Index of the bit to check
- *
- * @return True if the idx-th bit is set else false
- */
-template <typename T, REQUIRES_TA(std::is_integral<T>::value)>
-bool is_bit_set(T v, unsigned int idx)
-{
-    return (v & 1 << idx) != 0;
-}
-} // namespace bit_ops
-} // namespace helpers
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_UTILS_HELPERS_BIT_OPS_H */
diff --git a/arm_compute/core/utils/helpers/fft.h b/arm_compute/core/utils/helpers/fft.h
deleted file mode 100644
index b22bece73f..0000000000
--- a/arm_compute/core/utils/helpers/fft.h
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_UTILS_HELPERS_FFT_H
-#define ARM_COMPUTE_UTILS_HELPERS_FFT_H
-
-#include <set>
-#include <vector>
-
-namespace arm_compute
-{
-namespace helpers
-{
-namespace fft
-{
-/** Decompose a given 1D input size using the provided supported factors.
- *
- * @param[in] N                 Input size to be decomposed.
- * @param[in] supported_factors Supported factors that can be used for decomposition.
- *
- * @return A vector with the stages of the decomposition. Will be empty if decomposition failed.
- */
-std::vector<unsigned int> decompose_stages(unsigned int N, const std::set<unsigned int> &supported_factors);
-/** Calculate digit reverse index vector given fft size and the decomposed stages
- *
- * @param N          Input size to calculate digit reverse for
- * @param fft_stages A vector with the FFT decomposed stages
- *
- * @return A vector with the digit reverse indices. Will be empty if it failed.
- */
-std::vector<unsigned int> digit_reverse_indices(unsigned int N, const std::vector<unsigned int> &fft_stages);
-} // namespace fft
-} // namespace helpers
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_UTILS_HELPERS_FFT_H */
diff --git a/arm_compute/core/utils/helpers/float_ops.h b/arm_compute/core/utils/helpers/float_ops.h
deleted file mode 100644
index fceee2e3fe..0000000000
--- a/arm_compute/core/utils/helpers/float_ops.h
+++ /dev/null
@@ -1,116 +0,0 @@
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_UTILS_HELPERS_FLOAT_OPS_H
-#define ARM_COMPUTE_UTILS_HELPERS_FLOAT_OPS_H
-
-namespace arm_compute
-{
-namespace helpers
-{
-namespace float_ops
-{
-union RawFloat
-{
-    /** Constructor
-     *
-     * @param[in] val Floating-point value
-     */
-    explicit RawFloat(float val)
-        : f32(val)
-    {
-    }
-    /** Extract sign of floating point number
-     *
-     * @return Sign of floating point number
-     */
-    int32_t sign() const
-    {
-        return i32 >> 31;
-    }
-    /** Extract exponent of floating point number
-     *
-     * @return Exponent of floating point number
-     */
-    int32_t exponent() const
-    {
-        return (i32 >> 23) & 0xFF;
-    }
-    /** Extract mantissa of floating point number
-     *
-     * @return Mantissa of floating point number
-     */
-    int32_t mantissa() const
-    {
-        return i32 & 0x007FFFFF;
-    }
-
-    int32_t i32;
-    float   f32;
-};
-
-/** Checks if two floating point numbers are equal given an allowed number of ULPs
- *
- * @param[in] a                First number to compare
- * @param[in] b                Second number to compare
- * @param[in] max_allowed_ulps (Optional) Number of allowed ULPs
- *
- * @return True if number is close else false
- */
-inline bool is_equal_ulps(float a, float b, int max_allowed_ulps = 0)
-{
-    RawFloat ra(a);
-    RawFloat rb(b);
-
-    // Check ULP distance
-    const int ulps = std::abs(ra.i32 - rb.i32);
-    return ulps <= max_allowed_ulps;
-}
-
-/** Checks if the input floating point number is 1.0f checking if the difference is within a range defined with epsilon
- *
- * @param[in] a       Input floating point number
- * @param[in] epsilon (Optional) Epsilon used to define the error bounds
- *
- * @return True if number is close to 1.0f
- */
-inline bool is_one(float a, float epsilon = 0.00001f)
-{
-    return std::abs(1.0f - a) <= epsilon;
-}
-
-/** Checks if the input floating point number is 0.0f checking if the difference is within a range defined with epsilon
- *
- * @param[in] a       Input floating point number
- * @param[in] epsilon (Optional) Epsilon used to define the error bounds
- *
- * @return True if number is close to 0.0f
- */
-inline bool is_zero(float a, float epsilon = 0.00001f)
-{
-    return std::abs(0.0f - a) <= epsilon;
-}
-} // namespace float_ops
-} // namespace helpers
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_UTILS_HELPERS_FLOAT_OPS_H */
diff --git a/arm_compute/core/utils/helpers/tensor_info.h b/arm_compute/core/utils/helpers/tensor_info.h
deleted file mode 100644
index da24e82f5a..0000000000
--- a/arm_compute/core/utils/helpers/tensor_info.h
+++ /dev/null
@@ -1,57 +0,0 @@
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_UTILS_HELPERS_TENSOR_INFO_H
-#define ARM_COMPUTE_UTILS_HELPERS_TENSOR_INFO_H
-
-#include "arm_compute/core/ITensorInfo.h"
-
-namespace arm_compute
-{
-namespace helpers
-{
-namespace tensor_info
-{
-/** Checks if the quantization info of given tensors are different
- *
- * @param tensor_info_1 Tensor info of the first tensor
- * @param tensor_info_2 Tensor info of the second tensor
- * @param tensor_infos  Tensor infos of the rest tensors
- *
- * @return True if tensors have mismatching quantization info else false.
- */
-template <typename... Ts>
-inline bool tensors_have_different_quantization_info(const ITensorInfo *tensor_info_1, const ITensorInfo *tensor_info_2, Ts... tensor_infos)
-{
-    const QuantizationInfo first_quantization_info = tensor_info_1->quantization_info();
-
-    const std::array < const ITensorInfo *, 1 + sizeof...(Ts) > tensor_infos_array{ { tensor_info_2, std::forward<Ts>(tensor_infos)... } };
-    return std::any_of(tensor_infos_array.begin(), tensor_infos_array.end(), [&](const ITensorInfo * tensor_info)
-    {
-        return tensor_info->quantization_info() != first_quantization_info;
-    });
-}
-} // namespace tensor_info
-} // namespace helpers
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_UTILS_HELPERS_TENSOR_INFO_H */
diff --git a/arm_compute/core/utils/helpers/tensor_transform.h b/arm_compute/core/utils/helpers/tensor_transform.h
index 7e912a6f0a..7a61fa192a 100644
--- a/arm_compute/core/utils/helpers/tensor_transform.h
+++ b/arm_compute/core/utils/helpers/tensor_transform.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -52,7 +52,8 @@ int calculate_stride_on_index(int index, Coordinates strides);
  *
  * @return Absolute start position of a given index
  */
-int calculate_start_on_index(TensorShape input_shape, int index, Coordinates starts, Coordinates strides, int32_t begin_mask);
+int calculate_start_on_index(
+    TensorShape input_shape, int index, Coordinates starts, Coordinates strides, int32_t begin_mask);
 
 /** Returns the absolute end position of a given index for a strided slice operation
  *
@@ -68,8 +69,13 @@ int calculate_start_on_index(TensorShape input_shape, int index, Coordinates sta
  *
  * @return Absolute end position of a given index
  */
-int calculate_end_on_index(TensorShape input_shape, int index, int start_on_index, Coordinates ends, Coordinates strides,
-                           int32_t end_mask = 0, int32_t shrink_axis_mask = 0);
+int calculate_end_on_index(TensorShape input_shape,
+                           int         index,
+                           int         start_on_index,
+                           Coordinates ends,
+                           Coordinates strides,
+                           int32_t     end_mask         = 0,
+                           int32_t     shrink_axis_mask = 0);
 
 /** Calculate start, end and stride coordinates for a strided slice
  *
@@ -87,8 +93,12 @@ int calculate_end_on_index(TensorShape input_shape, int index, int start_on_inde
  * @return A tuple with <Start,End,Strides>
  */
 std::tuple<Coordinates, Coordinates, Coordinates> calculate_strided_slice_coords(TensorShape input_shape,
-                                                                                 Coordinates starts, Coordinates ends, Coordinates strides,
-                                                                                 int32_t begin_mask = 0, int32_t end_mask = 0, int32_t shrink_axis_mask = 0);
+                                                                                 Coordinates starts,
+                                                                                 Coordinates ends,
+                                                                                 Coordinates strides,
+                                                                                 int32_t     begin_mask       = 0,
+                                                                                 int32_t     end_mask         = 0,
+                                                                                 int32_t     shrink_axis_mask = 0);
 
 /** Computes output shape of strided slice
  *
@@ -109,9 +119,14 @@ std::tuple<Coordinates, Coordinates, Coordinates> calculate_strided_slice_coords
  *
  * @return The output tensor shape
  */
-TensorShape compute_strided_slice_output_shape(TensorShape input_shape, Coordinates starts, Coordinates ends, Coordinates strides,
-                                               int32_t begin_mask = 0, int32_t end_mask = 0, int32_t shrink_axis_mask = 0,
-                                               bool return_unshrinked = false);
+TensorShape compute_strided_slice_output_shape(TensorShape input_shape,
+                                               Coordinates starts,
+                                               Coordinates ends,
+                                               Coordinates strides,
+                                               int32_t     begin_mask        = 0,
+                                               int32_t     end_mask          = 0,
+                                               int32_t     shrink_axis_mask  = 0,
+                                               bool        return_unshrinked = false);
 
 /** Constructs end mask in case we want to perform a slice operation using the strided slice interface
  *
@@ -122,7 +137,7 @@ TensorShape compute_strided_slice_output_shape(TensorShape input_shape, Coordina
  * @return End mask
  */
 int32_t construct_slice_end_mask(Coordinates ends);
-} // namespace tensor_tranform
+} // namespace tensor_transform
 } // namespace helpers
 } // namespace arm_compute
 #endif /* ARM_COMPUTE_UTILS_HELPERS_TENSOR_TRANSFORM_H */
diff --git a/arm_compute/core/utils/io/FileHandler.h b/arm_compute/core/utils/io/FileHandler.h
index ebc2ef06c1..615651d5b1 100644
--- a/arm_compute/core/utils/io/FileHandler.h
+++ b/arm_compute/core/utils/io/FileHandler.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/utils/logging/FilePrinter.h b/arm_compute/core/utils/logging/FilePrinter.h
index 73a5421ed4..a865aadddb 100644
--- a/arm_compute/core/utils/logging/FilePrinter.h
+++ b/arm_compute/core/utils/logging/FilePrinter.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,9 +24,8 @@
 #ifndef ARM_COMPUTE_LOGGING_FILE_PRINTER_H
 #define ARM_COMPUTE_LOGGING_FILE_PRINTER_H
 
-#include "arm_compute/core/utils/logging/IPrinter.h"
-
 #include "arm_compute/core/utils/io/FileHandler.h"
+#include "arm_compute/core/utils/logging/IPrinter.h"
 
 namespace arm_compute
 {
diff --git a/arm_compute/core/utils/logging/Helpers.h b/arm_compute/core/utils/logging/Helpers.h
index 341f944ddc..c3c2f0f0b8 100644
--- a/arm_compute/core/utils/logging/Helpers.h
+++ b/arm_compute/core/utils/logging/Helpers.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,7 +25,7 @@
 #define ARM_COMPUTE_LOGGING_HELPERS_H
 
 #include "arm_compute/core/utils/logging/Types.h"
-#include "support/MemorySupport.h"
+
 #include "support/ToolchainSupport.h"
 
 #include <cstddef>
@@ -46,10 +46,10 @@ namespace logging
  * @return The formatted string
  */
 template <typename... Ts>
-inline std::string string_with_format(const std::string &fmt, Ts &&... args)
+inline std::string string_with_format(const std::string &fmt, Ts &&...args)
 {
     size_t size     = support::cpp11::snprintf(nullptr, 0, fmt.c_str(), args...) + 1;
-    auto   char_str = support::cpp14::make_unique<char[]>(size);
+    auto   char_str = std::make_unique<char[]>(size);
     support::cpp11::snprintf(char_str.get(), size, fmt.c_str(), args...);
     return std::string(char_str.get(), char_str.get() + size - 1);
 }
diff --git a/arm_compute/core/utils/logging/IPrinter.h b/arm_compute/core/utils/logging/IPrinter.h
index b6ede5853a..7fde4d9302 100644
--- a/arm_compute/core/utils/logging/IPrinter.h
+++ b/arm_compute/core/utils/logging/IPrinter.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -35,8 +35,7 @@ class Printer
 {
 public:
     /** Default Constructor */
-    Printer() noexcept
-        : _mtx()
+    Printer() noexcept : _mtx()
     {
     }
     /** Prevent instances of this class from being copied */
diff --git a/arm_compute/core/utils/logging/LogMsgDecorators.h b/arm_compute/core/utils/logging/LogMsgDecorators.h
index 08abcb4519..66a8180e21 100644
--- a/arm_compute/core/utils/logging/LogMsgDecorators.h
+++ b/arm_compute/core/utils/logging/LogMsgDecorators.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -63,8 +63,7 @@ public:
      *
      * @param str Sting to append
      */
-    StringDecorator(const std::string &str)
-        : _str(str)
+    StringDecorator(const std::string &str) : _str(str)
     {
         _str = angle_wrap_value(str);
     }
@@ -103,7 +102,7 @@ private:
         auto time = std::chrono::system_clock::to_time_t(now);
 
         // TODO: use put_time for gcc > 4.9
-        char buf[100] = { 0 };
+        char buf[100] = {0};
         std::strftime(buf, sizeof(buf), "%d-%m-%Y %I:%M:%S", std::localtime(&time));
         return buf;
     }
diff --git a/arm_compute/core/utils/logging/Logger.h b/arm_compute/core/utils/logging/Logger.h
index 2bd467ae2b..608db39138 100644
--- a/arm_compute/core/utils/logging/Logger.h
+++ b/arm_compute/core/utils/logging/Logger.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -88,7 +88,7 @@ public:
      * @param[in] args      Message arguments
      */
     template <typename... Ts>
-    void log(LogLevel log_level, const std::string &fmt, Ts &&... args);
+    void log(LogLevel log_level, const std::string &fmt, Ts &&...args);
     /** Sets log level of the logger
      *
      * @warning Not thread-safe
@@ -159,11 +159,11 @@ private:
 };
 
 template <typename... Ts>
-inline void Logger::log(LogLevel log_level, const std::string &fmt, Ts &&... args)
+inline void Logger::log(LogLevel log_level, const std::string &fmt, Ts &&...args)
 {
     // Return if message shouldn't be logged
     // i.e. if log level does not match the logger's
-    if(!is_loggable(log_level))
+    if (!is_loggable(log_level))
     {
         return;
     }
diff --git a/arm_compute/core/utils/logging/LoggerRegistry.h b/arm_compute/core/utils/logging/LoggerRegistry.h
index c1a182c1ae..4e52a10935 100644
--- a/arm_compute/core/utils/logging/LoggerRegistry.h
+++ b/arm_compute/core/utils/logging/LoggerRegistry.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -27,6 +27,7 @@
 #include "arm_compute/core/utils/logging/Logger.h"
 #include "arm_compute/core/utils/logging/Printers.h"
 #include "arm_compute/core/utils/logging/Types.h"
+
 #include "support/Mutex.h"
 
 #include <memory>
@@ -54,8 +55,9 @@ public:
      * @param[in] log_level Logger's log level. Defaults to INFO
      * @param[in] printers  Printers to attach to the system loggers. Defaults with a @ref StdPrinter.
      */
-    void create_logger(const std::string &name, LogLevel log_level = LogLevel::INFO,
-                       const std::vector<std::shared_ptr<Printer>> &printers = { std::make_shared<StdPrinter>() });
+    void create_logger(const std::string                           &name,
+                       LogLevel                                     log_level = LogLevel::INFO,
+                       const std::vector<std::shared_ptr<Printer>> &printers  = {std::make_shared<StdPrinter>()});
     /** Remove a logger
      *
      * @param name Logger's name
@@ -74,16 +76,17 @@ public:
      * @param[in] printers  (Optional) Printers to attach to the system loggers. Defaults with a @ref StdPrinter.
      */
     void create_reserved_loggers(LogLevel                                     log_level = LogLevel::INFO,
-                                 const std::vector<std::shared_ptr<Printer>> &printers  = { std::make_shared<StdPrinter>() });
+                                 const std::vector<std::shared_ptr<Printer>> &printers  = {
+                                      std::make_shared<StdPrinter>()});
 
 private:
     /** Default constructor */
     LoggerRegistry();
 
 private:
-    arm_compute::Mutex _mtx;
+    arm_compute::Mutex                                       _mtx;
     std::unordered_map<std::string, std::shared_ptr<Logger>> _loggers;
-    static std::set<std::string> _reserved_loggers;
+    static std::set<std::string>                             _reserved_loggers;
 };
 } // namespace logging
 } // namespace arm_compute
diff --git a/arm_compute/core/utils/logging/Macros.h b/arm_compute/core/utils/logging/Macros.h
index e4d9734792..4d5aa5fe2c 100644
--- a/arm_compute/core/utils/logging/Macros.h
+++ b/arm_compute/core/utils/logging/Macros.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -30,44 +30,71 @@
 
 #ifdef ARM_COMPUTE_LOGGING_ENABLED
 
+#ifdef __GNUC__
+inline std::string signature_name(const std::string &pretty_func)
+{
+    const auto scope_op = pretty_func.find("::");
+    const auto begin    = pretty_func.substr(0, scope_op).rfind(" ") + 1;
+    const auto end      = pretty_func.rfind("(") - begin;
+
+    return pretty_func.substr(begin, end) + "()";
+}
+#define ARM_COMPUTE_SIGNATURE_NAME signature_name(__PRETTY_FUNCTION__)
+#else /* __GNUC__ */
+#define ARM_COMPUTE_SIGNATURE_NAME (__func__)
+#endif /* __GNUC__ */
+
 #define ARM_COMPUTE_LOG_MSG(logger_name, log_level, msg)                                 \
     do                                                                                   \
     {                                                                                    \
         auto __logger = arm_compute::logging::LoggerRegistry::get().logger(logger_name); \
-        if(__logger != nullptr)                                                          \
+        if (__logger != nullptr)                                                         \
         {                                                                                \
             __logger->log(log_level, msg);                                               \
         }                                                                                \
-    } while(false)
+    } while (false)
+
+#define ARM_COMPUTE_LOG_MSG_WITH_FUNCNAME(logger_name, log_level, msg)                   \
+    do                                                                                   \
+    {                                                                                    \
+        auto __logger = arm_compute::logging::LoggerRegistry::get().logger(logger_name); \
+        if (__logger != nullptr)                                                         \
+        {                                                                                \
+            std::ostringstream s;                                                        \
+            s << ARM_COMPUTE_SIGNATURE_NAME << " : " << msg;                             \
+            __logger->log(log_level, s.str());                                           \
+        }                                                                                \
+    } while (false)
 
 #define ARM_COMPUTE_LOG_MSG_WITH_FORMAT(logger_name, log_level, fmt, ...)                     \
     do                                                                                        \
     {                                                                                         \
         auto __logger = arm_compute::logging::LoggerRegistry::get().logger(logger_name);      \
-        if(__logger != nullptr)                                                               \
+        if (__logger != nullptr)                                                              \
         {                                                                                     \
             size_t size     = ::snprintf(nullptr, 0, fmt, __VA_ARGS__) + 1;                   \
-            auto   char_str = support::cpp14::make_unique<char[]>(size);                      \
-            ::snprintf(char_str.get(), size, #fmt, __VA_ARGS__);                              \
+            auto   char_str = std::make_unique<char[]>(size);                                 \
+            ::snprintf(char_str.get(), size, fmt, __VA_ARGS__);                               \
             __logger->log(log_level, std::string(char_str.get(), char_str.get() + size - 1)); \
         }                                                                                     \
-    } while(false)
+    } while (false)
 
 #define ARM_COMPUTE_LOG_STREAM(logger_name, log_level, stream)                           \
     do                                                                                   \
     {                                                                                    \
         auto __logger = arm_compute::logging::LoggerRegistry::get().logger(logger_name); \
-        if(__logger != nullptr)                                                          \
+        if (__logger != nullptr)                                                         \
         {                                                                                \
             std::ostringstream s;                                                        \
             s << stream;                                                                 \
             __logger->log(log_level, s.str());                                           \
         }                                                                                \
-    } while(false)
+    } while (false)
 
 #else /* ARM_COMPUTE_LOGGING_ENABLED */
 
 #define ARM_COMPUTE_LOG_MSG(logger_name, log_level, msg)
+#define ARM_COMPUTE_LOG_MSG_WITH_FUNCNAME(logger_name, log_level, msg)
 #define ARM_COMPUTE_LOG_MSG_WITH_FORMAT(logger_name, log_level, fmt, ...)
 #define ARM_COMPUTE_LOG_STREAM(logger_name, log_level, stream)
 
diff --git a/arm_compute/core/utils/logging/Printers.h b/arm_compute/core/utils/logging/Printers.h
index e09880cc53..80493e7052 100644
--- a/arm_compute/core/utils/logging/Printers.h
+++ b/arm_compute/core/utils/logging/Printers.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/utils/logging/StdPrinter.h b/arm_compute/core/utils/logging/StdPrinter.h
index ea41ce2599..eb0e78ee84 100644
--- a/arm_compute/core/utils/logging/StdPrinter.h
+++ b/arm_compute/core/utils/logging/StdPrinter.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/utils/logging/Types.h b/arm_compute/core/utils/logging/Types.h
index 838adf95b4..64c567b984 100644
--- a/arm_compute/core/utils/logging/Types.h
+++ b/arm_compute/core/utils/logging/Types.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -44,8 +44,7 @@ enum class LogLevel
 struct LogMsg
 {
     /** Default constructor */
-    LogMsg()
-        : raw_(), log_level_(LogLevel::OFF)
+    LogMsg() : raw_(), log_level_(LogLevel::OFF)
     {
     }
     /** Construct a log message
@@ -53,8 +52,7 @@ struct LogMsg
      * @param[in] msg       Message to log.
      * @param[in] log_level Logging level. Default: OFF
      */
-    LogMsg(std::string msg, LogLevel log_level = LogLevel::OFF)
-        : raw_(msg), log_level_(log_level)
+    LogMsg(std::string msg, LogLevel log_level = LogLevel::OFF) : raw_(msg), log_level_(log_level)
     {
     }
 
diff --git a/arm_compute/core/utils/math/Math.h b/arm_compute/core/utils/math/Math.h
new file mode 100644
index 0000000000..e70337ba0f
--- /dev/null
+++ b/arm_compute/core/utils/math/Math.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2017-2018, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_UTILS_MATH_H
+#define ARM_COMPUTE_UTILS_MATH_H
+
+namespace arm_compute
+{
+/** Calculate the rounded up quotient of val / m, evaluated as (val + m - 1) / m.
+ *
+ * @param[in] val Value to divide and round up (NOTE(review): presumably integral and non-negative; confirm at call sites).
+ * @param[in] m   Value to divide by (NOTE(review): presumably integral and positive; confirm at call sites).
+ *
+ * @return The value of (val + m - 1) / m.
+ */
+template <typename S, typename T>
+constexpr auto DIV_CEIL(S val, T m) -> decltype((val + m - 1) / m)
+{
+    return (val + m - 1) / m;
+}
+
+/** Computes the smallest multiple of divisor that is greater than or equal to value.
+ *
+ * @param[in] value   Lower bound value. Flagged through ARM_COMPUTE_ERROR_ON if negative.
+ * @param[in] divisor Value to compute multiple of. Flagged through ARM_COMPUTE_ERROR_ON if zero or negative.
+ *
+ * @return DIV_CEIL(value, divisor) * divisor.
+ */
+template <typename S, typename T>
+inline auto ceil_to_multiple(S value, T divisor) -> decltype(((value + divisor - 1) / divisor) * divisor)
+{
+    ARM_COMPUTE_ERROR_ON(value < 0 || divisor <= 0);
+    return DIV_CEIL(value, divisor) * divisor;
+}
+
+/** Computes the largest multiple of divisor that is less than or equal to value.
+ *
+ * @param[in] value   Upper bound value. Flagged through ARM_COMPUTE_ERROR_ON if negative.
+ * @param[in] divisor Value to compute multiple of. Flagged through ARM_COMPUTE_ERROR_ON if zero or negative.
+ *
+ * @return (value / divisor) * divisor.
+ */
+template <typename S, typename T>
+inline auto floor_to_multiple(S value, T divisor) -> decltype((value / divisor) * divisor)
+{
+    ARM_COMPUTE_ERROR_ON(value < 0 || divisor <= 0);
+    return (value / divisor) * divisor;
+}
+
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_UTILS_MATH_H */
diff --git a/arm_compute/core/utils/math/SafeOps.h b/arm_compute/core/utils/math/SafeOps.h
index 41bbb12e70..ef8bcf7e14 100644
--- a/arm_compute/core/utils/math/SafeOps.h
+++ b/arm_compute/core/utils/math/SafeOps.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,7 +25,10 @@
 #define ARM_COMPUTE_UTILS_MATH_SAFE_OPS
 
 #include "arm_compute/core/Error.h"
-#include "arm_compute/core/utils/misc/Requires.h"
+
+#include "support/AclRequires.h"
+
+#include <limits>
 
 namespace arm_compute
 {
@@ -44,16 +47,16 @@ namespace math
  *
  * @return The addition result
  */
-template <typename T, REQUIRES_TA(std::is_integral<T>::value)>
+template <typename T, ARM_COMPUTE_REQUIRES_TA(std::is_integral<T>::value)>
 T safe_integer_add(T val_a, T val_b)
 {
     T result = 0;
 
-    if((val_b > 0) && (val_a > std::numeric_limits<T>::max() - val_b))
+    if ((val_b > 0) && (val_a > std::numeric_limits<T>::max() - val_b))
     {
         result = std::numeric_limits<T>::max();
     }
-    else if((val_b < 0) && (val_a < std::numeric_limits<T>::min() - val_b))
+    else if ((val_b < 0) && (val_a < std::numeric_limits<T>::min() - val_b))
     {
         result = std::numeric_limits<T>::min();
     }
@@ -76,16 +79,16 @@ T safe_integer_add(T val_a, T val_b)
  *
  * @return The subtraction result
  */
-template <typename T, REQUIRES_TA(std::is_integral<T>::value)>
+template <typename T, ARM_COMPUTE_REQUIRES_TA(std::is_integral<T>::value)>
 T safe_integer_sub(T val_a, T val_b)
 {
     T result = 0;
 
-    if((val_b < 0) && (val_a > std::numeric_limits<T>::max() + val_b))
+    if ((val_b < 0) && (val_a > std::numeric_limits<T>::max() + val_b))
     {
         result = std::numeric_limits<T>::max();
     }
-    else if((val_b > 0) && (val_a < std::numeric_limits<T>::min() + val_b))
+    else if ((val_b > 0) && (val_a < std::numeric_limits<T>::min() + val_b))
     {
         result = std::numeric_limits<T>::min();
     }
@@ -108,18 +111,18 @@ T safe_integer_sub(T val_a, T val_b)
  *
  * @return The multiplication result
  */
-template <typename T, REQUIRES_TA(std::is_integral<T>::value)>
+template <typename T, ARM_COMPUTE_REQUIRES_TA(std::is_integral<T>::value)>
 T safe_integer_mul(T val_a, T val_b)
 {
     T result = 0;
 
-    if(val_a > 0)
+    if (val_a > 0)
     {
-        if((val_b > 0) && (val_a > (std::numeric_limits<T>::max() / val_b)))
+        if ((val_b > 0) && (val_a > (std::numeric_limits<T>::max() / val_b)))
         {
             result = std::numeric_limits<T>::max();
         }
-        else if(val_b < (std::numeric_limits<T>::min() / val_a))
+        else if (val_b < (std::numeric_limits<T>::min() / val_a))
         {
             result = std::numeric_limits<T>::min();
         }
@@ -130,11 +133,11 @@ T safe_integer_mul(T val_a, T val_b)
     }
     else
     {
-        if((val_b > 0) && (val_a < (std::numeric_limits<T>::min() / val_b)))
+        if ((val_b > 0) && (val_a < (std::numeric_limits<T>::min() / val_b)))
         {
             result = std::numeric_limits<T>::max();
         }
-        else if((val_a != 0) && (val_b < (std::numeric_limits<T>::max() / val_a)))
+        else if ((val_a != 0) && (val_b < (std::numeric_limits<T>::max() / val_a)))
         {
             result = std::numeric_limits<T>::min();
         }
@@ -158,12 +161,12 @@ T safe_integer_mul(T val_a, T val_b)
  *
  * @return The quotient
  */
-template <typename T, REQUIRES_TA(std::is_integral<T>::value)>
+template <typename T, ARM_COMPUTE_REQUIRES_TA(std::is_integral<T>::value)>
 T safe_integer_div(T val_a, T val_b)
 {
     T result = 0;
 
-    if((val_b == 0) || ((val_a == std::numeric_limits<T>::min()) && (val_b == -1)))
+    if ((val_b == 0) || ((val_a == std::numeric_limits<T>::min()) && (val_b == -1)))
     {
         result = std::numeric_limits<T>::min();
     }
@@ -174,7 +177,7 @@ T safe_integer_div(T val_a, T val_b)
 
     return result;
 }
-} // namespace cast
+} // namespace math
 } // namespace utils
 } // namespace arm_compute
 #endif /* ARM_COMPUTE_UTILS_MATH_SAFE_OPS */
diff --git a/arm_compute/core/utils/misc/Cast.h b/arm_compute/core/utils/misc/Cast.h
deleted file mode 100644
index fc6246aace..0000000000
--- a/arm_compute/core/utils/misc/Cast.h
+++ /dev/null
@@ -1,119 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_MISC_CAST_H
-#define ARM_COMPUTE_MISC_CAST_H
-
-#include "arm_compute/core/Error.h"
-
-namespace arm_compute
-{
-namespace utils
-{
-namespace cast
-{
-/** Polymorphic cast between two types
- *
- * @warning Will throw an exception if cast cannot take place
- *
- * @tparam Target Target to cast type
- * @tparam Source Source from cast type
- *
- * @param[in] v Value to cast
- *
- * @return The casted value
- */
-template <typename Target, typename Source>
-inline Target polymorphic_cast(Source *v)
-{
-    if(dynamic_cast<Target>(v) == nullptr)
-    {
-        ARM_COMPUTE_THROW(std::bad_cast());
-    }
-    return static_cast<Target>(v);
-}
-
-/** Polymorphic down cast between two types
- *
- * @warning Will assert if cannot take place
- *
- * @tparam Target Target to cast type
- * @tparam Source Source from cast type
- *
- * @param[in] v Value to cast
- *
- * @return The casted value
- */
-template <typename Target, typename Source>
-inline Target polymorphic_downcast(Source *v)
-{
-    ARM_COMPUTE_ERROR_ON(dynamic_cast<Target>(v) != static_cast<Target>(v));
-    return static_cast<Target>(v);
-}
-
-/** Polymorphic cast between two unique pointer types
- *
- * @warning Will throw an exception if cast cannot take place
- *
- * @tparam Target  Target to cast type
- * @tparam Source  Source from cast type
- * @tparam Deleter Deleter function type
- *
- * @param[in] v Value to cast
- *
- * @return The casted value
- */
-template <typename Target, typename Source, typename Deleter>
-std::unique_ptr<Target, Deleter> polymorphic_cast_unique_ptr(std::unique_ptr<Source, Deleter> &&v)
-{
-    if(dynamic_cast<Target *>(v.get()) == nullptr)
-    {
-        ARM_COMPUTE_THROW(std::bad_cast());
-    }
-    auto r = static_cast<Target *>(v.release());
-    return std::unique_ptr<Target, Deleter>(r, std::move(v.get_deleter()));
-}
-
-/** Polymorphic down cast between two unique pointer types
- *
- * @warning Will assert if cannot take place
- *
- * @tparam Target  Target to cast type
- * @tparam Source  Source from cast type
- * @tparam Deleter Deleter function type
- *
- * @param[in] v Value to cast
- *
- * @return The casted value
- */
-template <typename Target, typename Source, typename Deleter>
-std::unique_ptr<Target, Deleter> polymorphic_downcast_unique_ptr(std::unique_ptr<Source, Deleter> &&v)
-{
-    ARM_COMPUTE_ERROR_ON(dynamic_cast<Target *>(v.get()) != static_cast<Target *>(v.get()));
-    auto r = static_cast<Target *>(v.release());
-    return std::unique_ptr<Target, Deleter>(r, std::move(v.get_deleter()));
-}
-} // namespace cast
-} // namespace utils
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_MISC_CAST_H */
diff --git a/arm_compute/core/utils/misc/ICloneable.h b/arm_compute/core/utils/misc/ICloneable.h
deleted file mode 100644
index 064f408201..0000000000
--- a/arm_compute/core/utils/misc/ICloneable.h
+++ /dev/null
@@ -1,48 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_MISC_ICLONEABLE_H
-#define ARM_COMPUTE_MISC_ICLONEABLE_H
-
-#include <memory>
-
-namespace arm_compute
-{
-namespace misc
-{
-/** Clonable Interface */
-template <class T>
-class ICloneable
-{
-public:
-    /** Default virtual desctructor */
-    virtual ~ICloneable() = default;
-    /** Provide a clone of the current object of class T
-     *
-     * @return Clone object of class T
-     */
-    virtual std::unique_ptr<T> clone() const = 0;
-};
-} // namespace misc
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_MISC_ICLONEABLE_H */
diff --git a/arm_compute/core/utils/misc/InfoHelpers.h b/arm_compute/core/utils/misc/InfoHelpers.h
index c6ee7c9031..1d1b4ea8d7 100644
--- a/arm_compute/core/utils/misc/InfoHelpers.h
+++ b/arm_compute/core/utils/misc/InfoHelpers.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -53,10 +53,12 @@ inline bool is_relu(ActivationLayerInfo activation_info)
  */
 inline bool is_relu6(ActivationLayerInfo activation_info)
 {
-    const bool is_lu_bounded_relu = activation_info.activation() == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU
-                                    && activation_info.a() == 6.f && activation_info.b() == 0.f;
-    const bool is_bounded_relu = activation_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU
-                                 && activation_info.a() == 6.f;
+    const bool is_lu_bounded_relu =
+        activation_info.activation() == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU &&
+        activation_info.a() == 6.f && activation_info.b() == 0.f;
+    const bool is_bounded_relu =
+        activation_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU &&
+        activation_info.a() == 6.f;
     return activation_info.enabled() && (is_lu_bounded_relu || is_bounded_relu);
 }
 
@@ -68,50 +70,52 @@ inline bool is_relu6(ActivationLayerInfo activation_info)
  *
  */
 template <typename T>
-inline void build_lstm_params_tensor_info(const LSTMParams<T>     &lstm_params,
-                                          LSTMParams<ITensorInfo> *lstm_params_info)
+inline void build_lstm_params_tensor_info(const LSTMParams<T> &lstm_params, LSTMParams<ITensorInfo> *lstm_params_info)
 {
-    if(lstm_params.has_peephole_opt())
+    if (lstm_params.has_peephole_opt())
     {
         ARM_COMPUTE_ERROR_ON_NULLPTR(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_output_weights());
-        lstm_params_info->set_peephole_params(lstm_params.cell_to_forget_weights()->info(), lstm_params.cell_to_output_weights()->info());
+        lstm_params_info->set_peephole_params(lstm_params.cell_to_forget_weights()->info(),
+                                              lstm_params.cell_to_output_weights()->info());
     }
-    if(lstm_params.has_projection())
+    if (lstm_params.has_projection())
     {
         ARM_COMPUTE_ERROR_ON_NULLPTR(lstm_params.projection_weights());
-        lstm_params_info->set_projection_params(lstm_params.projection_weights()->info(),
-                                                lstm_params.projection_bias() != nullptr ? lstm_params.projection_bias()->info() : nullptr);
+        lstm_params_info->set_projection_params(
+            lstm_params.projection_weights()->info(),
+            lstm_params.projection_bias() != nullptr ? lstm_params.projection_bias()->info() : nullptr);
     }
-    if(!lstm_params.has_cifg_opt())
+    if (!lstm_params.has_cifg_opt())
     {
-        ARM_COMPUTE_ERROR_ON_NULLPTR(lstm_params.input_to_input_weights(), lstm_params.recurrent_to_input_weights(), lstm_params.input_gate_bias());
+        ARM_COMPUTE_ERROR_ON_NULLPTR(lstm_params.input_to_input_weights(), lstm_params.recurrent_to_input_weights(),
+                                     lstm_params.input_gate_bias());
 
-        const ITensorInfo *cell_to_input_weights_info = (lstm_params.has_peephole_opt()) ? lstm_params.cell_to_input_weights()->info() : nullptr;
-        lstm_params_info->set_cifg_params(lstm_params.input_to_input_weights()->info(), lstm_params.recurrent_to_input_weights()->info(),
-                                          cell_to_input_weights_info, lstm_params.input_gate_bias()->info());
+        ITensorInfo *cell_to_input_weights_info =
+            (lstm_params.has_peephole_opt()) ? lstm_params.cell_to_input_weights()->info() : nullptr;
+        lstm_params_info->set_cifg_params(lstm_params.input_to_input_weights()->info(),
+                                          lstm_params.recurrent_to_input_weights()->info(), cell_to_input_weights_info,
+                                          lstm_params.input_gate_bias()->info());
     }
-    if(lstm_params.use_layer_norm())
+    if (lstm_params.use_layer_norm())
     {
-        ARM_COMPUTE_ERROR_ON_NULLPTR(lstm_params.forget_layer_norm_weights(),
-                                     lstm_params.output_layer_norm_weights(),
+        ARM_COMPUTE_ERROR_ON_NULLPTR(lstm_params.forget_layer_norm_weights(), lstm_params.output_layer_norm_weights(),
                                      lstm_params.cell_layer_norm_weights());
-        if(!lstm_params.has_cifg_opt())
+        if (!lstm_params.has_cifg_opt())
         {
             ARM_COMPUTE_ERROR_ON_NULLPTR(lstm_params.input_layer_norm_weights());
         }
 
-        const ITensorInfo *forget_info = lstm_params.forget_layer_norm_weights()->info();
-        const ITensorInfo *cell_info   = lstm_params.cell_layer_norm_weights()->info();
-        const ITensorInfo *output_info = lstm_params.output_layer_norm_weights()->info();
-        const ITensorInfo *input_info  = lstm_params.has_cifg_opt() ? nullptr : lstm_params.input_layer_norm_weights()->info();
+        ITensorInfo *forget_info = lstm_params.forget_layer_norm_weights()->info();
+        ITensorInfo *cell_info   = lstm_params.cell_layer_norm_weights()->info();
+        ITensorInfo *output_info = lstm_params.output_layer_norm_weights()->info();
+        ITensorInfo *input_info = lstm_params.has_cifg_opt() ? nullptr : lstm_params.input_layer_norm_weights()->info();
 
         lstm_params_info->set_layer_normalization_params(input_info, forget_info, cell_info, output_info);
     }
 
-    lstm_params_info->set_matmul_scale_params(lstm_params.input_intermediate_scale(),
-                                              lstm_params.forget_intermediate_scale(),
-                                              lstm_params.cell_intermediate_scale(),
-                                              lstm_params.output_intermediate_scale());
+    lstm_params_info->set_matmul_scale_params(
+        lstm_params.input_intermediate_scale(), lstm_params.forget_intermediate_scale(),
+        lstm_params.cell_intermediate_scale(), lstm_params.output_intermediate_scale());
 
     lstm_params_info->set_hidden_state_params(lstm_params.hidden_state_zero(), lstm_params.hidden_state_scale());
 }
diff --git a/arm_compute/core/utils/misc/Iterable.h b/arm_compute/core/utils/misc/Iterable.h
deleted file mode 100644
index 829c4b44a8..0000000000
--- a/arm_compute/core/utils/misc/Iterable.h
+++ /dev/null
@@ -1,108 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_MISC_ITERABLE_H
-#define ARM_COMPUTE_MISC_ITERABLE_H
-
-#include <iterator>
-
-namespace arm_compute
-{
-namespace utils
-{
-namespace iterable
-{
-/** Reverse range iterable class
- *
- * @tparam T Type to create a reverse range on
- */
-template <typename T>
-class reverse_iterable
-{
-public:
-    /** Default constructor
-     *
-     * @param[in] it Value to reverse iterate on
-     */
-    explicit reverse_iterable(T &it)
-        : _it(it)
-    {
-    }
-
-    /** Get beginning of iterator.
-     *
-     * @return beginning of iterator.
-     */
-    typename T::reverse_iterator begin()
-    {
-        return _it.rbegin();
-    }
-
-    /** Get end of iterator.
-     *
-     * @return end of iterator.
-     */
-    typename T::reverse_iterator end()
-    {
-        return _it.rend();
-    }
-
-    /** Get beginning of const iterator.
-     *
-     * @return beginning of const iterator.
-     */
-    typename T::const_reverse_iterator cbegin()
-    {
-        return _it.rbegin();
-    }
-
-    /** Get end of const iterator.
-     *
-     * @return end of const iterator.
-     */
-    typename T::const_reverse_iterator cend()
-    {
-        return _it.rend();
-    }
-
-private:
-    T &_it;
-};
-
-/** Creates a reverse iterable for a given type
- *
- * @tparam T Type to create a reverse iterable on
- *
- * @param[in] val Iterable input
- *
- * @return Reverse iterable container
- */
-template <typename T>
-reverse_iterable<T> reverse_iterate(T &val)
-{
-    return reverse_iterable<T>(val);
-}
-} // namespace iterable
-} // namespace utils
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_MISC_ITERABLE_H */
diff --git a/arm_compute/core/utils/misc/MMappedFile.h b/arm_compute/core/utils/misc/MMappedFile.h
index 7669c5cc96..3efdbc5bda 100644
--- a/arm_compute/core/utils/misc/MMappedFile.h
+++ b/arm_compute/core/utils/misc/MMappedFile.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,7 +24,7 @@
 #ifndef ARM_COMPUTE_MISC_MMAPPED_FILE_H
 #define ARM_COMPUTE_MISC_MMAPPED_FILE_H
 
-#if !defined(BARE_METAL)
+#if !defined(_WIN64) && !defined(BARE_METAL)
 
 #include <string>
 #include <utility>
@@ -105,6 +105,6 @@ private:
 } // namespace mmap_io
 } // namespace utils
 } // namespace arm_compute
-#endif // !defined(BARE_METAL)
+#endif // !defined(_WIN64) && !defined(BARE_METAL)
 
 #endif /* ARM_COMPUTE_MISC_MMAPPED_FILE_H */
diff --git a/arm_compute/core/utils/misc/Macros.h b/arm_compute/core/utils/misc/Macros.h
index 6e8d7659ee..fa861fa442 100644
--- a/arm_compute/core/utils/misc/Macros.h
+++ b/arm_compute/core/utils/misc/Macros.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -26,15 +26,16 @@
 
 #if defined(__cplusplus) && (__cplusplus >= 201402L)
 
-#define ARM_COMPUTE_DEPRECATED [[deprecated]]
-#define ARM_COMPUTE_DEPRECATED_REL(rel) [[deprecated("Deprecated in : " #rel)]]
+#define ARM_COMPUTE_DEPRECATED                           [[deprecated]]
+#define ARM_COMPUTE_DEPRECATED_REL(rel)                  [[deprecated("Deprecated in : " #rel)]]
 #define ARM_COMPUTE_DEPRECATED_REL_REPLACE(rel, replace) [[deprecated("Deprecated in : " #rel " - Use : " #replace)]]
 
 #elif defined(__GNUC__) || defined(__clang__)
 
-#define ARM_COMPUTE_DEPRECATED __attribute__((deprecated))
+#define ARM_COMPUTE_DEPRECATED          __attribute__((deprecated))
 #define ARM_COMPUTE_DEPRECATED_REL(rel) __attribute__((deprecated("Deprecated in : " #rel)))
-#define ARM_COMPUTE_DEPRECATED_REL_REPLACE(rel, replace) __attribute__((deprecated("Deprecated in : " #rel " - Use : " #replace)))
+#define ARM_COMPUTE_DEPRECATED_REL_REPLACE(rel, replace) \
+    __attribute__((deprecated("Deprecated in : " #rel " - Use : " #replace)))
 
 #else // defined(__cplusplus) && (__cplusplus >= 201402L)
 
diff --git a/arm_compute/core/utils/misc/Random.h b/arm_compute/core/utils/misc/Random.h
deleted file mode 100644
index 9f5a128546..0000000000
--- a/arm_compute/core/utils/misc/Random.h
+++ /dev/null
@@ -1,98 +0,0 @@
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_MISC_RANDOM_H
-#define ARM_COMPUTE_MISC_RANDOM_H
-
-#include "arm_compute/core/Error.h"
-
-#include <random>
-#include <type_traits>
-
-namespace arm_compute
-{
-namespace utils
-{
-namespace random
-{
-/** Uniform distribution within a given number of sub-ranges
- *
- * @tparam T Distribution primitive type
- */
-template <typename T>
-class RangedUniformDistribution
-{
-public:
-    using DT = typename std::conditional<std::is_integral<T>::value,
-          std::uniform_int_distribution<T>,
-          std::uniform_real_distribution<float>>::type;
-    using result_type = T;
-    using range_pair  = std::pair<result_type, result_type>;
-
-public:
-    /** Constructor
-     *
-     * @param[in] low            lowest value in the range (inclusive)
-     * @param[in] high           highest value in the range (inclusive for uniform_int_distribution, exclusive for uniform_real_distribution)
-     * @param[in] exclude_ranges Ranges to exclude from the generator
-     */
-    RangedUniformDistribution(result_type low, result_type high, const std::vector<range_pair> &exclude_ranges)
-        : _distributions(), _selector()
-    {
-        result_type clow = low;
-        for(const auto &erange : exclude_ranges)
-        {
-            result_type epsilon = std::is_integral<result_type>::value ? 1 : static_cast<result_type>(std::numeric_limits<float>::epsilon());
-
-            ARM_COMPUTE_ERROR_ON(clow > erange.first || clow >= erange.second);
-
-            _distributions.emplace_back(DT(clow, erange.first - epsilon));
-            clow = erange.second + epsilon;
-        }
-        ARM_COMPUTE_ERROR_ON(clow > high);
-        _distributions.emplace_back(DT(clow, high));
-        _selector = std::uniform_int_distribution<uint32_t>(0, _distributions.size() - 1);
-    }
-    /** Generate random number
-     *
-     * @tparam URNG Random number generator object type
-     *
-     * @param[in] g A uniform random number generator object, used as the source of randomness.
-     *
-     * @return A new random number.
-     */
-    template <class URNG>
-    result_type operator()(URNG &g)
-    {
-        unsigned int rand_select = _selector(g);
-        return _distributions[rand_select](g);
-    }
-
-private:
-    std::vector<DT>                         _distributions;
-    std::uniform_int_distribution<uint32_t> _selector;
-};
-} // namespace random
-} // namespace utils
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_MISC_RANDOM_H */
diff --git a/arm_compute/core/utils/misc/Requires.h b/arm_compute/core/utils/misc/Requires.h
deleted file mode 100644
index 33c6fa3096..0000000000
--- a/arm_compute/core/utils/misc/Requires.h
+++ /dev/null
@@ -1,51 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_UTILS_REQUIRES_H
-#define ARM_COMPUTE_UTILS_REQUIRES_H
-
-namespace arm_compute
-{
-namespace utils
-{
-namespace requires
-{
-// *INDENT-OFF*
-// clang-format off
-namespace detail
-{
-enum class enabler
-{
-};
-} // namespace arm_compute
-
-/** Requirements as template */
-#define REQUIRES_T(...) template <bool Cond = (__VA_ARGS__), typename std::enable_if<Cond, int>::type = 0>
-/** Requirements as template argument */
-#define REQUIRES_TA(...) typename = typename std::enable_if<(__VA_ARGS__), arm_compute::utils::requires::detail::enabler>::type
-// clang-format on
-// *INDENT-ON*
-} // namespace requires
-} // namespace utils
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_UTILS_REQUIRES_H */
diff --git a/arm_compute/core/utils/misc/Rounding.h b/arm_compute/core/utils/misc/Rounding.h
deleted file mode 100644
index 650137a473..0000000000
--- a/arm_compute/core/utils/misc/Rounding.h
+++ /dev/null
@@ -1,205 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_UTILS_ROUNDING_H
-#define ARM_COMPUTE_UTILS_ROUNDING_H
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/utils/misc/Requires.h"
-#include "arm_compute/core/utils/misc/Traits.h"
-#include "support/ToolchainSupport.h"
-
-#include <cmath>
-
-namespace arm_compute
-{
-namespace utils
-{
-namespace rounding
-{
-/** Rounding mode */
-enum class RoundingMode
-{
-    TO_ZERO,             /**< Round towards zero */
-    AWAY_FROM_ZERO,      /**< Round away from zero */
-    HALF_TO_ZERO,        /**< Round half towards from zero */
-    HALF_AWAY_FROM_ZERO, /**< Round half away from zero */
-    HALF_UP,             /**< Round half towards positive infinity */
-    HALF_DOWN,           /**< Round half towards negative infinity */
-    HALF_EVEN            /**< Round half towards nearest even */
-};
-
-/** Round floating-point value with round to zero
- *
- * @tparam T Parameter type. Should be of floating point type.
- *
- * @param[in] value floating-point value to be rounded.
- *
- * @return Floating-point value of rounded @p value.
- */
-template <typename T, REQUIRES_TA(traits::is_floating_point<T>::value)>
-inline T round_to_zero(T value)
-{
-    T res = std::floor(std::fabs(value));
-    return (value < 0.f) ? -res : res;
-}
-
-/** Round floating-point value with round away from zero
- *
- * @tparam T Parameter type. Should be of floating point type.
- *
- * @param[in] value floating-point value to be rounded.
- *
- * @return Floating-point value of rounded @p value.
- */
-template <typename T, REQUIRES_TA(traits::is_floating_point<T>::value)>
-inline T round_away_from_zero(T value)
-{
-    T res = std::ceil(std::fabs(value));
-    return (value < 0.f) ? -res : res;
-}
-
-/** Round floating-point value with half value rounding towards zero.
- *
- * @tparam T Parameter type. Should be of floating point type.
- *
- * @param[in] value floating-point value to be rounded.
- *
- * @return Floating-point value of rounded @p value.
- */
-template <typename T, REQUIRES_TA(traits::is_floating_point<T>::value)>
-inline T round_half_to_zero(T value)
-{
-    T res = T(std::ceil(std::fabs(value) - 0.5f));
-    return (value < 0.f) ? -res : res;
-}
-
-/** Round floating-point value with half value rounding away from zero.
- *
- * @tparam T Parameter type. Should be of floating point type.
- *
- * @param[in] value floating-point value to be rounded.
- *
- * @return Floating-point value of rounded @p value.
- */
-template <typename T, REQUIRES_TA(traits::is_floating_point<T>::value)>
-inline T round_half_away_from_zero(T value)
-{
-    T res = T(std::floor(std::fabs(value) + 0.5f));
-    return (value < 0.f) ? -res : res;
-}
-
-/** Round floating-point value with half value rounding to positive infinity.
- *
- * @tparam T Parameter type. Should be of floating point type.
- *
- * @param[in] value floating-point value to be rounded.
- *
- * @return Floating-point value of rounded @p value.
- */
-template <typename T, REQUIRES_TA(traits::is_floating_point<T>::value)>
-inline T round_half_up(T value)
-{
-    return std::floor(value + 0.5f);
-}
-
-/** Round floating-point value with half value rounding to negative infinity.
- *
- * @tparam T Parameter type. Should be of floating point type.
- *
- * @param[in] value floating-point value to be rounded.
- *
- * @return Floating-point value of rounded @p value.
- */
-template <typename T, REQUIRES_TA(traits::is_floating_point<T>::value)>
-inline T round_half_down(T value)
-{
-    return std::ceil(value - 0.5f);
-}
-
-/** Round floating-point value with half value rounding to nearest even.
- *
- * @tparam T Parameter type. Should be of floating point type.
- *
- * @param[in] value   floating-point value to be rounded.
- * @param[in] epsilon precision.
- *
- * @return Floating-point value of rounded @p value.
- */
-template <typename T, REQUIRES_TA(traits::is_floating_point<T>::value)>
-inline T round_half_even(T value, T epsilon = std::numeric_limits<T>::epsilon())
-{
-    T positive_value = std::abs(value);
-    T ipart          = 0;
-    std::modf(positive_value, &ipart);
-    // If 'value' is exactly halfway between two integers
-    if(std::abs(positive_value - (ipart + 0.5f)) < epsilon)
-    {
-        // If 'ipart' is even then return 'ipart'
-        if(std::fmod(ipart, 2.f) < epsilon)
-        {
-            return support::cpp11::copysign(ipart, value);
-        }
-        // Else return the nearest even integer
-        return support::cpp11::copysign(std::ceil(ipart + 0.5f), value);
-    }
-    // Otherwise use the usual round to closest
-    return support::cpp11::copysign(support::cpp11::round(positive_value), value);
-}
-
-/** Round floating-point value given a rounding mode
- *
- * @tparam T Parameter type. Should be of floating point type.
- *
- * @param[in] value         floating-point value to be rounded.
- * @param[in] rounding_mode Rounding mode to use.
- *
- * @return Floating-point value of rounded @p value.
- */
-template <typename T, REQUIRES_TA(traits::is_floating_point<T>::value)>
-inline T round(T value, RoundingMode rounding_mode)
-{
-    switch(rounding_mode)
-    {
-        case RoundingMode::TO_ZERO:
-            return round_to_zero(value);
-        case RoundingMode::AWAY_FROM_ZERO:
-            return round_away_from_zero(value);
-        case RoundingMode::HALF_TO_ZERO:
-            return round_half_to_zero(value);
-        case RoundingMode::HALF_AWAY_FROM_ZERO:
-            return round_half_away_from_zero(value);
-        case RoundingMode::HALF_UP:
-            return round_half_up(value);
-        case RoundingMode::HALF_DOWN:
-            return round_half_down(value);
-        case RoundingMode::HALF_EVEN:
-            return round_half_even(value);
-        default:
-            ARM_COMPUTE_ERROR("Unsupported rounding mode!");
-    }
-}
-} // namespace rounding
-} // namespace utils
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_UTILS_ROUNDING_H */
diff --git a/arm_compute/core/utils/misc/SaturateCast.h b/arm_compute/core/utils/misc/SaturateCast.h
deleted file mode 100644
index 0241c64b14..0000000000
--- a/arm_compute/core/utils/misc/SaturateCast.h
+++ /dev/null
@@ -1,218 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_UTILS_CAST_SATURATE_CAST_H
-#define ARM_COMPUTE_UTILS_CAST_SATURATE_CAST_H
-
-#include "arm_compute/core/utils/misc/Rounding.h"
-#include "arm_compute/core/utils/misc/Traits.h"
-#include "arm_compute/core/utils/misc/Utility.h"
-
-namespace arm_compute
-{
-namespace utils
-{
-namespace cast
-{
-// *INDENT-OFF*
-// clang-format off
-// same type
-template<typename T,
-         typename U,
-         typename std::enable_if<std::is_same<T, U>::value, int >::type = 0 >
-T saturate_cast(U v)
-{
-    return v;
-}
-
-// signed -> signed widening/same_width
-template<typename T,
-         typename U,
-         typename std::enable_if<std::is_integral<T>::value &&
-                                 std::is_integral<U>::value &&
-                                 std::is_signed<U>() &&
-                                 std::is_signed<T>() &&
-                                 !std::is_same<T, U>::value &&
-                                 sizeof(T) >= sizeof(U),
-                  int >::type = 0 >
-inline T saturate_cast(U v)
-{
-    return static_cast<T>(v);
-}
-// signed -> signed narrowing
-template<typename T,
-         typename U,
-         typename std::enable_if<std::is_integral<T>::value &&
-                                 std::is_integral<U>::value &&
-                                 std::is_signed<U>() &&
-                                 std::is_signed<T>() &&
-                                 !std::is_same<T, U>::value &&
-                                 sizeof(T) < sizeof(U),
-                  int >::type = 0 >
-inline T saturate_cast(U v)
-{
-    return static_cast<T>(utility::clamp<U>(v, std::numeric_limits<T>::lowest(), std::numeric_limits<T>::max()));
-}
-
-// unsigned -> signed widening
-template<typename T,
-         typename U,
-         typename std::enable_if<std::is_integral<T>::value &&
-                                 std::is_integral<U>::value &&
-                                 std::is_unsigned<U>() &&
-                                 std::is_signed<T>() &&
-                                 !std::is_same<T, U>::value &&
-                                 (sizeof(T) > sizeof(U)),
-                  int >::type = 0 >
-inline T saturate_cast(U v)
-{
-    return static_cast<T>(v);
-}
-// unsigned -> signed narrowing
-template<typename T,
-         typename U,
-         typename std::enable_if<std::is_integral<T>::value &&
-                                 std::is_integral<U>::value &&
-                                 std::is_unsigned<U>() &&
-                                 std::is_signed<T>() &&
-                                 !std::is_same<T, U>::value &&
-                                 sizeof(T) < sizeof(U),
-                  int >::type = 0 >
-inline T saturate_cast(U v)
-{
-    return static_cast<T>(std::min<U>(v, std::numeric_limits<T>::max()));
-}
-// unsigned -> signed same_width
-template<typename T,
-         typename U,
-         typename std::enable_if<std::is_integral<T>::value &&
-                                 std::is_integral<U>::value &&
-                                 std::is_unsigned<U>() &&
-                                 std::is_signed<T>() &&
-                                 !std::is_same<T, U>::value &&
-                                 sizeof(T) == sizeof(U),
-                  int >::type = 0 >
-inline T saturate_cast(U v)
-{
-    return static_cast<T>(std::min<U>(v, std::numeric_limits<T>::max()));
-}
-
-// signed -> unsigned widening/same width
-template<typename T,
-         typename U,
-         typename std::enable_if<std::is_integral<T>::value &&
-                                 std::is_integral<U>::value &&
-                                 std::is_signed<U>() &&
-                                 std::is_unsigned<T>() &&
-                                 !std::is_same<T, U>::value &&
-                                 sizeof(T) >= sizeof(U),
-                  int >::type = 0 >
-inline T saturate_cast(U v)
-{
-    return static_cast<T>(std::max<U>(0, v));
-}
-
-// signed -> unsigned narrowing
-template<typename T,
-         typename U,
-         typename std::enable_if<std::is_integral<T>::value &&
-                                 std::is_integral<U>::value &&
-                                 std::is_signed<U>() &&
-                                 std::is_unsigned<T>() &&
-                                 !std::is_same<T, U>::value &&
-                                 sizeof(T) < sizeof(U),
-                  int >::type = 0 >
-inline T saturate_cast(U v)
-{
-    return static_cast<T>(utility::clamp<U>(v, 0, std::numeric_limits<T>::max()));
-}
-
-// unsigned -> unsigned widening/same width
-template<typename T,
-         typename U,
-         typename std::enable_if<std::is_integral<T>::value &&
-                                 std::is_integral<U>::value &&
-                                 std::is_unsigned<T>() &&
-                                 std::is_unsigned<U>() &&
-                                 !std::is_same<T, U>::value &&
-                                 sizeof(T) >= sizeof(U),
-                  int >::type = 0 >
-inline T saturate_cast(U v)
-{
-    return static_cast<T>(v);
-}
-
-// unsigned -> unsigned narrowing
-template<typename T,
-         typename U,
-         typename std::enable_if<std::is_integral<T>::value &&
-                                 std::is_integral<U>::value &&
-                                 std::is_unsigned<T>() &&
-                                 std::is_unsigned<U>() &&
-                                 !std::is_same<T, U>::value &&
-                                 sizeof(T) < sizeof(U),
-                  int >::type = 0 >
-inline T saturate_cast(U v)
-{
-    return static_cast<T>(utility::clamp<U>(v, std::numeric_limits<T>::lowest(), std::numeric_limits<T>::max()));
-}
-
-// float -> int
-template<typename T,
-         typename U,
-         typename std::enable_if<std::is_integral<T>::value &&
-                                 traits::is_floating_point<U>::value,
-                  int >::type = 0 >
-inline T saturate_cast(U v)
-{
-    int32_t vi = utils::rounding::round_half_away_from_zero(v);
-    return saturate_cast<T>(vi);
-}
-
-// int -> float
-template<typename T,
-         typename U,
-         typename std::enable_if<traits::is_floating_point<T>::value &&
-                                 std::is_integral<U>::value,
-                  int >::type = 0 >
-inline T saturate_cast(U v)
-{
-    return static_cast<T>(v);
-}
-
-// float -> float
-template<typename T,
-        typename U,
-        typename std::enable_if<traits::is_floating_point<T>::value &&
-                                traits::is_floating_point<U>::value,
-                int >::type = 0 >
-inline T saturate_cast(U v)
-{
-    return static_cast<T>(v);
-}
-// clang-format on
-// *INDENT-ON*
-} // namespace cast
-} // namespace utils
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_UTILS_CAST_SATURATE_CAST_H */
diff --git a/arm_compute/core/utils/misc/ShapeCalculator.h b/arm_compute/core/utils/misc/ShapeCalculator.h
index dfccec8b37..e97d81390e 100644
--- a/arm_compute/core/utils/misc/ShapeCalculator.h
+++ b/arm_compute/core/utils/misc/ShapeCalculator.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,15 +21,16 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_MISC_SHAPE_CALCULATOR_H
-#define ARM_COMPUTE_MISC_SHAPE_CALCULATOR_H
+#ifndef ACL_ARM_COMPUTE_CORE_UTILS_MISC_SHAPECALCULATOR_H
+#define ACL_ARM_COMPUTE_CORE_UTILS_MISC_SHAPECALCULATOR_H
 
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensorInfo.h"
 #include "arm_compute/core/KernelDescriptors.h"
 #include "arm_compute/core/Utils.h"
-
 #include "arm_compute/core/utils/helpers/tensor_transform.h"
+#include "arm_compute/function_info/ConvolutionInfo.h"
+#include "arm_compute/runtime/FunctionDescriptors.h"
 
 #include <cmath>
 
@@ -47,28 +48,35 @@ namespace shape_calculator
  *
  * @return the calculated shape
  */
-inline TensorShape calculate_reduce_mean_shape(ITensor *input, const Coordinates &reduction_axis, bool keep_dims)
+inline TensorShape calculate_reduce_mean_shape(ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims)
 {
     const int   reduction_ops = reduction_axis.num_dimensions();
     Coordinates axis_local    = reduction_axis;
-    const int   input_dims    = input->info()->num_dimensions();
+    const int   input_dims    = input->num_dimensions();
     convert_negative_axis(axis_local, input_dims);
-    TensorShape out_shape = input->info()->tensor_shape();
+    TensorShape out_shape = input->tensor_shape();
     // Configure reshape layer if we want to drop the dimensions
-    if(!keep_dims)
+    if (!keep_dims)
     {
         // We have to sort the reduction axis vectors in order for remove_dimension
         // to work properly
+
+// Suppress warning produced by a compiler bug in GCC
+// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=104165
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Warray-bounds"
         std::sort(axis_local.begin(), axis_local.begin() + reduction_ops);
-        for(int i = 0; i < reduction_ops; ++i)
+#pragma GCC diagnostic pop
+
+        for (int i = 0; i < reduction_ops; ++i)
         {
-            out_shape.remove_dimension(axis_local[i] - i);
+            out_shape.remove_dimension(axis_local[i] - i, false);
         }
         return out_shape;
     }
     else
     {
-        for(int i = 0; i < reduction_ops; ++i)
+        for (int i = 0; i < reduction_ops; ++i)
         {
             out_shape.set(axis_local[i], 1);
         }
@@ -84,7 +92,10 @@ inline TensorShape calculate_reduce_mean_shape(ITensor *input, const Coordinates
  *
  * @return the calculated shape
  */
-inline TensorShape compute_vector_to_tensor_output_shape(const TensorShape &input, size_t conv_w, size_t conv_h, const DataLayout &data_layout)
+inline TensorShape compute_vector_to_tensor_output_shape(const TensorShape &input,
+                                                         size_t             conv_w,
+                                                         size_t             conv_h,
+                                                         const DataLayout  &data_layout)
 {
     const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
     const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
@@ -126,10 +137,12 @@ inline TensorShape compute_reorg_output_shape(const ITensorInfo &input, int32_t
     const size_t idx_channel = get_data_layout_dimension_index(input.data_layout(), DataLayoutDimension::CHANNEL);
 
     ARM_COMPUTE_ERROR_ON(stride <= 0);
-    ARM_COMPUTE_ERROR_ON_MSG((input.tensor_shape()[idx_width] % stride != 0), "The width of the input tensor must be a multiple of stride");
-    ARM_COMPUTE_ERROR_ON_MSG((input.tensor_shape()[idx_height] % stride != 0), "The height of the input tensor must be a multiple of stride");
+    ARM_COMPUTE_ERROR_ON_MSG((input.tensor_shape()[idx_width] % stride != 0),
+                             "The width of the input tensor must be a multiple of stride");
+    ARM_COMPUTE_ERROR_ON_MSG((input.tensor_shape()[idx_height] % stride != 0),
+                             "The height of the input tensor must be a multiple of stride");
 
-    TensorShape output_shape{ input.tensor_shape() };
+    TensorShape output_shape{input.tensor_shape()};
 
     output_shape.set(idx_width, output_shape[idx_width] / stride);
     output_shape.set(idx_height, output_shape[idx_height] / stride);
@@ -146,7 +159,8 @@ inline TensorShape compute_reorg_output_shape(const ITensorInfo &input, int32_t
  *
  * @return the calculated shape of the reshaped weights
  */
-inline TensorShape compute_weights_reshaped_shape(const ITensorInfo &weights, bool has_bias = false, unsigned int num_groups = 1)
+inline TensorShape
+compute_weights_reshaped_shape(const ITensorInfo &weights, bool has_bias = false, unsigned int num_groups = 1)
 {
     // Number of groups greater than one are only supported for NCHW data layout, and the number of weights must be a multiple of it.
     ARM_COMPUTE_ERROR_ON(num_groups == 0);
@@ -154,14 +168,14 @@ inline TensorShape compute_weights_reshaped_shape(const ITensorInfo &weights, bo
     ARM_COMPUTE_ERROR_ON((weights.dimension(3) % num_groups) != 0);
 
     // Calculate output shape
-    TensorShape weights_reshaped{ weights.tensor_shape() };
+    TensorShape weights_reshaped{weights.tensor_shape()};
     weights_reshaped.set(3, weights_reshaped[3] / num_groups);
 
     weights_reshaped.collapse(3);
     const size_t tmp_dim = weights_reshaped[0];
     weights_reshaped.set(0, weights_reshaped[1]);
     weights_reshaped.set(1, tmp_dim + (has_bias ? 1 : 0));
-    if(weights.num_dimensions() < 5)
+    if (weights.num_dimensions() < 5)
     {
         weights_reshaped.set(2, num_groups);
     }
@@ -177,7 +191,9 @@ inline TensorShape compute_weights_reshaped_shape(const ITensorInfo &weights, bo
  *
  * @return the calculated shape
  */
-inline TensorShape compute_lhs_reshaped_shape(const ITensorInfo &a, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_input_as_3d = false)
+inline TensorShape compute_lhs_reshaped_shape(const ITensorInfo       &a,
+                                              const GEMMLHSMatrixInfo &lhs_info,
+                                              bool                     reinterpret_input_as_3d = false)
 {
     ARM_COMPUTE_ERROR_ON(lhs_info.m0 == 0);
     ARM_COMPUTE_ERROR_ON(lhs_info.k0 == 0);
@@ -198,11 +214,11 @@ inline TensorShape compute_lhs_reshaped_shape(const ITensorInfo &a, const GEMMLH
     const unsigned int output_width  = block_size * num_horiz_blocks * lhs_info.v0;
     const unsigned int output_height = std::ceil(num_vert_blocks / static_cast<float>(lhs_info.v0));
 
-    TensorShape lhs_shape{ a.tensor_shape() };
+    TensorShape lhs_shape{a.tensor_shape()};
     lhs_shape.set(0, output_width);
     lhs_shape.set(1, output_height);
 
-    if((reinterpret_input_as_3d) && (lhs_shape.num_dimensions() > 2))
+    if ((reinterpret_input_as_3d) && (lhs_shape.num_dimensions() > 2))
     {
         // When the data format is NHWC and the shapes are Nx1x1
         // the tensor shape num_dimensions is automatically set to 1 instead of 3.
@@ -242,7 +258,7 @@ inline TensorShape compute_rhs_reshaped_shape(const ITensorInfo &a, const GEMMRH
     const unsigned int output_width  = block_size * num_vert_blocks * rhs_info.h0;
     const unsigned int output_height = std::ceil(num_horiz_blocks / static_cast<float>(rhs_info.h0));
 
-    TensorShape rhs_shape{ a.tensor_shape() };
+    TensorShape rhs_shape{a.tensor_shape()};
     rhs_shape.set(0, output_width);
     rhs_shape.set(1, output_height);
 
@@ -257,14 +273,15 @@ inline TensorShape compute_rhs_reshaped_shape(const ITensorInfo &a, const GEMMRH
  *
  * @return the calculated shape
  */
-inline TensorShape compute_interleaved_shape(const ITensorInfo &a, int mult_interleave4x4_height = 1, bool reinterpret_input_as_3d = false)
+inline TensorShape
+compute_interleaved_shape(const ITensorInfo &a, int mult_interleave4x4_height = 1, bool reinterpret_input_as_3d = false)
 {
     // The interleaved output matrix will have the following shape: [ a_height * W, ceil(a_width / W) ] where W = 4 * mult_interleave4x4_height
     ARM_COMPUTE_ERROR_ON(mult_interleave4x4_height < 1);
     const int   interleave_width = 4 * mult_interleave4x4_height;
-    TensorShape shape_interleaved_a{ a.tensor_shape() };
+    TensorShape shape_interleaved_a{a.tensor_shape()};
     shape_interleaved_a.set(0, a.dimension(0) * interleave_width);
-    if(reinterpret_input_as_3d)
+    if (reinterpret_input_as_3d)
     {
         const int M      = a.dimension(1) * a.dimension(2);
         const int height = std::ceil(M / static_cast<float>(interleave_width));
@@ -274,7 +291,7 @@ inline TensorShape compute_interleaved_shape(const ITensorInfo &a, int mult_inte
         // the tensor shape num_dimensions is automatically set to 1 instead of 3.
         // To avoid failures by removing a dimension that doesn't exist
         // check if the number of dimensions is greater than 2.
-        if(shape_interleaved_a.num_dimensions() > 2)
+        if (shape_interleaved_a.num_dimensions() > 2)
         {
             shape_interleaved_a.remove_dimension(2);
         }
@@ -287,30 +304,6 @@ inline TensorShape compute_interleaved_shape(const ITensorInfo &a, int mult_inte
     return shape_interleaved_a;
 }
 
-/** Calculate the reshaped shape of the weights to use in depthwise convolution
- *
- * @param[in] input Input tensor info
- * @param[in] info  Depthwise convolution information to be used for reshaping.
- *
- * @return the calculated shape
- */
-inline TensorShape compute_reshaped_depthwise_weights_shape(const ITensorInfo &input, const DepthwiseConvolutionReshapeInfo &info)
-{
-    const auto  data_layout = input.data_layout();
-    TensorShape weights_shape{};
-
-    const int    width_idx    = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
-    const int    height_idx   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
-    const int    channel_idx  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
-    const size_t num_channels = input.dimension(channel_idx);
-    const size_t num_rows     = input.dimension(height_idx);
-    const size_t num_cols     = input.dimension(width_idx);
-
-    weights_shape.set(0, num_rows * num_cols * info.c0);
-    weights_shape.set(1, DIV_CEIL(num_channels, info.c0));
-    return weights_shape;
-}
-
 /** Calculate the transposed 1xW shape
  *
  * @param[in] b Input tensor info
@@ -320,7 +313,7 @@ inline TensorShape compute_reshaped_depthwise_weights_shape(const ITensorInfo &i
 inline TensorShape compute_transpose1xW_shape(const ITensorInfo &b)
 {
     // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]
-    TensorShape shape_transposed1xW_b{ b.tensor_shape() };
+    TensorShape shape_transposed1xW_b{b.tensor_shape()};
     shape_transposed1xW_b.set(0, b.dimension(1) * 16);
     shape_transposed1xW_b.set(1, std::ceil(b.dimension(0) / 16.f));
 
@@ -340,7 +333,7 @@ inline TensorShape compute_transpose1xW_with_element_size_shape(const ITensorInf
     //       The transpose1xW output matrix will have the following shape:
     //       [ b_height * W, ceil(b_width / W) ] where W = (16 / element size of the tensor) * mult_transpose1xW_width
     ARM_COMPUTE_ERROR_ON(mult_transpose1xW_width < 1);
-    TensorShape  shape_transposed1xW_b{ b.tensor_shape() };
+    TensorShape  shape_transposed1xW_b{b.tensor_shape()};
     const size_t transpose_width = (16 / b.element_size()) * mult_transpose1xW_width;
     shape_transposed1xW_b.set(0, b.dimension(1) * transpose_width);
     shape_transposed1xW_b.set(1, static_cast<size_t>(std::ceil(b.dimension(0) / static_cast<float>(transpose_width))));
@@ -356,8 +349,8 @@ inline TensorShape compute_transpose1xW_with_element_size_shape(const ITensorInf
  */
 inline TensorShape compute_reductionA_shape(const ITensorInfo &b)
 {
-    TensorShape shape_vector_sum_col{ b.tensor_shape() };
-    if(shape_vector_sum_col.num_dimensions() > 1)
+    TensorShape shape_vector_sum_col{b.tensor_shape()};
+    if (shape_vector_sum_col.num_dimensions() > 1)
     {
         shape_vector_sum_col.remove_dimension(1);
     }
@@ -373,9 +366,9 @@ inline TensorShape compute_reductionA_shape(const ITensorInfo &b)
  */
 inline TensorShape compute_reductionB_shape(const ITensorInfo &a)
 {
-    TensorShape shape_vector_sum_row{ a.tensor_shape() };
+    TensorShape shape_vector_sum_row{a.tensor_shape()};
     shape_vector_sum_row.set(Window::DimX, a.dimension(1));
-    if(shape_vector_sum_row.num_dimensions() > 1)
+    if (shape_vector_sum_row.num_dimensions() > 1)
     {
         shape_vector_sum_row.remove_dimension(1);
     }
@@ -392,7 +385,10 @@ inline TensorShape compute_reductionB_shape(const ITensorInfo &a)
  *
  * @return the calculated shape
  */
-inline TensorShape compute_col2im_shape(const ITensorInfo &input, const Size2D &convolved_dims, bool batch_size_on_z, unsigned int num_groups = 1)
+inline TensorShape compute_col2im_shape(const ITensorInfo &input,
+                                        const Size2D      &convolved_dims,
+                                        bool               batch_size_on_z,
+                                        unsigned int       num_groups = 1)
 {
     ARM_COMPUTE_ERROR_ON(num_groups == 0);
     ARM_COMPUTE_ERROR_ON(input.tensor_shape()[1] != (convolved_dims.area()));
@@ -403,10 +399,10 @@ inline TensorShape compute_col2im_shape(const ITensorInfo &input, const Size2D &
     const int        height_idx  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
     const int        channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
 
-    TensorShape col2im_shape{ input.tensor_shape() };
+    TensorShape col2im_shape{input.tensor_shape()};
     // If batches start on 3rd dimension shift dimensions right by 1 to retain upper tensor shape,
     // as first three will be override by H,W,C data
-    if(batch_size_on_z && num_groups == 1)
+    if (batch_size_on_z && num_groups == 1)
     {
         col2im_shape.shift_right(1);
     }
@@ -425,29 +421,27 @@ inline TensorShape compute_col2im_shape(const ITensorInfo &input, const Size2D &
  */
 inline TensorShape compute_transposed_shape(const ITensorInfo &input)
 {
-    TensorShape shape_transposed{ input.tensor_shape() };
+    TensorShape shape_transposed{input.tensor_shape()};
 
-    shape_transposed.set(0, input.dimension(1));
-    shape_transposed.set(1, input.dimension(0));
+    shape_transposed.set(0, input.dimension(1), false);
+    shape_transposed.set(1, input.dimension(0), false);
 
     return shape_transposed;
 }
 
 /** Calculate the depthwise convolution output shape of a tensor
  *
- * @param[in] input            Input tensor info
- * @param[in] weights          Weights tensor info
- * @param[in] conv_info        Padding and stride information to use for the convolution.
- * @param[in] depth_multiplier Multiplier to apply to the input's depth in order to retrieve the output's depth.
- * @param[in] dilation         Dilation, in elements, across x and y. Defaults to (1, 1).
+ * @param[in] input   Input tensor info
+ * @param[in] weights Weights tensor info
+ * @param[in] info    Convolution info
  *
  * @return the calculated shape
  */
-inline TensorShape compute_depthwise_convolution_shape(const ITensorInfo &input, const ITensorInfo &weights, PadStrideInfo conv_info, unsigned int depth_multiplier, const Size2D &dilation = Size2D(1U,
-                                                       1U))
+inline TensorShape
+compute_depthwise_convolution_shape(const ITensorInfo &input, const ITensorInfo &weights, const ConvolutionInfo &info)
 {
-    const TensorShape input_shape{ input.tensor_shape() };
-    const TensorShape weights_shape{ weights.tensor_shape() };
+    const TensorShape input_shape{input.tensor_shape()};
+    const TensorShape weights_shape{weights.tensor_shape()};
 
     const DataLayout data_layout = input.data_layout();
     const int        width_idx   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
@@ -455,23 +449,54 @@ inline TensorShape compute_depthwise_convolution_shape(const ITensorInfo &input,
     const int        channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
 
     const DataLayout weights_data_layout = weights.data_layout();
-    const int        weights_width_idx   = get_data_layout_dimension_index(weights_data_layout, DataLayoutDimension::WIDTH);
-    const int        weights_height_idx  = get_data_layout_dimension_index(weights_data_layout, DataLayoutDimension::HEIGHT);
+    const int weights_width_idx  = get_data_layout_dimension_index(weights_data_layout, DataLayoutDimension::WIDTH);
+    const int weights_height_idx = get_data_layout_dimension_index(weights_data_layout, DataLayoutDimension::HEIGHT);
 
     unsigned int output_width  = 0;
     unsigned int output_height = 0;
-    std::tie(output_width, output_height) = scaled_dimensions(input_shape[width_idx], input_shape[height_idx],
-                                                              weights_shape[weights_width_idx], weights_shape[weights_height_idx],
-                                                              conv_info, dilation);
+    std::tie(output_width, output_height) =
+        scaled_dimensions(input_shape[width_idx], input_shape[height_idx], weights_shape[weights_width_idx],
+                          weights_shape[weights_height_idx], info.pad_stride_info, info.dilation);
 
-    TensorShape output_shape{ input_shape };
+    TensorShape output_shape{input_shape};
     output_shape.set(width_idx, output_width);
     output_shape.set(height_idx, output_height);
-    output_shape.set(channel_idx, input_shape[channel_idx] * depth_multiplier);
+    output_shape.set(channel_idx, input_shape[channel_idx] * info.depth_multiplier);
 
     return output_shape;
 }
 
+/** Calculate padding required for deconvolution
+ *
+ * @param[in] input    Input tensor info
+ * @param[in] weights  Weights tensor info
+ * @param[in] sx       Stride on x axis
+ * @param[in] sy       Stride on y axis
+ * @param[in] out_dims Output shape dimensions
+ *
+ * @return the padding required
+ */
+inline std::pair<int32_t, int32_t> compute_deconvolution_padding(const ITensorInfo            &input,
+                                                                 const ITensorInfo            &weights,
+                                                                 int32_t                       sx,
+                                                                 int32_t                       sy,
+                                                                 std::pair<uint32_t, uint32_t> out_dims)
+{
+    const DataLayout data_layout = input.data_layout();
+    const size_t     idx_w       = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+    const size_t     idx_h       = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+
+    // Find the upsampled dimensions
+    int32_t out_x = (static_cast<int32_t>(input.dimension(idx_w)) - 1) * sx + 1;
+    int32_t out_y = (static_cast<int32_t>(input.dimension(idx_h)) - 1) * sy + 1;
+
+    // Find the padding needed for the convolution with stride 1 in order to match output shape
+    int32_t padx = out_dims.first - (out_x - static_cast<int32_t>(weights.dimension(idx_w)) + 1);
+    int32_t pady = out_dims.second - (out_y - static_cast<int32_t>(weights.dimension(idx_h)) + 1);
+
+    return std::make_pair(padx, pady);
+}
+
 /** Calculate the upsampled output shape used for deconvolution
  *
  * @param[in] input    Input tensor info
@@ -484,20 +509,28 @@ inline TensorShape compute_depthwise_convolution_shape(const ITensorInfo &input,
  *
  * @return the calculated shape
  */
-inline TensorShape compute_deconvolution_upsampled_shape(const ITensorInfo &input, const ITensorInfo &weights, unsigned int sx, unsigned int sy,
-                                                         std::pair<unsigned int, unsigned int> &out_dims, uint32_t &padx, uint32_t &pady)
+inline TensorShape compute_deconvolution_upsampled_shape(const ITensorInfo                     &input,
+                                                         const ITensorInfo                     &weights,
+                                                         unsigned int                           sx,
+                                                         unsigned int                           sy,
+                                                         std::pair<unsigned int, unsigned int> &out_dims,
+                                                         uint32_t                              &padx,
+                                                         uint32_t                              &pady)
 {
+    // Find the padding needed for the convolution with stride 1 in order to match output shape
+    const auto padxy =
+        compute_deconvolution_padding(input, weights, static_cast<int32_t>(sx), static_cast<int32_t>(sy), out_dims);
+    padx = static_cast<uint32_t>(padxy.first);
+    pady = static_cast<uint32_t>(padxy.second);
+
     const DataLayout data_layout = input.data_layout();
     const size_t     idx_w       = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
     const size_t     idx_h       = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
 
     // Find the upsampled dimensions
-    unsigned int out_x = (input.dimension(idx_w) - 1) * sx + 1;
-    unsigned int out_y = (input.dimension(idx_h) - 1) * sy + 1;
+    uint32_t out_x = (input.dimension(idx_w) - 1) * sx + 1;
+    uint32_t out_y = (input.dimension(idx_h) - 1) * sy + 1;
 
-    // Find the padding needed for the convolution with stride 1 in order to match output shape
-    padx = out_dims.first - (out_x - weights.dimension(idx_w) + 1);
-    pady = out_dims.second - (out_y - weights.dimension(idx_h) + 1);
     out_x += padx;
     out_y += pady;
 
@@ -516,10 +549,12 @@ inline TensorShape compute_deconvolution_upsampled_shape(const ITensorInfo &inpu
  *
  * @return the calculated shape
  */
-inline TensorShape compute_deconvolution_output_shape(const std::pair<unsigned int, unsigned int> &out_dims, const ITensorInfo &input, const ITensorInfo &weights)
+inline TensorShape compute_deconvolution_output_shape(const std::pair<unsigned int, unsigned int> &out_dims,
+                                                      const ITensorInfo                           &input,
+                                                      const ITensorInfo                           &weights)
 {
-    const TensorShape input_shape{ input.tensor_shape() };
-    const TensorShape weights_shape{ weights.tensor_shape() };
+    const TensorShape input_shape{input.tensor_shape()};
+    const TensorShape weights_shape{weights.tensor_shape()};
 
     const DataLayout data_layout = input.data_layout();
     const int        width_idx   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
@@ -527,7 +562,7 @@ inline TensorShape compute_deconvolution_output_shape(const std::pair<unsigned i
     const int        channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
     const int        batch_idx   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES);
 
-    TensorShape out_shape{ input_shape };
+    TensorShape out_shape{input_shape};
     out_shape.set(width_idx, out_dims.first);
     out_shape.set(height_idx, out_dims.second);
     out_shape.set(channel_idx, weights_shape[batch_idx]);
@@ -543,11 +578,18 @@ inline TensorShape compute_deconvolution_output_shape(const std::pair<unsigned i
  * @param[in] dilation        Dilation, in elements, across x and y
  * @param[in] batch_size_on_z True if batch size is on z axis
  * @param[in] num_groups      (Optional)  Number of groups when performing a grouped convolution
+ * @param[in] input_pad_right (Optional) Padding added to the input channel count when sizing the first dimension of the im2col matrix; may be necessary when fast-math is selected
  *
  * @return the calculated shape
  */
-inline TensorShape compute_im2col_conv_shape(const ITensorInfo *input, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation, bool batch_size_on_z,
-                                             unsigned int num_groups = 1)
+inline TensorShape compute_im2col_conv_shape(const ITensorInfo   *input,
+                                             const Size2D        &kernel_dims,
+                                             const PadStrideInfo &conv_info,
+                                             bool                 has_bias,
+                                             const Size2D        &dilation,
+                                             bool                 batch_size_on_z,
+                                             unsigned int         num_groups      = 1,
+                                             unsigned int         input_pad_right = 0)
 {
     // The output shape will be the 3D shape [ out_channels * kernel_area, num_elems_per_out_channel, batches ]                           if batch_size_on_z == true
     //                       or the 4D shape [ out_channels * kernel_area / num_groups, num_elems_per_out_channel, num_groups, batches ]  if batch_size_on_z == false
@@ -556,17 +598,19 @@ inline TensorShape compute_im2col_conv_shape(const ITensorInfo *input, const Siz
     ARM_COMPUTE_ERROR_ON(num_groups > 1 && input->data_layout() != DataLayout::NCHW);
     ARM_COMPUTE_ERROR_ON(num_groups > 1 && batch_size_on_z);
 
-    TensorShape output_shape{ input->tensor_shape() };
+    TensorShape output_shape{input->tensor_shape()};
 
     const DataLayout data_layout = input->data_layout();
     const int        width_idx   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
     const int        height_idx  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
     const int        channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
 
-    std::pair<unsigned int, unsigned int> out_dims = scaled_dimensions(output_shape[width_idx], output_shape[height_idx], kernel_dims.width, kernel_dims.height, conv_info, dilation);
-    output_shape.set(0, (output_shape[channel_idx] / num_groups * kernel_dims.area() + (has_bias ? 1 : 0))); // NOLINT
+    std::pair<unsigned int, unsigned int> out_dims = scaled_dimensions(
+        output_shape[width_idx], output_shape[height_idx], kernel_dims.width, kernel_dims.height, conv_info, dilation);
+    output_shape.set(0, ((output_shape[channel_idx] + input_pad_right) / num_groups * kernel_dims.area() +
+                         (has_bias ? 1 : 0))); // NOLINT
     output_shape.set(1, (out_dims.first * out_dims.second));
-    if(batch_size_on_z && output_shape.num_dimensions() >= 3)
+    if (batch_size_on_z && output_shape.num_dimensions() >= 3)
     {
         output_shape.remove_dimension(2);
     }
@@ -588,7 +632,7 @@ inline TensorShape compute_flatten_shape(const ITensorInfo *input)
 {
     // The output shape will be the flatten version of the input (i.e. [ width * height * channels, num_batches, ... ] ). Used for FlattenLayer and FullyConnectedLayer.
 
-    TensorShape output_shape{ input->tensor_shape() };
+    TensorShape output_shape{input->tensor_shape()};
 
     output_shape.collapse(3);
 
@@ -610,7 +654,7 @@ inline TensorShape compute_softmax_shape(const ITensorInfo *input, size_t axis =
     // - [x,y,z,w] and axis 3 will return [x*y*z, w]
     TensorShape shape2D = input->tensor_shape();
 
-    if(axis < input->num_dimensions())
+    if (axis < input->num_dimensions())
     {
         // Collapse from axis onward (this changes the shape)
         shape2D.collapse_from(axis);
@@ -624,7 +668,7 @@ inline TensorShape compute_softmax_shape(const ITensorInfo *input, size_t axis =
         shape2D.collapse(shape2D.num_dimensions());
     }
 
-    if(axis == 0)
+    if (axis == 0)
     {
         // If axis is zero the first dim should be one. Since
         // collapse is an inclusive operation we need to shift
@@ -643,15 +687,17 @@ inline TensorShape compute_softmax_shape(const ITensorInfo *input, size_t axis =
  */
 inline TensorShape compute_winograd_filter_transform_shape(const ITensorInfo &input, const WinogradInfo &winograd_info)
 {
-    TensorShape tensor_shape{ input.tensor_shape() };
+    TensorShape tensor_shape{input.tensor_shape()};
 
     const Size2D kernel_size      = winograd_info.kernel_size;
     const Size2D output_tile_size = winograd_info.output_tile_size;
-    const Size2D input_tile_size  = Size2D(output_tile_size.width + kernel_size.width - 1, output_tile_size.height + kernel_size.height - 1);
+    const Size2D input_tile_size =
+        Size2D(output_tile_size.width + kernel_size.width - 1, output_tile_size.height + kernel_size.height - 1);
 
     tensor_shape.remove_dimension(get_data_layout_dimension_index(input.data_layout(), DataLayoutDimension::WIDTH));
     tensor_shape.set(Window::DimX, input.dimension(3));
-    tensor_shape.set(Window::DimY, input.dimension(get_data_layout_dimension_index(input.data_layout(), DataLayoutDimension::CHANNEL)));
+    tensor_shape.set(Window::DimY, input.dimension(get_data_layout_dimension_index(input.data_layout(),
+                                                                                   DataLayoutDimension::CHANNEL)));
     tensor_shape.set(Window::DimZ, input_tile_size.area());
 
     return tensor_shape;
@@ -669,23 +715,22 @@ inline TensorShape compute_winograd_input_transform_shape(const ITensorInfo &inp
     const PadStrideInfo conv_info        = winograd_info.convolution_info;
     const Size2D        kernel_size      = winograd_info.kernel_size;
     const Size2D        output_tile_size = winograd_info.output_tile_size;
-    const Size2D        input_tile_size  = Size2D(output_tile_size.width + kernel_size.width - 1, output_tile_size.height + kernel_size.height - 1);
+    const Size2D        input_tile_size =
+        Size2D(output_tile_size.width + kernel_size.width - 1, output_tile_size.height + kernel_size.height - 1);
 
     const size_t idx_w = get_data_layout_dimension_index(input.data_layout(), DataLayoutDimension::WIDTH);
     const size_t idx_h = get_data_layout_dimension_index(input.data_layout(), DataLayoutDimension::HEIGHT);
     const size_t idx_c = get_data_layout_dimension_index(input.data_layout(), DataLayoutDimension::CHANNEL);
 
     // Compute the number of output tiles along the x and y direction of size "output_tile_size"
-    const Size2D num_tiles = compute_winograd_convolution_tiles(Size2D(input.tensor_shape()[idx_w], input.tensor_shape()[idx_h]),
-                                                                kernel_size,
-                                                                output_tile_size,
-                                                                conv_info);
+    const Size2D num_tiles = compute_winograd_convolution_tiles(
+        Size2D(input.tensor_shape()[idx_w], input.tensor_shape()[idx_h]), kernel_size, output_tile_size, conv_info);
 
     const unsigned int width  = input.tensor_shape()[idx_c];
     const unsigned int height = num_tiles.area();
     const unsigned int depth  = input_tile_size.area();
 
-    TensorShape output_shape{ input.tensor_shape() };
+    TensorShape output_shape{input.tensor_shape()};
     output_shape.set(0, width);
     output_shape.set(1, height);
     output_shape.set(2, depth);
@@ -708,12 +753,12 @@ inline TensorShape compute_winograd_output_transform_shape(const ITensorInfo &in
     const DataLayout    data_layout      = winograd_info.output_data_layout;
 
     // Compute output shape
-    unsigned int output_width  = 0;
-    unsigned int output_height = 0;
+    unsigned int output_width             = 0;
+    unsigned int output_height            = 0;
     std::tie(output_width, output_height) = scaled_dimensions(input_dimensions.width, input_dimensions.height,
                                                               kernel_size.width, kernel_size.height, conv_info);
 
-    TensorShape tensor_shape{ input.tensor_shape() };
+    TensorShape tensor_shape{input.tensor_shape()};
 
     // Output dimension
     const unsigned int out_w = output_width;
@@ -729,20 +774,21 @@ inline TensorShape compute_winograd_output_transform_shape(const ITensorInfo &in
 
 /** Calculate the deep convolution shape output shape of a tensor
  *
- * @param[in] input     Input tensor info
- * @param[in] weights   Weights tensor info
- * @param[in] conv_info Contains padding and stride information
+ * @param[in] input_shape       Input tensor shape
+ * @param[in] input_data_layout Input data layout
+ * @param[in] weights_shape     Weights tensor shape
+ * @param[in] conv_info         Contains padding and stride information
  *
  * @return the calculated shape
  */
-inline TensorShape compute_deep_convolution_shape(const ITensorInfo &input, const ITensorInfo &weights, PadStrideInfo conv_info)
+inline TensorShape compute_deep_convolution_shape(const TensorShape   &input_shape,
+                                                  DataLayout           input_data_layout,
+                                                  const TensorShape   &weights_shape,
+                                                  const PadStrideInfo &conv_info)
 {
-    const TensorShape input_shape{ input.tensor_shape() };
-    const TensorShape weights_shape{ weights.tensor_shape() };
-
-    const size_t idx_width   = get_data_layout_dimension_index(input.data_layout(), DataLayoutDimension::WIDTH);
-    const size_t idx_height  = get_data_layout_dimension_index(input.data_layout(), DataLayoutDimension::HEIGHT);
-    const size_t idx_channel = get_data_layout_dimension_index(input.data_layout(), DataLayoutDimension::CHANNEL);
+    const size_t idx_width   = get_data_layout_dimension_index(input_data_layout, DataLayoutDimension::WIDTH);
+    const size_t idx_height  = get_data_layout_dimension_index(input_data_layout, DataLayoutDimension::HEIGHT);
+    const size_t idx_channel = get_data_layout_dimension_index(input_data_layout, DataLayoutDimension::CHANNEL);
 
     const unsigned int input_width         = input_shape[idx_width];
     const unsigned int input_height        = input_shape[idx_height];
@@ -751,9 +797,10 @@ inline TensorShape compute_deep_convolution_shape(const ITensorInfo &input, cons
     const unsigned int weights_out_channel = weights_shape[3];
     unsigned int       output_width        = 0;
     unsigned int       output_height       = 0;
-    std::tie(output_width, output_height) = scaled_dimensions(input_width, input_height, weights_width, weights_height, conv_info);
+    std::tie(output_width, output_height) =
+        scaled_dimensions(input_width, input_height, weights_width, weights_height, conv_info);
 
-    TensorShape output_shape{ input_shape };
+    TensorShape output_shape{input_shape};
     output_shape.set(idx_width, output_width);
     output_shape.set(idx_height, output_height);
     output_shape.set(idx_channel, weights_out_channel);
@@ -761,6 +808,53 @@ inline TensorShape compute_deep_convolution_shape(const ITensorInfo &input, cons
     return output_shape;
 }
 
+/** Calculate the deep convolution output shape of a tensor
+ *
+ * @param[in] input     Input tensor info
+ * @param[in] weights   Weights tensor info
+ * @param[in] conv_info Contains padding and stride information
+ *
+ * @return the calculated shape
+ */
+inline TensorShape
+compute_deep_convolution_shape(const ITensorInfo &input, const ITensorInfo &weights, const PadStrideInfo &conv_info)
+{
+    return compute_deep_convolution_shape(input.tensor_shape(), input.data_layout(), weights.tensor_shape(), conv_info);
+}
+
+/** Calculate the indirect buffer output shape used by the indirect convolution function
+ *
+ * @param[in] input_shape       Input tensor shape
+ * @param[in] input_data_layout Input data layout
+ * @param[in] weights_shape     Weights tensor shape
+ * @param[in] conv_info         Contains padding and stride information
+ * @param[in] desc              Contains the direct/indirect convolution compute arguments, such as the tiling dimensions
+ *
+ * @return the calculated shape
+ */
+inline TensorShape compute_indirect_buffer_shape(const TensorShape                 &input_shape,
+                                                 DataLayout                         input_data_layout,
+                                                 const TensorShape                 &weights_shape,
+                                                 const PadStrideInfo               &conv_info,
+                                                 const DirectConvComputeKernelInfo &desc)
+{
+    ARM_COMPUTE_ERROR_ON_MSG(input_data_layout != DataLayout::NHWC, "The data layout can only be NHWC");
+    ARM_COMPUTE_ERROR_ON_MSG(desc.m0 <= 0 || desc.m0 > 8, "M0 can only be greater than 0 and less than or equal to 8");
+
+    const unsigned int m0 = desc.m0;
+    const unsigned int kw = weights_shape[1];
+    const unsigned int kh = weights_shape[2];
+
+    TensorShape output_conv2d_shape =
+        compute_deep_convolution_shape(input_shape, input_data_layout, weights_shape, conv_info);
+
+    const unsigned int output_w = m0 * kw * kh;
+    const unsigned int output_h = DIV_CEIL(output_conv2d_shape[1] * output_conv2d_shape[2], m0);
+    const unsigned int output_b = output_conv2d_shape[3];
+
+    return TensorShape(output_w, output_h, output_b);
+}
+
 /** Calculate the min/max shape output shape of a tensor
  *
  * @param[in] input Input tensor info
@@ -769,7 +863,7 @@ inline TensorShape compute_deep_convolution_shape(const ITensorInfo &input, cons
  */
 inline TensorShape compute_min_max_shape(const ITensorInfo *input)
 {
-    TensorShape output_shape{ input->tensor_shape() };
+    TensorShape output_shape{input->tensor_shape()};
     output_shape.set(Window::DimX, 2);
     output_shape.remove_dimension(1);
     output_shape.remove_dimension(1);
@@ -786,29 +880,63 @@ inline TensorShape compute_min_max_shape(const ITensorInfo *input)
  */
 inline TensorShape compute_pool_shape(const ITensorInfo &input, PoolingLayerInfo pool_info)
 {
-    unsigned int pooled_w = 0;
-    unsigned int pooled_h = 0;
+    int pooled_w = 0;
+    int pooled_h = 0;
+
+    TensorShape output_shape{input.tensor_shape()};
 
-    TensorShape output_shape{ input.tensor_shape() };
+    const bool is_global_pooling = pool_info.is_global_pooling;
+    const int  idx_width         = get_data_layout_dimension_index(input.data_layout(), DataLayoutDimension::WIDTH);
+    const int  idx_height        = get_data_layout_dimension_index(input.data_layout(), DataLayoutDimension::HEIGHT);
+    const int  input_width       = input.tensor_shape()[idx_width];
+    const int  input_height      = input.tensor_shape()[idx_height];
+    const int  pool_size_x       = is_global_pooling ? output_shape[idx_width] : pool_info.pool_size.width;
+    const int  pool_size_y       = is_global_pooling ? output_shape[idx_height] : pool_info.pool_size.height;
 
-    const bool         is_global_pooling = pool_info.is_global_pooling;
-    const unsigned int idx_width         = get_data_layout_dimension_index(input.data_layout(), DataLayoutDimension::WIDTH);
-    const unsigned int idx_height        = get_data_layout_dimension_index(input.data_layout(), DataLayoutDimension::HEIGHT);
-    const unsigned int pool_size_x       = is_global_pooling ? output_shape[idx_width] : pool_info.pool_size.width;
-    const unsigned int pool_size_y       = is_global_pooling ? output_shape[idx_height] : pool_info.pool_size.height;
+    std::tie(pooled_w, pooled_h) =
+        scaled_dimensions_signed(input_width, input_height, pool_size_x, pool_size_y, pool_info.pad_stride_info);
 
-    std::tie(pooled_w, pooled_h) = scaled_dimensions(output_shape[idx_width],
-                                                     output_shape[idx_height],
-                                                     pool_size_x,
-                                                     pool_size_y,
-                                                     pool_info.pad_stride_info);
+    ARM_COMPUTE_ERROR_ON_MSG((pooled_w < 1 || pooled_h < 1), "Calculated output dimension size is invalid");
 
-    output_shape.set(idx_width, pooled_w);
-    output_shape.set(idx_height, pooled_h);
+    output_shape.set(idx_width, static_cast<size_t>(pooled_w));
+    output_shape.set(idx_height, static_cast<size_t>(pooled_h));
 
     return output_shape;
 }
 
+/** Calculate the output unpool shape of a tensor
+ *
+ * @param[in] input     Input tensor info
+ * @param[in] pool_info Pooling layer info
+ *
+ * @return the calculated shape
+ */
+inline TensorShape compute_unpool_shape(const ITensorInfo &input, PoolingLayerInfo pool_info)
+{
+    const unsigned int idx_width   = get_data_layout_dimension_index(input.data_layout(), DataLayoutDimension::WIDTH);
+    const unsigned int idx_height  = get_data_layout_dimension_index(input.data_layout(), DataLayoutDimension::HEIGHT);
+    const TensorShape  input_shape = input.tensor_shape();
+    ARM_COMPUTE_ERROR_ON(input_shape[idx_height] <= 1 || input_shape[idx_width] <= 1);
+    const PadStrideInfo pad_stride_info = pool_info.pad_stride_info;
+    const unsigned int  stride_x        = pad_stride_info.stride().first;
+    const unsigned int  stride_y        = pad_stride_info.stride().second;
+
+    const int pad_left   = pad_stride_info.pad_left();
+    const int pad_top    = pad_stride_info.pad_top();
+    const int pad_right  = pad_stride_info.pad_right();
+    const int pad_bottom = pad_stride_info.pad_bottom();
+
+    TensorShape        output_shape = input_shape;
+    const unsigned int out_width =
+        (input_shape[idx_width] - 1) * stride_x - pad_left - pad_right + pool_info.pool_size.width;
+    const unsigned int out_height =
+        (input_shape[idx_height] - 1) * stride_y - pad_top - pad_bottom + pool_info.pool_size.height;
+
+    output_shape.set(idx_width, out_width);
+    output_shape.set(idx_height, out_height);
+    return output_shape;
+}
+
 /** Calculate the output roi align shape of a tensor
  *
  * @param[in] input     Input tensor info
@@ -817,9 +945,10 @@ inline TensorShape compute_pool_shape(const ITensorInfo &input, PoolingLayerInfo
  *
  * @return the calculated shape
  */
-inline TensorShape compute_roi_align_shape(const ITensorInfo &input, const ITensorInfo &rois, ROIPoolingLayerInfo pool_info)
+inline TensorShape
+compute_roi_align_shape(const ITensorInfo &input, const ITensorInfo &rois, ROIPoolingLayerInfo pool_info)
 {
-    TensorShape output_shape{ input.tensor_shape() };
+    TensorShape output_shape{input.tensor_shape()};
 
     const unsigned int idx_width  = get_data_layout_dimension_index(input.data_layout(), DataLayoutDimension::WIDTH);
     const unsigned int idx_height = get_data_layout_dimension_index(input.data_layout(), DataLayoutDimension::HEIGHT);
@@ -840,7 +969,7 @@ inline TensorShape compute_roi_align_shape(const ITensorInfo &input, const ITens
  */
 inline TensorShape compute_rnn_shape(const ITensorInfo *input, const unsigned int batch_size)
 {
-    TensorShape output_shape{ input->tensor_shape() };
+    TensorShape output_shape{input->tensor_shape()};
     output_shape.set(1, batch_size);
 
     return output_shape;
@@ -855,15 +984,21 @@ inline TensorShape compute_rnn_shape(const ITensorInfo *input, const unsigned in
  *
  * @return the calculated shape
  */
-inline TensorShape compute_mm_shape(const ITensorInfo &input0, const ITensorInfo &input1, bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info)
+inline TensorShape compute_mm_shape(const ITensorInfo     &input0,
+                                    const ITensorInfo     &input1,
+                                    bool                   is_interleaved_transposed,
+                                    const GEMMReshapeInfo &reshape_info)
 {
     ARM_COMPUTE_ERROR_ON_MSG(input0.num_dimensions() > 4, "The number of dimensions for the matrix A must be <= 4");
-    ARM_COMPUTE_ERROR_ON_MSG(is_interleaved_transposed && reshape_info.reinterpret_input_as_3d(), "The first input tensor cannot be reinterpreted as 3D if is_interleaved_transposed is true");
+    ARM_COMPUTE_ERROR_ON_MSG(
+        is_interleaved_transposed && reshape_info.reinterpret_input_as_3d(),
+        "The first input tensor cannot be reinterpreted as 3D if is_interleaved_transposed is true");
 
     const bool reinterpret_input_as_3d  = reshape_info.reinterpret_input_as_3d();
     const bool reinterpret_output_as_3d = reshape_info.depth_output_gemm3d() != 0;
     const int  depth_output_gemm3d      = reinterpret_output_as_3d ? reshape_info.depth_output_gemm3d() : 1;
-    const int  m                        = reshape_info.reinterpret_input_as_3d() ? input0.dimension(1) * input0.dimension(2) : input0.dimension(1);
+    const int  m =
+        reshape_info.reinterpret_input_as_3d() ? input0.dimension(1) * input0.dimension(2) : input0.dimension(1);
 
     // If the output of GEMM has to be reinterpreted as 3D, the number of input0 rows (M) is obtained collapsing the second and third
     // dimension of the output tensor
@@ -872,7 +1007,7 @@ inline TensorShape compute_mm_shape(const ITensorInfo &input0, const ITensorInfo
     const int dim2 = reinterpret_input_as_3d ? input0.tensor_shape()[3] : input0.tensor_shape()[2];
     const int dim3 = reinterpret_input_as_3d ? 1 : input0.tensor_shape()[3];
 
-    TensorShape output_shape{ input0.tensor_shape() };
+    TensorShape output_shape{input0.tensor_shape()};
 
     output_shape.set(0, dim0);
     output_shape.set(1, dim1);
@@ -885,15 +1020,14 @@ inline TensorShape compute_mm_shape(const ITensorInfo &input0, const ITensorInfo
 
 /** Calculate the matrix multiplication output shape of two tensors
  *
- * @note Deprecated. Remove when GEMMReshapeInfo is not used anymore by any other kernels
- *
  * @param[in] input0    First input tensor info
  * @param[in] input1    Second input tensor info
  * @param[in] gemm_info GEMM reshape info
  *
  * @return the calculated shape
  */
-inline TensorShape compute_mm_shape(const ITensorInfo &input0, const ITensorInfo &input1, const GEMMReshapeInfo &gemm_info)
+inline TensorShape
+compute_mm_shape(const ITensorInfo &input0, const ITensorInfo &input1, const GEMMReshapeInfo &gemm_info)
 {
     ARM_COMPUTE_UNUSED(input1);
     ARM_COMPUTE_ERROR_ON_MSG(input0.num_dimensions() > 4, "The number of dimensions for the matrix A must be <= 4");
@@ -902,9 +1036,9 @@ inline TensorShape compute_mm_shape(const ITensorInfo &input0, const ITensorInfo
     const bool reinterpret_output_as_3d = gemm_info.depth_output_gemm3d() != 0;
     const int  depth_output_gemm3d      = reinterpret_output_as_3d ? gemm_info.depth_output_gemm3d() : 1;
 
-    TensorShape output_shape{ input0.tensor_shape() };
+    TensorShape output_shape{input0.tensor_shape()};
 
-    if(!reinterpret_input_as_3d && !reinterpret_output_as_3d)
+    if (!reinterpret_input_as_3d && !reinterpret_output_as_3d)
     {
         output_shape.set(0, gemm_info.n());
         output_shape.set(1, gemm_info.m());
@@ -931,7 +1065,8 @@ inline TensorShape compute_mm_shape(const ITensorInfo &input0, const ITensorInfo
  *
  * @return the calculated shape
  */
-inline TensorShape compute_mm_shape(const ITensorInfo &input0, const ITensorInfo &input1, const GEMMKernelInfo &gemm_info)
+inline TensorShape
+compute_mm_shape(const ITensorInfo &input0, const ITensorInfo &input1, const GEMMKernelInfo &gemm_info)
 {
     ARM_COMPUTE_UNUSED(input1);
     ARM_COMPUTE_ERROR_ON_MSG(input0.num_dimensions() > 4, "The number of dimensions for the matrix A must be <= 4");
@@ -940,9 +1075,9 @@ inline TensorShape compute_mm_shape(const ITensorInfo &input0, const ITensorInfo
     const bool         reinterpret_output_as_3d = gemm_info.depth_output_gemm3d != 0;
     const unsigned int depth_output_gemm3d      = reinterpret_output_as_3d ? gemm_info.depth_output_gemm3d : 1;
 
-    TensorShape output_shape{ input0.tensor_shape() };
+    TensorShape output_shape{input0.tensor_shape()};
 
-    if(!reinterpret_input_as_3d && !reinterpret_output_as_3d)
+    if (!reinterpret_input_as_3d && !reinterpret_output_as_3d)
     {
         output_shape.set(0, gemm_info.n);
         output_shape.set(1, gemm_info.m);
@@ -963,20 +1098,50 @@ inline TensorShape compute_mm_shape(const ITensorInfo &input0, const ITensorInfo
 
 /** Calculate the matrix multiplication output shape of two tensors
  *
+ * @param[in] input0      First input tensor shape
+ * @param[in] input1      Second input tensor shape
+ * @param[in] matmul_info Batch MatMul Kernel info to know which matrix is transposed
+ *
+ * @return the calculated shape
+ */
+inline TensorShape
+compute_matmul_shape(const TensorShape &input0, const TensorShape &input1, const MatMulKernelInfo &matmul_info)
+{
+    TensorShape output_shape{input0};
+
+    if (matmul_info.adj_lhs)
+    {
+        output_shape.set(1, input0[0]); // The vertical (M) dimension
+    }
+
+    if (matmul_info.adj_rhs)
+    {
+        output_shape.set(0, input1[1]); // The horizontal (N) dimension
+    }
+    else
+    {
+        output_shape.set(0, input1[0]); // The horizontal (N) dimension
+    }
+
+    return output_shape;
+}
+/** Calculate the output stage shape of a tensor
+ *
  * @param[in] input           Input tensor info
  * @param[in] gemm_3d_depth   (Optional)  GEMM 3d depth
  * @param[in] batch_size_on_z (Optional) True if batch size is on z axis
  *
  * @return the calculated shape
  */
-inline TensorShape compute_output_stage_shape(const ITensorInfo &input, unsigned int gemm_3d_depth = 1, bool batch_size_on_z = false)
+inline TensorShape
+compute_output_stage_shape(const ITensorInfo &input, unsigned int gemm_3d_depth = 1, bool batch_size_on_z = false)
 {
     ARM_COMPUTE_ERROR_ON(input.data_layout() != DataLayout::NHWC && gemm_3d_depth > 1);
 
     TensorShape output_shape = input.tensor_shape();
-    if(gemm_3d_depth > 1)
+    if (gemm_3d_depth > 1)
     {
-        if(batch_size_on_z)
+        if (batch_size_on_z)
         {
             output_shape.shift_right(1);
         }
@@ -1001,11 +1166,16 @@ inline TensorShape compute_output_stage_shape(const ITensorInfo &input, unsigned
  * @return the calculated shape
  */
 inline TensorShape compute_strided_slice_shape(const ITensorInfo &input,
-                                               const Coordinates &starts, const Coordinates &ends, const Coordinates &strides,
-                                               int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+                                               const Coordinates &starts,
+                                               const Coordinates &ends,
+                                               const Coordinates &strides,
+                                               int32_t            begin_mask,
+                                               int32_t            end_mask,
+                                               int32_t            shrink_axis_mask)
 {
     using namespace arm_compute::helpers::tensor_transform;
-    return compute_strided_slice_output_shape(input.tensor_shape(), starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
+    return compute_strided_slice_output_shape(input.tensor_shape(), starts, ends, strides, begin_mask, end_mask,
+                                              shrink_axis_mask);
 }
 
 /** Calculate the slice output shape of a tensor
@@ -1016,60 +1186,72 @@ inline TensorShape compute_strided_slice_shape(const ITensorInfo &input,
  *
  * @return the calculated shape
  */
-inline TensorShape compute_slice_shape(const TensorShape &input_shape, const Coordinates &starts, const Coordinates &ends)
+inline TensorShape
+compute_slice_shape(const TensorShape &input_shape, const Coordinates &starts, const Coordinates &ends)
 {
     using namespace arm_compute::helpers::tensor_transform;
 
-    return compute_strided_slice_output_shape(input_shape,
-                                              starts, ends, BiStrides(),
-                                              0, construct_slice_end_mask(ends), 0);
+    return compute_strided_slice_output_shape(input_shape, starts, ends, BiStrides(), 0, construct_slice_end_mask(ends),
+                                              0);
 }
 
 /** Calculate the batch to space output shape of a tensor
  *
- * @param[in] input   Input tensor info
- * @param[in] block_x Block shape x value
- * @param[in] block_y Block shape y value
+ * @param[in] data_layout Data layout
+ * @param[in] input       Input tensor shape
+ * @param[in] block_x     Block shape x value
+ * @param[in] block_y     Block shape y value
+ * @param[in] crop_info   Information about how the output shape is cropped after batch to space is performed
  *
  * @return the calculated shape
  */
-inline TensorShape compute_batch_to_space_shape(const ITensorInfo *input, const int block_x, const int block_y)
+inline TensorShape compute_batch_to_space_shape(
+    DataLayout data_layout, const TensorShape &input, int block_x, int block_y, const CropInfo &crop_info = CropInfo{})
 {
-    ARM_COMPUTE_ERROR_ON(block_x <= 0 || block_y <= 0);
+    ARM_COMPUTE_ERROR_ON(block_x < 1 || block_y < 1);
 
-    const DataLayout data_layout = input->data_layout();
-    const int        idx_width   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
-    const int        idx_height  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
-    const int        idx_batch   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES);
+    const int idx_width  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+    const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+    const int idx_batch  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES);
+
+    TensorShape output_shape{input};
+
+    unsigned int       new_width   = input[idx_width] * static_cast<unsigned int>(block_x);
+    unsigned int       new_height  = input[idx_height] * static_cast<unsigned int>(block_y);
+    const unsigned int width_crop  = crop_info.left + crop_info.right;
+    const unsigned int height_crop = crop_info.top + crop_info.bottom;
+    ARM_COMPUTE_ERROR_ON(new_width <= width_crop);
+    ARM_COMPUTE_ERROR_ON(new_height <= height_crop);
+    new_width -= width_crop;
+    new_height -= height_crop;
 
-    TensorShape output_shape{ input->tensor_shape() };
-    output_shape.set(idx_width, input->tensor_shape()[idx_width] * block_x);
-    output_shape.set(idx_height, input->tensor_shape()[idx_height] * block_y);
-    output_shape.set(idx_batch, input->tensor_shape()[idx_batch] / (block_x * block_y));
+    output_shape.set(idx_width, new_width);
+    output_shape.set(idx_height, new_height);
+    output_shape.set(idx_batch, input[idx_batch] / (block_x * block_y));
 
     return output_shape;
 }
 
 /** Calculate the depth to space output shape of a tensor
  *
- * @param[in] input Input tensor info
- * @param[in] block Block shape value
+ * @param[in] input_shape Input tensor shape
+ * @param[in] data_layout Operation data layout
+ * @param[in] block       Block shape value
  *
  * @return the calculated shape
  */
-inline TensorShape compute_depth_to_space_shape(const ITensorInfo *input, int block)
+inline TensorShape compute_depth_to_space_shape(const TensorShape &input_shape, DataLayout data_layout, int block)
 {
     ARM_COMPUTE_ERROR_ON(block < 2);
 
-    const DataLayout data_layout = input->data_layout();
-    const int        idx_width   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
-    const int        idx_height  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
-    const int        idx_channel = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+    const int idx_width   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+    const int idx_height  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+    const int idx_channel = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
 
-    TensorShape output_shape{ input->tensor_shape() };
-    output_shape.set(idx_width, input->dimension(idx_width) * block);
-    output_shape.set(idx_height, input->dimension(idx_height) * block);
-    output_shape.set(idx_channel, input->dimension(idx_channel) / (block * block));
+    TensorShape output_shape{input_shape};
+    output_shape.set(idx_width, input_shape[idx_width] * block);
+    output_shape.set(idx_height, input_shape[idx_height] * block);
+    output_shape.set(idx_channel, input_shape[idx_channel] / (block * block));
 
     return output_shape;
 }
@@ -1087,10 +1269,10 @@ inline TensorShape compute_split_shape(const ITensorInfo *input, unsigned int ax
     TensorShape empty_shape;
     empty_shape.set(0, 0);
 
-    TensorShape out_shape{ input->tensor_shape() };
+    TensorShape out_shape{input->tensor_shape()};
 
     // Return empty shape if axis is invalid
-    if(axis > input->tensor_shape().num_dimensions())
+    if (axis > input->tensor_shape().num_dimensions())
     {
         return empty_shape;
     }
@@ -1098,7 +1280,7 @@ inline TensorShape compute_split_shape(const ITensorInfo *input, unsigned int ax
     size_t axis_size = out_shape[axis];
 
     // Return empty shape if num_split is not valid
-    if(axis_size % num_splits)
+    if (axis_size % num_splits)
     {
         return empty_shape;
     }
@@ -1117,18 +1299,22 @@ inline TensorShape compute_split_shape(const ITensorInfo *input, unsigned int ax
  *
  * @return the calculated shape
  */
-inline TensorShape compute_space_to_batch_shape(const ITensorInfo *input, const int block_x, const int block_y, const Size2D &padding_left, const Size2D &padding_right)
+inline TensorShape compute_space_to_batch_shape(
+    const ITensorInfo *input, int block_x, int block_y, const Size2D &padding_left, const Size2D &padding_right)
 {
-    TensorShape output_shape{ input->tensor_shape() };
+    TensorShape output_shape{input->tensor_shape()};
 
     const DataLayout data_layout = input->data_layout();
     const int        idx_width   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
     const int        idx_height  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
     const int        idx_batch   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES);
 
-    output_shape.set(idx_width, input->tensor_shape()[idx_width] * block_x + padding_left.x() + padding_right.x());
-    output_shape.set(idx_height, input->tensor_shape()[idx_height] * block_y + padding_left.y() + padding_right.y());
-    output_shape.set(idx_batch, input->tensor_shape()[idx_batch] / (block_x * block_y));
+    ARM_COMPUTE_ERROR_ON((input->tensor_shape()[idx_width] + padding_left.x() + padding_right.x()) % block_x != 0);
+    ARM_COMPUTE_ERROR_ON((input->tensor_shape()[idx_height] + padding_left.y() + padding_right.y()) % block_y != 0);
+
+    output_shape.set(idx_width, (input->tensor_shape()[idx_width] + padding_left.x() + padding_right.x()) / block_x);
+    output_shape.set(idx_height, (input->tensor_shape()[idx_height] + padding_left.y() + padding_right.y()) / block_y);
+    output_shape.set(idx_batch, input->tensor_shape()[idx_batch] * block_x * block_y);
 
     return output_shape;
 }
@@ -1142,16 +1328,16 @@ inline TensorShape compute_space_to_batch_shape(const ITensorInfo *input, const
  */
 inline TensorShape compute_space_to_depth_shape(const ITensorInfo *input, int32_t block_shape)
 {
-    TensorShape output_shape{ input->tensor_shape() };
+    TensorShape output_shape{input->tensor_shape()};
 
     const DataLayout data_layout = input->data_layout();
     const int        idx_width   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
     const int        idx_height  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
     const int        idx_depth   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
 
-    output_shape.set(idx_width, input->tensor_shape()[idx_width] * block_shape);
-    output_shape.set(idx_height, input->tensor_shape()[idx_height] * block_shape);
-    output_shape.set(idx_depth, input->tensor_shape()[idx_depth] / (block_shape * block_shape));
+    output_shape.set(idx_width, input->tensor_shape()[idx_width] / block_shape);
+    output_shape.set(idx_height, input->tensor_shape()[idx_height] / block_shape);
+    output_shape.set(idx_depth, input->tensor_shape()[idx_depth] * (block_shape * block_shape));
 
     return output_shape;
 }
@@ -1187,7 +1373,7 @@ inline TensorShape compute_prior_box_shape(const ITensorInfo &input, const Prior
 inline TensorShape compute_padded_shape(const TensorShape &input_shape, const PaddingList &padding)
 {
     TensorShape padded_shape = input_shape;
-    for(size_t dim = 0; dim < padding.size(); ++dim)
+    for (size_t dim = 0; dim < padding.size(); ++dim)
     {
         const auto    &padding_pair   = padding[dim];
         const uint32_t shape_on_index = (padded_shape.num_dimensions() <= dim) ? 1 : input_shape[dim];
@@ -1206,7 +1392,7 @@ inline TensorShape compute_padded_shape(const TensorShape &input_shape, const Pa
 inline TensorShape compute_tiled_shape(const TensorShape &input_shape, const Multiples &multiples)
 {
     TensorShape tiled_shape = input_shape;
-    for(size_t dim = 0; dim < multiples.size(); ++dim)
+    for (size_t dim = 0; dim < multiples.size(); ++dim)
     {
         tiled_shape.set(dim, input_shape[dim] * multiples[dim]);
     }
@@ -1223,9 +1409,9 @@ inline TensorShape compute_tiled_shape(const TensorShape &input_shape, const Mul
  */
 inline TensorShape compute_reduced_shape(const TensorShape &input, unsigned int axis, bool keep_dims = true)
 {
-    TensorShape output_shape{ input };
+    TensorShape output_shape{input};
 
-    if(!keep_dims)
+    if (!keep_dims)
     {
         output_shape.remove_dimension(axis);
     }
@@ -1318,14 +1504,14 @@ inline TensorShape calculate_concatenate_shape(const std::vector<T *> &input, si
 
 #if defined(ARM_COMPUTE_ASSERTS_ENABLED)
     // All dimensions must match except the axis one
-    for(unsigned int i = 0; i < MAX_DIMS; ++i)
+    for (unsigned int i = 0; i < MAX_DIMS; ++i)
     {
-        if(i == axis)
+        if (i == axis)
         {
             continue;
         }
 
-        for(const auto &tensor : input)
+        for (const auto &tensor : input)
         {
             ARM_COMPUTE_ERROR_ON(tensor == nullptr);
             const TensorShape shape = extract_shape(tensor);
@@ -1336,7 +1522,7 @@ inline TensorShape calculate_concatenate_shape(const std::vector<T *> &input, si
 
     // Calculate output shape
     size_t new_size = 0;
-    for(const auto &tensor : input)
+    for (const auto &tensor : input)
     {
         const TensorShape shape = extract_shape(tensor);
         new_size += shape[axis];
@@ -1359,14 +1545,14 @@ inline TensorShape compute_stack_shape(const ITensorInfo &a, unsigned int axis,
     ARM_COMPUTE_ERROR_ON(axis > a.num_dimensions());
     ARM_COMPUTE_ERROR_ON(a.num_dimensions() > 4);
 
-    TensorShape shape_out{ a.tensor_shape() };
+    TensorShape shape_out{a.tensor_shape()};
     shape_out.set(axis, num_tensors);
 
     unsigned int i_shift = 0;
 
-    for(unsigned int i = 0; i < a.num_dimensions(); ++i)
+    for (unsigned int i = 0; i < a.num_dimensions(); ++i)
     {
-        if(i == axis)
+        if (i == axis)
         {
             i_shift++;
         }
@@ -1376,18 +1562,177 @@ inline TensorShape compute_stack_shape(const ITensorInfo &a, unsigned int axis,
     return shape_out;
 }
 
-inline TensorShape compute_gather_shape(const TensorShape &input_shape, const TensorShape &indices_shape, uint32_t actual_axis)
+/** Calculate the output shape of 3d Convolution
+ *
+ * @param[in] src         Input tensor shape
+ * @param[in] weights     Weights tensor shape
+ * @param[in] conv3d_info 3d Convolution Parameters object
+ *
+ * @return the calculated shape
+ */
+inline TensorShape
+compute_conv3d_shape(const TensorShape &src, const TensorShape &weights, const Conv3dInfo &conv3d_info)
+{
+    // Weight tensor shape indices (D H W Cin Cout), listed from dimension 4 down to dimension 0
+    constexpr unsigned int weights_depth_dim  = 4u;
+    constexpr unsigned int weights_height_dim = 3u;
+    constexpr unsigned int weights_width_dim  = 2u;
+    constexpr unsigned int weights_CHout_dim  = 0u;
+
+    // Source/Destination Tensor shape indices (N D H W C), listed from dimension 4 down to dimension 0
+    constexpr unsigned int batch_dim   = 4u;
+    constexpr unsigned int depth_dim   = 3u;
+    constexpr unsigned int height_dim  = 2u;
+    constexpr unsigned int width_dim   = 1u;
+    constexpr unsigned int channel_dim = 0u;
+
+    TensorShape  output_shape{src};
+    const size_t pad_left   = conv3d_info.padding.left;
+    const size_t pad_right  = conv3d_info.padding.right;
+    const size_t pad_top    = conv3d_info.padding.top;
+    const size_t pad_bottom = conv3d_info.padding.bottom;
+    const size_t pad_front  = conv3d_info.padding.front;
+    const size_t pad_back   = conv3d_info.padding.back;
+    const size_t dilation_x = conv3d_info.dilation.width;
+    const size_t dilation_y = conv3d_info.dilation.height;
+    const size_t dilation_z = conv3d_info.dilation.depth;
+    const size_t stride_x   = conv3d_info.stride.x();
+    const size_t stride_y   = conv3d_info.stride.y();
+    const size_t stride_z   = conv3d_info.stride.z();
+
+    int output_width_size  = 0;
+    int output_height_size = 0;
+    int output_depth_size  = 0;
+
+    switch (conv3d_info.round_type)
+    {
+        case DimensionRoundingType::FLOOR:
+            output_width_size =
+                static_cast<int>(std::floor((static_cast<float>(src[width_dim] + pad_left + pad_right -
+                                                                (dilation_x * (weights[weights_width_dim] - 1) + 1)) /
+                                             stride_x) +
+                                            1));
+            output_height_size =
+                static_cast<int>(std::floor((static_cast<float>(src[height_dim] + pad_top + pad_bottom -
+                                                                (dilation_y * (weights[weights_height_dim] - 1) + 1)) /
+                                             stride_y) +
+                                            1));
+            output_depth_size =
+                static_cast<int>(std::floor((static_cast<float>(src[depth_dim] + pad_front + pad_back -
+                                                                (dilation_z * (weights[weights_depth_dim] - 1) + 1)) /
+                                             stride_z) +
+                                            1));
+            break;
+        case DimensionRoundingType::CEIL:
+            output_width_size =
+                static_cast<int>(std::ceil((static_cast<float>(src[width_dim] + pad_left + pad_right -
+                                                               (dilation_x * (weights[weights_width_dim] - 1) + 1)) /
+                                            stride_x) +
+                                           1));
+            output_height_size =
+                static_cast<int>(std::ceil((static_cast<float>(src[height_dim] + pad_top + pad_bottom -
+                                                               (dilation_y * (weights[weights_height_dim] - 1) + 1)) /
+                                            stride_y) +
+                                           1));
+            output_depth_size =
+                static_cast<int>(std::ceil((static_cast<float>(src[depth_dim] + pad_front + pad_back -
+                                                               (dilation_z * (weights[weights_depth_dim] - 1) + 1)) /
+                                            stride_z) +
+                                           1));
+            break;
+        default:
+            ARM_COMPUTE_ERROR("Unsupported rounding type");
+    }
+
+    output_shape.set(batch_dim, src[batch_dim]);
+    output_shape.set(width_dim, output_width_size);
+    output_shape.set(height_dim, output_height_size);
+    output_shape.set(depth_dim, output_depth_size);
+    output_shape.set(channel_dim, weights[weights_CHout_dim]);
+    return output_shape;
+}
+
+/** Calculate the output pool3d shape of a tensor
+ *
+ * @param[in] src         Input tensor shape
+ * @param[in] pool3d_info Pooling layer info
+ *
+ * @return the calculated shape
+ */
+inline TensorShape compute_pool3d_shape(const TensorShape &src, Pooling3dLayerInfo pool3d_info)
+{
+    TensorShape output_shape{src};
+
+    const auto data_layout      = DataLayout::NDHWC;
+    const int  idx_width        = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+    const int  idx_height       = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+    const int  idx_depth        = get_data_layout_dimension_index(data_layout, DataLayoutDimension::DEPTH);
+    const int  pool_size_width  = pool3d_info.is_global_pooling ? src[idx_width] : pool3d_info.pool_size.width;
+    const int  pool_size_height = pool3d_info.is_global_pooling ? src[idx_height] : pool3d_info.pool_size.height;
+    const int  pool_size_depth  = pool3d_info.is_global_pooling ? src[idx_depth] : pool3d_info.pool_size.depth;
+    int        output_width     = 0;
+    int        output_height    = 0;
+    int        output_depth     = 0;
+
+    std::tie(output_width, output_height, output_depth) =
+        scaled_3d_dimensions_signed(src[idx_width], src[idx_height], src[idx_depth], pool_size_width, pool_size_height,
+                                    pool_size_depth, pool3d_info);
+
+    ARM_COMPUTE_ERROR_ON_MSG((output_width < 1 || output_height < 1 || output_depth < 1),
+                             "Calculated output dimension size is invalid");
+
+    output_shape.set(idx_width, static_cast<size_t>(output_width));
+    output_shape.set(idx_height, static_cast<size_t>(output_height));
+    output_shape.set(idx_depth, static_cast<size_t>(output_depth));
+
+    return output_shape;
+}
+
+/** Calculate the gather output shape of a tensor
+ *
+ * @param[in] input_shape   Input tensor shape
+ * @param[in] indices_shape Indices tensor shape. Only 2D and 3D indices are supported
+ * @param[in] actual_axis   Axis to be used in the computation
+ *
+ * @note Let input_shape be (X,Y,Z), indices_shape be (W,O,P) and the axis be 1:
+ *       the output shape is computed by replacing the axis dimension of the input shape
+ *       with the indices shape, so the output shape will be (X,W,O,P,Z)
+ *
+ * @return the calculated shape
+ */
+inline TensorShape
+compute_gather_shape(const TensorShape &input_shape, const TensorShape &indices_shape, uint32_t actual_axis)
 {
-    ARM_COMPUTE_ERROR_ON(indices_shape.num_dimensions() > 1);
-    ARM_COMPUTE_ERROR_ON(input_shape.num_dimensions() > 4);
-    ARM_COMPUTE_ERROR_ON(actual_axis >= input_shape.num_dimensions());
+    const auto input_num_dims   = input_shape.num_dimensions();
+    const auto indices_num_dims = indices_shape.num_dimensions();
+
+    ARM_COMPUTE_ERROR_ON(actual_axis >= input_num_dims);
+    ARM_COMPUTE_ERROR_ON(input_num_dims + indices_num_dims - 1 > Coordinates::num_max_dimensions);
+
+    TensorShape output_shape;
+    size_t      dim_no = 0;
+
+    for (; dim_no < actual_axis; ++dim_no)
+    {
+        output_shape.set(dim_no, input_shape[dim_no]);
+    }
+
+    for (; dim_no < actual_axis + indices_num_dims; ++dim_no)
+    {
+        output_shape.set(dim_no, indices_shape[dim_no - actual_axis]);
+    }
+
+    for (; dim_no < input_num_dims + indices_num_dims - 1; ++dim_no)
+    {
+        output_shape.set(dim_no, input_shape[dim_no + 1 - indices_num_dims]);
+    }
 
-    TensorShape output_shape  = input_shape;
-    output_shape[actual_axis] = indices_shape[0];
+    ARM_COMPUTE_ERROR_ON(input_shape.total_size() * indices_shape.total_size() !=
+                         output_shape.total_size() * input_shape[actual_axis]);
 
     return output_shape;
 }
 } // namespace shape_calculator
 } // namespace misc
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_MISC_SHAPE_CALCULATOR_H */
+#endif // ACL_ARM_COMPUTE_CORE_UTILS_MISC_SHAPECALCULATOR_H
diff --git a/arm_compute/core/utils/misc/Traits.h b/arm_compute/core/utils/misc/Traits.h
index 1cbdbfe16f..944fcb95f9 100644
--- a/arm_compute/core/utils/misc/Traits.h
+++ b/arm_compute/core/utils/misc/Traits.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,6 +24,8 @@
 #ifndef ARM_COMPUTE_UTILS_TRAITS_TRAITS_H
 #define ARM_COMPUTE_UTILS_TRAITS_TRAITS_H
 
+#include "arm_compute/core/Types.h"
+
 #include <type_traits>
 
 namespace arm_compute
diff --git a/arm_compute/core/utils/misc/Utility.h b/arm_compute/core/utils/misc/Utility.h
index b2bb63f5c8..22f10d74cc 100644
--- a/arm_compute/core/utils/misc/Utility.h
+++ b/arm_compute/core/utils/misc/Utility.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,8 +24,11 @@
 #ifndef ARM_COMPUTE_MISC_UTILITY_H
 #define ARM_COMPUTE_MISC_UTILITY_H
 
+#include "arm_compute/core/Error.h"
+
 #include <algorithm>
 #include <array>
+#include <cstdint>
 #include <limits>
 #include <numeric>
 #include <vector>
@@ -41,7 +44,7 @@ struct index_sequence
 };
 
 template <std::size_t N, std::size_t... S>
-struct index_sequence_generator : index_sequence_generator < N - 1, N - 1, S... >
+struct index_sequence_generator : index_sequence_generator<N - 1, N - 1, S...>
 {
 };
 
@@ -55,17 +58,17 @@ template <std::size_t N>
 using index_sequence_t = typename index_sequence_generator<N>::type;
 
 template <typename T, std::size_t N, T val, T... vals>
-struct generate_array : generate_array < T, N - 1, val, val, vals... >
+struct generate_array : generate_array<T, N - 1, val, val, vals...>
 {
 };
 
 template <typename T, T val, T... vals>
 struct generate_array<T, 0, val, vals...>
 {
-    static constexpr std::array<T, sizeof...(vals)> value{ vals... };
+    static constexpr std::array<T, sizeof...(vals)> value{vals...};
 };
 
-template <typename T, T                  val, T... vals>
+template <typename T, T val, T... vals>
 constexpr std::array<T, sizeof...(vals)> generate_array<T, 0, val, vals...>::value;
 /** @endcond */
 
@@ -76,7 +79,7 @@ template <std::size_t... S,
           typename T = std::array<typename std::iterator_traits<Iterator>::value_type, sizeof...(S)>>
 T make_array(Iterator first, index_sequence<S...>)
 {
-    return T{ { first[S]... } };
+    return T{{first[S]...}};
 }
 } // namespace detail
 
@@ -84,7 +87,7 @@ template <std::size_t N, typename Iterator>
 std::array<typename std::iterator_traits<Iterator>::value_type, N> make_array(Iterator first, Iterator last)
 {
     ARM_COMPUTE_UNUSED(last);
-    return detail::make_array(first, index_sequence_t<N> {});
+    return detail::make_array(first, index_sequence_t<N>{});
 }
 
 /** Performs clamping among a lower and upper value.
@@ -116,7 +119,7 @@ inline void for_each(F &&)
  * @param[in] args Remaining arguments
  */
 template <typename F, typename T, typename... Ts>
-inline void for_each(F &&func, T &&arg, Ts &&... args)
+inline void for_each(F &&func, T &&arg, Ts &&...args)
 {
     func(std::forward<T>(arg));
     for_each(std::forward<F>(func), std::forward<Ts>(args)...);
@@ -140,9 +143,11 @@ inline T &&foldl(F &&, T &&value)
  * @param[in] values  Remaining arguments
  */
 template <typename F, typename T, typename U, typename... Us>
-inline auto foldl(F &&func, T &&initial, U &&value, Us &&... values) -> decltype(func(std::forward<T>(initial), std::forward<U>(value)))
+inline auto foldl(F &&func, T &&initial, U &&value, Us &&...values)
+    -> decltype(func(std::forward<T>(initial), std::forward<U>(value)))
 {
-    return foldl(std::forward<F>(func), func(std::forward<T>(initial), std::forward<U>(value)), std::forward<Us>(values)...);
+    return foldl(std::forward<F>(func), func(std::forward<T>(initial), std::forward<U>(value)),
+                 std::forward<Us>(values)...);
 }
 
 /** Perform an index sort of a given vector.
@@ -157,11 +162,7 @@ std::vector<size_t> sort_indices(const std::vector<T> &v)
     std::vector<size_t> idx(v.size());
     std::iota(idx.begin(), idx.end(), 0);
 
-    std::sort(idx.begin(), idx.end(),
-              [&v](size_t i1, size_t i2)
-    {
-        return v[i1] < v[i2];
-    });
+    std::sort(idx.begin(), idx.end(), [&v](size_t i1, size_t i2) { return v[i1] < v[i2]; });
 
     return idx;
 }
@@ -175,7 +176,7 @@ std::vector<size_t> sort_indices(const std::vector<T> &v)
  */
 inline bool endswith(const std::string &str, const std::string &suffix)
 {
-    if(str.size() < suffix.size())
+    if (str.size() < suffix.size())
     {
         return false;
     }
@@ -202,12 +203,28 @@ inline bool check_aligned(void *ptr, const size_t alignment)
  */
 inline std::string tolower(std::string string)
 {
-    std::transform(string.begin(), string.end(), string.begin(), [](unsigned char c)
-    {
-        return std::tolower(c);
-    });
+    std::transform(string.begin(), string.end(), string.begin(), [](unsigned char c) { return std::tolower(c); });
     return string;
 }
+
+/** Get environment variable as a string
+ *
+ * @note Returns an empty string on bare-metal builds (BARE_METAL defined)
+ *
+ * @param[in] env_name Name of the Environment variable to retrieve
+ *
+ * @return Environment variable content, or empty string if the variable is undefined or on bare-metal
+ */
+inline std::string getenv(const std::string &env_name)
+{
+#ifdef BARE_METAL
+    ARM_COMPUTE_UNUSED(env_name);
+    return std::string{};
+#else  // BARE_METAL
+    const auto env_chr = std::getenv(env_name.c_str());
+    return env_chr == nullptr ? std::string{} : std::string{env_chr};
+#endif // BARE_METAL
+}
 } // namespace utility
 } // namespace arm_compute
 #endif /* ARM_COMPUTE_MISC_UTILITY_H */
diff --git a/arm_compute/core/utils/quantization/AsymmHelpers.h b/arm_compute/core/utils/quantization/AsymmHelpers.h
index 4ef49476b2..2324fe1838 100644
--- a/arm_compute/core/utils/quantization/AsymmHelpers.h
+++ b/arm_compute/core/utils/quantization/AsymmHelpers.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -41,7 +41,10 @@ namespace quantization
  *
  * @return a status
  */
-Status calculate_quantized_multiplier(float multiplier, int32_t *quant_multiplier, int32_t *shift, bool ignore_epsilon = false);
+Status calculate_quantized_multiplier(float    multiplier,
+                                      int32_t *quant_multiplier,
+                                      int32_t *shift,
+                                      bool     ignore_epsilon = false);
 /** Calculate quantized representation of multiplier with value less than one.
  *
  * @param[in]  multiplier       Real multiplier.
@@ -51,7 +54,10 @@ Status calculate_quantized_multiplier(float multiplier, int32_t *quant_multiplie
  *
  * @return a status
  */
-Status calculate_quantized_multiplier_less_than_one(float multiplier, int32_t *quant_multiplier, int32_t *right_shift, bool ignore_epsilon = false);
+Status calculate_quantized_multiplier_less_than_one(float    multiplier,
+                                                    int32_t *quant_multiplier,
+                                                    int32_t *right_shift,
+                                                    bool     ignore_epsilon = false);
 /** Calculate quantized representation of multiplier having value greater than one.
  *
  * @param[in]  multiplier           Real multiplier.
@@ -60,7 +66,8 @@ Status calculate_quantized_multiplier_less_than_one(float multiplier, int32_t *q
  *
  * @return a status
  */
-Status calculate_quantized_multiplier_greater_than_one(float multiplier, int32_t *quantized_multiplier, int32_t *left_shift);
+Status
+calculate_quantized_multiplier_greater_than_one(float multiplier, int32_t *quantized_multiplier, int32_t *left_shift);
 
 /** Calculate quantized representation of per-channel multipliers
  *
@@ -71,9 +78,9 @@ Status calculate_quantized_multiplier_greater_than_one(float multiplier, int32_t
  *
  * @return a status
  */
-Status calculate_quantized_multipliers(const QuantizationInfo &iq_info,
-                                       const QuantizationInfo &wq_info,
-                                       const QuantizationInfo &oq_info,
+Status calculate_quantized_multipliers(const QuantizationInfo  &iq_info,
+                                       const QuantizationInfo  &wq_info,
+                                       const QuantizationInfo  &oq_info,
                                        GEMMLowpOutputStageInfo &stage_info);
 
 /** Get minimum and maximum values for the input quantized data type
@@ -81,6 +88,7 @@ Status calculate_quantized_multipliers(const QuantizationInfo &iq_info,
  * @return min and max values for the quantized data type
  */
 std::pair<int, int> get_min_max_values_from_quantized_data_type(DataType data_type);
+
 /** Compute quantized per-channel multipliers and shifts. As many multipliers
  * and shifts as output channels are computed. If weights are not quantized
  * per-channel, multipliers and shifts will end up being the same for each
@@ -89,16 +97,12 @@ std::pair<int, int> get_min_max_values_from_quantized_data_type(DataType data_ty
  * @param[in]  input                  Input tensor info.
  * @param[in]  weights                Weights tensor info.
  * @param[in]  output                 Output tensor info.
- * @param[in]  idx_ofms               Dimension index to get OFMs from the weights tensor.
  * @param[out] output_multipliers_ptr Pointer to the buffer where to store per-channel multipliers.
  * @param[out] output_shifts_ptr      Pointer to the buffer where to store per-channel shifts.
- *
- * @return min and max values for the quantized data type
  */
 void compute_quantized_multipliers_and_shifts(const ITensorInfo *input,
                                               const ITensorInfo *weights,
                                               const ITensorInfo *output,
-                                              unsigned int       idx_ofms,
                                               int32_t           *output_multipliers_ptr,
                                               int32_t           *output_shifts_ptr);
 
@@ -150,7 +154,10 @@ int32_t saturating_rounding_multiply_by_pow2(int32_t exponent, int32_t v);
  * @param[out] output_shift    Shift for inverse square root
  *
  */
-void get_invsqrt_quantized_multiplier_exp(int32_t input, int32_t reverse_shift, int32_t &output_inv_sqrt, int32_t &output_shift);
+void get_invsqrt_quantized_multiplier_exp(int32_t  input,
+                                          int32_t  reverse_shift,
+                                          int32_t &output_inv_sqrt,
+                                          int32_t &output_shift);
 
 } // namespace quantization
 } // namespace arm_compute